# Source code for snps.testing

"""Shared test utilities for snps."""

from __future__ import annotations

import os
from typing import Any

import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype, is_string_dtype

# Column dtypes applied to normalized SNP DataFrames: identifiers, chromosomes
# and genotypes stay generic Python objects, positions are unsigned 32-bit ints.
NORMALIZED_DTYPES = dict(
    rsid=object,
    chrom=object,
    pos=np.uint32,
    genotype=object,
)



[docs]
def get_complement(base: str) -> str:
    """Return the complementary DNA base.

    Parameters
    ----------
    base : str
        A single DNA base (A, C, G, or T)

    Returns
    -------
    str
        The paired base (A<->T, C<->G); any other value is handed back
        unchanged
    """
    pairing = dict(zip("ATCG", "TAGC"))
    try:
        return pairing[base]
    except KeyError:
        # Not one of the four upper-case bases: pass it through untouched.
        return base




[docs]
def complement_genotype(genotype: str) -> str:
    """Complement every allele of a genotype.

    Parameters
    ----------
    genotype : str
        A two-character genotype string (e.g., "AT", "CG")

    Returns
    -------
    str
        The genotype with each base replaced by its complement, or np.nan
        when the input is null
    """
    if not pd.isnull(genotype):
        # Each character is complemented independently via get_complement.
        return "".join(map(get_complement, genotype))
    return np.nan




[docs]
def complement_one_allele(genotype: str) -> str:
    """Complement the first allele of a genotype, leaving the second as is.

    Handy for simulating partial strand complementation in test data.

    Parameters
    ----------
    genotype : str
        A two-character genotype string (e.g., "AT", "CG")

    Returns
    -------
    str
        The complemented first allele followed by the original second allele,
        or np.nan when the input is null
    """
    if not pd.isnull(genotype):
        first_allele, second_allele = genotype[0], genotype[1]
        return get_complement(first_allele) + second_allele
    return np.nan




[docs]
def create_snp_df(
    rsid: list[str],
    chrom: list[str],
    pos: list[int],
    genotype: list[str],
) -> pd.DataFrame:
    """Build a normalized SNP DataFrame from parallel lists.

    Parameters
    ----------
    rsid : list of str
        SNP identifiers (becomes the index)
    chrom : list of str
        Chromosome values
    pos : list of int
        Position values
    genotype : list of str
        Genotype values

    Returns
    -------
    ~pandas.DataFrame
        DataFrame indexed by rsid with chrom, pos, genotype columns, cast to
        the dtypes in ``NORMALIZED_DTYPES``
    """
    column_order = ["rsid", "chrom", "pos", "genotype"]
    data = dict(zip(column_order, (rsid, chrom, pos, genotype)))
    # Cast while rsid is still a column so its dtype is normalized too.
    frame = pd.DataFrame(data, columns=column_order).astype(NORMALIZED_DTYPES)
    return frame.set_index("rsid")




[docs]
def create_simulated_snp_df(
    chrom: str = "1",
    pos_start: int = 1,
    pos_max: int = 248140902,
    pos_step: int = 100,
    pos_dtype: type = np.uint32,
    genotype: str = "AA",
    insert_nulls: bool = True,
    null_snp_step: int = 101,
    complement_genotype_one_allele: bool = False,
    complement_genotype_two_alleles: bool = False,
    complement_snp_step: int = 50,
) -> pd.DataFrame:
    """Generate a simulated SNP DataFrame for tests.

    SNPs are laid out on one chromosome at evenly spaced positions and named
    ``rs1``, ``rs2``, ... in position order. Projects can wrap this function
    to attach the result to their own object types.

    Parameters
    ----------
    chrom : str
        Chromosome value for all SNPs (default: "1")
    pos_start : int
        First position (default: 1)
    pos_max : int
        Upper bound for positions; passed as the ``stop`` of ``np.arange``,
        so it is exclusive (default: 248140902)
    pos_step : int
        Step between positions (default: 100)
    pos_dtype : type
        Numpy dtype for positions (default: np.uint32)
    genotype : str
        Genotype assigned to every SNP (default: "AA")
    insert_nulls : bool
        Whether to blank out genotypes at intervals (default: True)
    null_snp_step : int
        Every N-th SNP, starting with the first, gets a null genotype
        (default: 101)
    complement_genotype_one_allele : bool
        Complement the first allele at intervals; ignored when
        ``complement_genotype_two_alleles`` is set (default: False)
    complement_genotype_two_alleles : bool
        Complement both alleles at intervals (default: False)
    complement_snp_step : int
        Every N-th SNP, starting with the first, is complemented
        (default: 50)

    Returns
    -------
    ~pandas.DataFrame
        DataFrame with rsid index and chrom, pos, genotype columns
    """
    positions = np.arange(pos_start, pos_max, pos_step, dtype=pos_dtype)
    rsids = pd.Index(
        [f"rs{number}" for number in range(1, len(positions) + 1)], name="rsid"
    )

    snps = pd.DataFrame({"chrom": chrom}, index=rsids)
    snps["pos"] = positions
    snps["genotype"] = genotype

    if insert_nulls:
        null_rows = snps.iloc[0::null_snp_step].index
        snps.loc[null_rows, "genotype"] = np.nan

    # Computed unconditionally, mirroring the slice validation of the step.
    complement_rows = snps.iloc[0::complement_snp_step].index

    # Complementing both alleles takes precedence over complementing one.
    if complement_genotype_two_alleles:
        transform = complement_genotype
    elif complement_genotype_one_allele:
        transform = complement_one_allele
    else:
        return snps

    snps.loc[complement_rows, "genotype"] = snps.loc[
        complement_rows, "genotype"
    ].apply(transform)
    return snps




[docs]
def assert_series_equal_with_string_dtype(
    left: pd.Series,
    right: pd.Series,
    test_case: Any = None,
    **kwargs: Any,
) -> None:
    """Assert Series are equal, accepting both object and string dtypes.

    Depending on the pandas version and configuration, string data may be
    stored as ``object`` or as a dedicated string dtype. This function treats
    those as interchangeable: when ``left`` holds string/object data it first
    checks that ``right`` does too, then compares the values with dtype
    checking relaxed.

    Parameters
    ----------
    left : ~pandas.Series
        First Series to compare
    right : ~pandas.Series
        Second Series to compare
    test_case : object, optional
        Object with an ``assertTrue`` method used for the dtype check; when
        None an ``AssertionError`` is raised directly
    **kwargs : dict
        Additional arguments passed to pd.testing.assert_series_equal.
        ``check_dtype`` defaults to False but an explicit value is honored.

    Raises
    ------
    AssertionError
        If the dtype check fails (and ``test_case`` is None) or the Series
        differ.
    """
    if is_string_dtype(left.dtype) or is_object_dtype(left.dtype):
        right_is_string = is_string_dtype(right.dtype) or is_object_dtype(right.dtype)
        message = f"Right series dtype {right.dtype} should be string/object type"
        if test_case is not None:
            test_case.assertTrue(right_is_string, message)
        elif not right_is_string:
            # Raise explicitly so the check is not stripped under ``python -O``.
            raise AssertionError(message)

    # Use setdefault so a caller-supplied check_dtype does not collide with
    # ours and trigger a "multiple values for keyword argument" TypeError.
    kwargs.setdefault("check_dtype", False)
    pd.testing.assert_series_equal(left, right, **kwargs)




[docs]
def assert_frame_equal_with_string_index(
    left: pd.DataFrame,
    right: pd.DataFrame,
    test_case: Any = None,
    **kwargs: Any,
) -> None:
    """Assert DataFrames are equal, accepting both object and string dtypes.

    Depending on the pandas version and configuration, string columns and
    indices may be stored as ``object`` or as a dedicated string dtype. This
    function validates that an index named ``rsid`` and the ``chrom`` /
    ``genotype`` columns (when present) hold string-like dtypes, then compares
    the DataFrames with dtype and index-type checking relaxed.

    Parameters
    ----------
    left : ~pandas.DataFrame
        First DataFrame to compare
    right : ~pandas.DataFrame
        Second DataFrame to compare
    test_case : object, optional
        Object with an ``assertTrue`` method used for the dtype checks; when
        None an ``AssertionError`` is raised directly
    **kwargs : dict
        Additional arguments passed to pd.testing.assert_frame_equal.
        ``check_index_type`` and ``check_dtype`` default to False but explicit
        values are honored.

    Raises
    ------
    AssertionError
        If a dtype check fails (and ``test_case`` is None) or the DataFrames
        differ.
    """

    def _assert(condition: bool, message: str) -> None:
        if test_case is not None:
            test_case.assertTrue(condition, message)
        elif not condition:
            # Raise explicitly so the check is not stripped under ``python -O``.
            raise AssertionError(message)

    # Verify index dtypes are string types if they're named 'rsid'
    for side, frame in (("Left", left), ("Right", right)):
        if frame.index.name == "rsid":
            _assert(
                is_string_dtype(frame.index.dtype),
                f"{side} index dtype {frame.index.dtype} is not a string type",
            )

    # Verify string columns (chrom, genotype) have string dtypes; the
    # left/right interleaving per column keeps the first-failure order stable.
    for col in ["chrom", "genotype"]:
        for side, frame in (("Left", left), ("Right", right)):
            if col in frame.columns:
                dtype = frame[col].dtype
                _assert(
                    is_string_dtype(dtype) or is_object_dtype(dtype),
                    f"{side} column '{col}' dtype {dtype} is not a string/object type",
                )

    # Use setdefault so caller-supplied values for these options do not
    # collide with ours and trigger a duplicate-keyword TypeError.
    kwargs.setdefault("check_index_type", False)
    kwargs.setdefault("check_dtype", False)
    pd.testing.assert_frame_equal(left, right, **kwargs)




[docs]
class SNPsTestMixin:
    """Mixin class providing common test assertions and utilities for SNP DataFrames.

    This mixin can be combined with unittest.TestCase to add convenient
    assertion methods for comparing SNP DataFrames with flexible string dtype handling,
    plus common test utilities like creating test DataFrames. Every method is a
    thin wrapper around the module-level function of the same name; the
    assertion wrappers pass ``self`` as the ``test_case``.

    Example
    -------
    >>> class MyTestCase(SNPsTestMixin, TestCase):
    ...     def test_something(self):
    ...         df = self.generic_snps()
    ...         self.assert_frame_equal_with_string_index(df, expected_df)
    """

    @property
    def downloads_enabled(self) -> bool:
        """Check if external downloads are enabled for tests.

        Only download from external resources when an environment variable named
        "DOWNLOADS_ENABLED" is set to "true" (exact, case-sensitive match).

        Returns
        -------
        bool
        """
        return os.getenv("DOWNLOADS_ENABLED") == "true"

    @staticmethod
    def get_complement(base: str) -> str:
        """Get the complement of a DNA base.

        See :func:`get_complement` for details.
        """
        return get_complement(base)

    def complement_genotype(self, genotype: str) -> str:
        """Get the complement of a genotype (both alleles).

        See :func:`complement_genotype` for details.
        """
        return complement_genotype(genotype)

    def complement_one_allele(self, genotype: str) -> str:
        """Get the complement of only the first allele of a genotype.

        See :func:`complement_one_allele` for details.
        """
        return complement_one_allele(genotype)

    @staticmethod
    def create_snp_df(
        rsid: list[str],
        chrom: list[str],
        pos: list[int],
        genotype: list[str],
    ) -> pd.DataFrame:
        """Create a normalized SNP DataFrame.

        See :func:`create_snp_df` for details.
        """
        return create_snp_df(rsid, chrom, pos, genotype)

    def generic_snps(self) -> pd.DataFrame:
        """Create a generic SNP DataFrame for testing.

        Returns
        -------
        ~pandas.DataFrame
            DataFrame with 8 SNPs (rs1-rs8) on chromosome 1 at positions
            101-108; the fifth genotype is null
        """
        return create_snp_df(
            rsid=[f"rs{i}" for i in range(1, 9)],
            chrom=["1"] * 8,
            pos=list(range(101, 109)),
            genotype=["AA", "CC", "GG", "TT", np.nan, "GC", "TC", "AT"],
        )

    def assert_series_equal_with_string_dtype(
        self, left: pd.Series, right: pd.Series, **kwargs: Any
    ) -> None:
        """Assert Series are equal, accepting both object and StringDtype for string data.

        See :func:`assert_series_equal_with_string_dtype` for details.
        """
        assert_series_equal_with_string_dtype(left, right, test_case=self, **kwargs)

    def assert_frame_equal_with_string_index(
        self, left: pd.DataFrame, right: pd.DataFrame, **kwargs: Any
    ) -> None:
        """Assert DataFrames are equal, accepting both object and StringDtype for string columns.

        See :func:`assert_frame_equal_with_string_index` for details.
        """
        assert_frame_equal_with_string_index(left, right, test_case=self, **kwargs)