# Source code for snps.testing

"""Shared test utilities for snps."""

from __future__ import annotations

import os
from typing import Any

import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype, is_string_dtype

# Standard dtypes for normalized SNP DataFrames.
# create_snp_df casts its freshly built frame with this mapping; "rsid" is
# cast while it is still a regular column, before it is moved to the index.
NORMALIZED_DTYPES = {
    "rsid": object,
    "chrom": object,
    "pos": np.uint32,
    "genotype": object,
}


def get_complement(base: str) -> str:
    """Return the complementary DNA base.

    Parameters
    ----------
    base : str
        A single DNA base (A, C, G, or T)

    Returns
    -------
    str
        The paired base (A<->T, C<->G); any value that is not exactly one of
        the four uppercase bases is handed back unchanged
    """
    # Pair each base with its partner: A-T, T-A, C-G, G-C.
    pairs = dict(zip("ATCG", "TAGC"))
    try:
        return pairs[base]
    except KeyError:
        # Not a recognized base (e.g. "N", lowercase, multi-character input).
        return base
def complement_genotype(genotype: str) -> str:
    """Complement every allele of a genotype.

    Parameters
    ----------
    genotype : str
        A two-character genotype string (e.g., "AT", "CG")

    Returns
    -------
    str
        The genotype with each base replaced by its complement, or np.nan
        when the input is null
    """
    if pd.isnull(genotype):
        # Missing genotypes stay missing.
        return np.nan

    # Complement base by base and stitch the result back into one string.
    return "".join(map(get_complement, genotype))
def complement_one_allele(genotype: str) -> str:
    """Complement only the first allele of a genotype.

    The second allele is carried over as-is, which is handy for simulating
    partial strand complementation in test data.

    Parameters
    ----------
    genotype : str
        A two-character genotype string (e.g., "AT", "CG")

    Returns
    -------
    str
        Genotype whose first allele is complemented, or np.nan when the input
        is null
    """
    if pd.isnull(genotype):
        # Missing genotypes stay missing.
        return np.nan

    first_allele, second_allele = genotype[0], genotype[1]
    return get_complement(first_allele) + second_allele
def create_snp_df(
    rsid: list[str],
    chrom: list[str],
    pos: list[int],
    genotype: list[str],
) -> pd.DataFrame:
    """Build a normalized SNP DataFrame from parallel lists.

    Parameters
    ----------
    rsid : list of str
        SNP identifiers (becomes the index)
    chrom : list of str
        Chromosome values
    pos : list of int
        Position values
    genotype : list of str
        Genotype values

    Returns
    -------
    ~pandas.DataFrame
        DataFrame with rsid index and chrom, pos, genotype columns
    """
    column_order = ["rsid", "chrom", "pos", "genotype"]
    raw = dict(zip(column_order, (rsid, chrom, pos, genotype)))

    # Cast to the normalized dtypes while rsid is still a column, then move
    # it to the index.
    return (
        pd.DataFrame(raw, columns=column_order)
        .astype(NORMALIZED_DTYPES)
        .set_index("rsid")
    )
def create_simulated_snp_df(
    chrom: str = "1",
    pos_start: int = 1,
    pos_max: int = 248140902,
    pos_step: int = 100,
    pos_dtype: type = np.uint32,
    genotype: str = "AA",
    insert_nulls: bool = True,
    null_snp_step: int = 101,
    complement_genotype_one_allele: bool = False,
    complement_genotype_two_alleles: bool = False,
    complement_snp_step: int = 50,
) -> pd.DataFrame:
    """Generate a simulated SNP DataFrame for testing.

    This holds the shared logic for building simulated SNP data; each project
    can wrap it to assign the result to its own object types.

    Parameters
    ----------
    chrom : str
        Chromosome value for all SNPs (default: "1")
    pos_start : int
        Starting position (default: 1)
    pos_max : int
        Maximum position, exclusive (default: 248140902)
    pos_step : int
        Step between positions (default: 100)
    pos_dtype : type
        Numpy dtype for positions (default: np.uint32)
    genotype : str
        Default genotype for all SNPs (default: "AA")
    insert_nulls : bool
        Whether to insert null genotypes (default: True)
    null_snp_step : int
        Insert a null every N SNPs, starting with the first row (default: 101)
    complement_genotype_one_allele : bool
        Complement the first allele at intervals (default: False)
    complement_genotype_two_alleles : bool
        Complement both alleles at intervals; takes precedence over
        ``complement_genotype_one_allele`` (default: False)
    complement_snp_step : int
        Apply the complement every N SNPs, starting with the first row
        (default: 50)

    Returns
    -------
    ~pandas.DataFrame
        DataFrame with rsid index and chrom, pos, genotype columns
    """
    positions = np.arange(pos_start, pos_max, pos_step, dtype=pos_dtype)

    # rsids are numbered from 1, one per generated position.
    rsids = pd.Index(
        [f"rs{number}" for number in range(1, len(positions) + 1)],
        name="rsid",
    )
    snps = pd.DataFrame({"chrom": chrom}, index=rsids)
    snps["pos"] = positions
    snps["genotype"] = genotype

    if insert_nulls:
        null_rows = snps.index[::null_snp_step]
        snps.loc[null_rows, "genotype"] = np.nan

    # Computed unconditionally so an invalid step fails regardless of flags.
    complement_rows = snps.index[::complement_snp_step]

    # Complementing both alleles wins when both flags are set.
    if complement_genotype_two_alleles:
        transform = complement_genotype
    elif complement_genotype_one_allele:
        transform = complement_one_allele
    else:
        transform = None

    if transform is not None:
        selected = snps.loc[complement_rows, "genotype"]
        snps.loc[complement_rows, "genotype"] = selected.apply(transform)

    return snps
def assert_series_equal_with_string_dtype(
    left: pd.Series,
    right: pd.Series,
    test_case: Any = None,
    **kwargs,
) -> None:
    """Assert Series are equal, accepting both object and StringDtype for string data.

    Depending on the Python/pandas versions in use, pandas may infer
    StringDtype for string data instead of object. This function therefore
    compares Series without strict dtype matching for string data.

    Parameters
    ----------
    left : ~pandas.Series
        First Series to compare
    right : ~pandas.Series
        Second Series to compare
    test_case : object, optional
        Object with assertTrue method for assertions (an AssertionError is
        raised directly if None)
    **kwargs : dict
        Additional arguments passed to pd.testing.assert_series_equal.
        ``check_dtype`` defaults to False but may be overridden here.

    Raises
    ------
    AssertionError
        If ``left`` holds string/object data but ``right`` does not, or if
        pd.testing.assert_series_equal finds a difference
    """
    # Only when the left side is string-like do we require the right side to
    # be string-like as well.
    if is_string_dtype(left.dtype) or is_object_dtype(left.dtype):
        right_is_string = is_string_dtype(right.dtype) or is_object_dtype(right.dtype)
        message = f"Right series dtype {right.dtype} should be string/object type"
        if test_case is not None:
            test_case.assertTrue(right_is_string, message)
        elif not right_is_string:
            # Raised explicitly (rather than via ``assert``) so the check is
            # not stripped when Python runs with -O.
            raise AssertionError(message)

    # Merge the default into kwargs so a caller-supplied check_dtype does not
    # collide with it as a duplicate keyword argument.
    options = {"check_dtype": False, **kwargs}
    pd.testing.assert_series_equal(left, right, **options)
def assert_frame_equal_with_string_index(
    left: pd.DataFrame,
    right: pd.DataFrame,
    test_case: Any = None,
    **kwargs,
) -> None:
    """Assert DataFrames are equal, accepting both object and StringDtype for string columns.

    Depending on the Python/pandas versions in use, pandas may infer
    StringDtype for string columns/indices instead of object. This function
    validates that string columns have string types, then compares the
    DataFrames without strict dtype matching for object/string columns.

    Parameters
    ----------
    left : ~pandas.DataFrame
        First DataFrame to compare
    right : ~pandas.DataFrame
        Second DataFrame to compare
    test_case : object, optional
        Object with assertTrue method for assertions (an AssertionError is
        raised directly if None)
    **kwargs : dict
        Additional arguments passed to pd.testing.assert_frame_equal.
        ``check_index_type`` and ``check_dtype`` default to False but may be
        overridden here.

    Raises
    ------
    AssertionError
        If an 'rsid' index or a 'chrom'/'genotype' column has a non-string
        dtype, or if pd.testing.assert_frame_equal finds a difference
    """

    def _assert(condition: bool, message: str) -> None:
        if test_case is not None:
            test_case.assertTrue(condition, message)
        elif not condition:
            # Raised explicitly (rather than via ``assert``) so the check is
            # not stripped when Python runs with -O.
            raise AssertionError(message)

    frames = (("Left", left), ("Right", right))

    # Verify index dtypes are string types if they're named 'rsid'
    for side, frame in frames:
        if frame.index.name == "rsid":
            _assert(
                is_string_dtype(frame.index.dtype),
                f"{side} index dtype {frame.index.dtype} is not a string type",
            )

    # Verify string columns (chrom, genotype) have string dtypes; for each
    # column the left frame is checked before the right one.
    for col in ("chrom", "genotype"):
        for side, frame in frames:
            if col in frame.columns:
                dtype = frame[col].dtype
                _assert(
                    is_string_dtype(dtype) or is_object_dtype(dtype),
                    f"{side} column '{col}' dtype {dtype} is not a string/object type",
                )

    # Merge the defaults into kwargs so caller-supplied check_index_type /
    # check_dtype do not collide with them as duplicate keyword arguments.
    options = {"check_index_type": False, "check_dtype": False, **kwargs}
    pd.testing.assert_frame_equal(left, right, **options)
class SNPsTestMixin:
    """Mixin supplying shared assertions and helpers for SNP DataFrame tests.

    Combine this mixin with unittest.TestCase to gain assertion methods that
    compare SNP DataFrames with flexible string dtype handling, along with
    common utilities such as building test DataFrames.

    Example
    -------
    >>> class MyTestCase(SNPsTestMixin, TestCase):
    ...     def test_something(self):
    ...         df = self.generic_snps()
    ...         self.assert_frame_equal_with_string_index(df, expected_df)
    """

    @property
    def downloads_enabled(self) -> bool:
        """Report whether tests may download from external resources.

        Downloads are allowed only when the environment variable
        "DOWNLOADS_ENABLED" is set to exactly "true".

        Returns
        -------
        bool
        """
        setting = os.environ.get("DOWNLOADS_ENABLED")
        return setting == "true"

    @staticmethod
    def get_complement(base: str) -> str:
        """Return the complementary DNA base.

        See :func:`get_complement` for details.
        """
        return get_complement(base)

    def complement_genotype(self, genotype: str) -> str:
        """Complement both alleles of a genotype.

        See :func:`complement_genotype` for details.
        """
        return complement_genotype(genotype)

    def complement_one_allele(self, genotype: str) -> str:
        """Complement only the first allele of a genotype.

        See :func:`complement_one_allele` for details.
        """
        return complement_one_allele(genotype)

    @staticmethod
    def create_snp_df(
        rsid: list[str],
        chrom: list[str],
        pos: list[int],
        genotype: list[str],
    ) -> pd.DataFrame:
        """Build a normalized SNP DataFrame.

        See :func:`create_snp_df` for details.
        """
        return create_snp_df(rsid=rsid, chrom=chrom, pos=pos, genotype=genotype)

    def generic_snps(self) -> pd.DataFrame:
        """Build a small generic SNP DataFrame for testing.

        Returns
        -------
        ~pandas.DataFrame
            DataFrame with 8 SNPs (rs1-rs8) on chromosome 1
        """
        count = 8
        first_pos = 101
        genotypes = ["AA", "CC", "GG", "TT", np.nan, "GC", "TC", "AT"]
        return create_snp_df(
            rsid=[f"rs{number}" for number in range(1, count + 1)],
            chrom=["1"] * count,
            pos=list(range(first_pos, first_pos + count)),
            genotype=genotypes,
        )

    def assert_series_equal_with_string_dtype(self, left, right, **kwargs):
        """Assert Series are equal, accepting both object and StringDtype for string data.

        This instance is passed along as the ``test_case``.
        See :func:`assert_series_equal_with_string_dtype` for details.
        """
        assert_series_equal_with_string_dtype(left, right, test_case=self, **kwargs)

    def assert_frame_equal_with_string_index(self, left, right, **kwargs):
        """Assert DataFrames are equal, accepting both object and StringDtype for string columns.

        This instance is passed along as the ``test_case``.
        See :func:`assert_frame_equal_with_string_index` for details.
        """
        assert_frame_equal_with_string_index(left, right, test_case=self, **kwargs)