Source code for snps.resources

""" Class for downloading and loading required external resources.

References
----------
1. International Human Genome Sequencing Consortium. Initial sequencing and
   analysis of the human genome. Nature. 2001 Feb 15;409(6822):860-921.
   http://dx.doi.org/10.1038/35057062
2. hg19 (GRCh37): Hiram Clawson, Brooke Rhead, Pauline Fujita, Ann Zweig, Katrina
   Learned, Donna Karolchik and Robert Kuhn, https://genome.ucsc.edu/cgi-bin/hgGateway?db=hg19
3. Yates et. al. (doi:10.1093/bioinformatics/btu613),
   `<http://europepmc.org/search/?query=DOI:10.1093/bioinformatics/btu613>`_
4. Zerbino et. al. (doi.org/10.1093/nar/gkx1098), https://doi.org/10.1093/nar/gkx1098

"""

"""
BSD 3-Clause License

Copyright (c) 2019, Andrew Riha
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""

import gzip
import hashlib
import itertools
import json
import logging
import os
import socket
import tarfile
import tempfile
import urllib.error
import urllib.request
import zipfile

from atomicwrites import atomic_write
import numpy as np
import pandas as pd

from snps.ensembl import EnsemblRestClient
from snps.utils import create_dir, Singleton

logger = logging.getLogger(__name__)


[docs] class Resources(metaclass=Singleton): """Object used to manage resources required by `snps`."""
[docs] def __init__(self, resources_dir="resources"): """Initialize a ``Resources`` object. Parameters ---------- resources_dir : str name / path of resources directory """ self._resources_dir = os.path.abspath(resources_dir) self._ensembl_rest_client = EnsemblRestClient() self._init_resource_attributes()
def _init_resource_attributes(self): self._reference_sequences = {} self._gsa_rsid_map = None self._gsa_chrpos_map = None self._dbsnp_151_37_reverse = None self._opensnp_datadump_filenames = [] self._chip_clusters = None self._low_quality_snps = None
[docs] def get_reference_sequences( self, assembly="GRCh37", chroms=( "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT", ), ): """Get Homo sapiens reference sequences for `chroms` of `assembly`. Notes ----- This function can download over 800MB of data for each assembly. Parameters ---------- assembly : {'NCBI36', 'GRCh37', 'GRCh38'} reference sequence assembly chroms : list of str reference sequence chromosomes Returns ------- dict dict of ReferenceSequence, else {} """ valid_assemblies = ["NCBI36", "GRCh37", "GRCh38"] if assembly not in valid_assemblies: logger.warning("Invalid assembly") return {} if not self._reference_chroms_available(assembly, chroms): self._reference_sequences[assembly] = self._create_reference_sequences( *self._get_paths_reference_sequences(assembly=assembly, chroms=chroms) ) return self._reference_sequences[assembly]
def _reference_chroms_available(self, assembly, chroms): if assembly in self._reference_sequences: for chrom in chroms: if chrom not in self._reference_sequences[assembly]: return False return True else: return False
[docs] def get_assembly_mapping_data(self, source_assembly, target_assembly): """Get assembly mapping data. Parameters ---------- source_assembly : {'NCBI36', 'GRCh37', 'GRCh38'} assembly to remap from target_assembly : {'NCBI36', 'GRCh37', 'GRCh38'} assembly to remap to Returns ------- dict dict of json assembly mapping data if loading was successful, else {} """ return self._load_assembly_mapping_data( self._get_path_assembly_mapping_data(source_assembly, target_assembly) )
[docs] def download_example_datasets(self): """Download example datasets from `openSNP <https://opensnp.org>`_. Per openSNP, "the data is donated into the public domain using `CC0 1.0 <http://creativecommons.org/publicdomain/zero/1.0/>`_." Returns ------- paths : list of str or empty str paths to example datasets References ---------- 1. Greshake B, Bayer PE, Rausch H, Reda J (2014), "openSNP-A Crowdsourced Web Resource for Personal Genomics," PLOS ONE, 9(3): e89204, https://doi.org/10.1371/journal.pone.0089204 """ paths = [] paths.append( self._download_file( "https://opensnp.org/data/662.23andme.340", "662.23andme.340.txt.gz", compress=True, ) ) paths.append( self._download_file( "https://opensnp.org/data/662.ftdna-illumina.341", "662.ftdna-illumina.341.csv.gz", compress=True, ) ) return paths
[docs] def get_all_resources(self): """Get / download all resources used throughout `snps`. Notes ----- This function does not download reference sequences and the openSNP datadump, due to their large sizes. Returns ------- dict dict of resources """ resources = {} for source, target in itertools.permutations(["NCBI36", "GRCh37", "GRCh38"], 2): resources[source + "_" + target] = self.get_assembly_mapping_data( source, target ) resources["gsa_resources"] = self.get_gsa_resources() resources["chip_clusters"] = self.get_chip_clusters() resources["low_quality_snps"] = self.get_low_quality_snps() return resources
[docs] def get_all_reference_sequences(self, **kwargs): """Get Homo sapiens reference sequences for Builds 36, 37, and 38 from Ensembl. Notes ----- This function can download over 2.5GB of data. Returns ------- dict dict of ReferenceSequence, else {} """ for assembly in ("NCBI36", "GRCh37", "GRCh38"): self.get_reference_sequences(assembly=assembly, **kwargs) return self._reference_sequences
[docs] def get_gsa_resources(self): """Get resources for reading Global Screening Array files. https://support.illumina.com/downloads/infinium-global-screening-array-v2-0-product-files.html Returns ------- dict """ return { "rsid_map": self.get_gsa_rsid(), "chrpos_map": self.get_gsa_chrpos(), "dbsnp_151_37_reverse": self.get_dbsnp_151_37_reverse(), }
[docs] def get_chip_clusters(self): """Get resource for identifying deduced genotype / chip array based on chip clusters. Returns ------- pandas.DataFrame References ---------- 1. Chang Lu, Bastian Greshake Tzovaras, Julian Gough, A survey of direct-to-consumer genotype data, and quality control tool (GenomePrep) for research, Computational and Structural Biotechnology Journal, Volume 19, 2021, Pages 3747-3754, ISSN 2001-0370, https://doi.org/10.1016/j.csbj.2021.06.040. """ if self._chip_clusters is None: chip_clusters_path = self._download_file( "https://supfam.mrc-lmb.cam.ac.uk/GenomePrep/datadir/the_list.tsv.gz", "chip_clusters.tsv.gz", ) df = pd.read_csv( chip_clusters_path, sep="\t", names=["locus", "clusters"], dtype={"locus": object, "clusters": pd.CategoricalDtype(ordered=False)}, ) clusters = df.clusters df = df.locus.str.split(":", expand=True) df.rename({0: "chrom", 1: "pos"}, axis=1, inplace=True) df.pos = df.pos.astype(np.uint32) df.chrom = df.chrom.astype(pd.CategoricalDtype(ordered=False)) df["clusters"] = clusters self._chip_clusters = df return self._chip_clusters
[docs] def get_low_quality_snps(self): """Get listing of low quality SNPs for quality control based on chip clusters. Returns ------- pandas.DataFrame References ---------- 1. Chang Lu, Bastian Greshake Tzovaras, Julian Gough, A survey of direct-to-consumer genotype data, and quality control tool (GenomePrep) for research, Computational and Structural Biotechnology Journal, Volume 19, 2021, Pages 3747-3754, ISSN 2001-0370, https://doi.org/10.1016/j.csbj.2021.06.040. """ if self._low_quality_snps is None: low_quality_snps_path = self._download_file( "https://supfam.mrc-lmb.cam.ac.uk/GenomePrep/datadir/badalleles.tsv.gz", "low_quality_snps.tsv.gz", ) df = pd.read_csv( low_quality_snps_path, sep="\t", names=["cluster", "loci"], ) cluster_dfs = [] for row in df.itertuples(): loci = row.loci.split(",") cluster_dfs.append( pd.DataFrame({"cluster": [row.cluster] * len(loci), "locus": loci}) ) df = pd.concat(cluster_dfs) df.reset_index(inplace=True, drop=True) cluster = df.cluster.astype(pd.CategoricalDtype(ordered=False)) df = df.locus.str.split(":", expand=True) df.rename({0: "chrom", 1: "pos"}, axis=1, inplace=True) df.pos = df.pos.astype(np.uint32) df.chrom = df.chrom.astype(pd.CategoricalDtype(ordered=False)) df["cluster"] = cluster self._low_quality_snps = df return self._low_quality_snps
[docs] def get_dbsnp_151_37_reverse(self): """Get and load RSIDs that are on the reference reverse (-) strand in dbSNP 151 and lower. Returns ------- pandas.DataFrame References ---------- 1. Sherry ST, Ward MH, Kholodov M, Baker J, Phan L, Smigielski EM, Sirotkin K. dbSNP: the NCBI database of genetic variation. Nucleic Acids Res. 2001 Jan 1; 29(1):308-11. 2. Database of Single Nucleotide Polymorphisms (dbSNP). Bethesda (MD): National Center for Biotechnology Information, National Library of Medicine. (dbSNP Build ID: 151). Available from: http://www.ncbi.nlm.nih.gov/SNP/ """ if self._dbsnp_151_37_reverse is None: # download the file from the cloud, if not done already dbsnp_rev_path = self._download_file( "https://sano-public.s3.eu-west-2.amazonaws.com/dbsnp151.b37.snps_reverse.txt.gz", "dbsnp_151_37_reverse.txt.gz", ) # load into pandas rsids = pd.read_csv( dbsnp_rev_path, sep=" ", header=None, # dont infer header as there isn't one names=( "dbsnp151revrsid", "dbsnp151freqa", "dbsnp151freqt", "dbsnp151freqc", "dbsnp151freqg", ), dtype={ "dbsnp151revrsid": "string", "dbsnp151freqa": "double", "dbsnp151freqt": "double", "dbsnp151freqc": "double", "dbsnp151freqg": "double", }, engine="c", # force c engine for performance comment="#", # skip the first row ) # store in memory so we don't have to load again self._dbsnp_151_37_reverse = rsids return self._dbsnp_151_37_reverse
[docs] def get_opensnp_datadump_filenames(self): """Get filenames internal to the `openSNP <https://opensnp.org>`_ datadump zip. Per openSNP, "the data is donated into the public domain using `CC0 1.0 <http://creativecommons.org/publicdomain/zero/1.0/>`_." Notes ----- This function can download over 27GB of data. If the download is not successful, try using a different tool like `wget` or `curl` to download the file and move it to the resources directory (see `_get_path_opensnp_datadump`). Returns ------- filenames : list of str filenames internal to the openSNP datadump References ---------- 1. Greshake B, Bayer PE, Rausch H, Reda J (2014), "openSNP-A Crowdsourced Web Resource for Personal Genomics," PLOS ONE, 9(3): e89204, https://doi.org/10.1371/journal.pone.0089204 """ if not self._opensnp_datadump_filenames: self._opensnp_datadump_filenames = self._get_opensnp_datadump_filenames( self._get_path_opensnp_datadump() ) return self._opensnp_datadump_filenames
[docs] def load_opensnp_datadump_file(self, filename): """Load the specified file from the openSNP datadump. Per openSNP, "the data is donated into the public domain using `CC0 1.0 <http://creativecommons.org/publicdomain/zero/1.0/>`_." Parameters ---------- filename : str filename internal to the openSNP datadump Returns ------- bytes content of specified file internal to the openSNP datadump References ---------- 1. Greshake B, Bayer PE, Rausch H, Reda J (2014), "openSNP-A Crowdsourced Web Resource for Personal Genomics," PLOS ONE, 9(3): e89204, https://doi.org/10.1371/journal.pone.0089204 """ if self._get_path_opensnp_datadump(): with zipfile.ZipFile(self._get_path_opensnp_datadump()) as z: with z.open(filename, "r") as f: return f.read() else: return bytes()
@staticmethod def _get_opensnp_datadump_filenames(filename): """Get list of filenames internal to the openSNP datadump zip. Parameters ---------- filename : str path to openSNP datadump Returns ------- filenames : list of str filenames internal to the openSNP datadump """ if filename: with zipfile.ZipFile(filename) as z: return z.namelist() else: return [] @staticmethod def _write_data_to_gzip(f, data): """Write `data` to `f` in `gzip` format. Parameters ---------- f : file object opened with `mode="wb"` data : `bytes` object """ with gzip.open(f, "wb") as f_gzip: f_gzip.write(data) @staticmethod def _load_assembly_mapping_data(filename): """Load assembly mapping data. Parameters ---------- filename : str path to compressed archive with assembly mapping data Returns ------- assembly_mapping_data : dict dict of assembly maps Notes ----- Keys of returned dict are chromosomes and values are the corresponding assembly map. """ assembly_mapping_data = {} with tarfile.open(filename, "r") as tar: # http://stackoverflow.com/a/2018576 for member in tar.getmembers(): if ".json" in member.name: with tar.extractfile(member) as tar_file: tar_bytes = tar_file.read() # https://stackoverflow.com/a/42683509/4727627 assembly_mapping_data[member.name.split(".")[0]] = json.loads( tar_bytes.decode("utf-8") ) return assembly_mapping_data def _get_paths_reference_sequences( self, sub_dir="fasta", assembly="GRCh37", chroms=() ): """Get local paths to Homo sapiens reference sequences from Ensembl. Notes ----- This function can download over 800MB of data for each assembly. Parameters ---------- sub_dir : str directory under resources to store reference sequence data assembly : {'NCBI36', 'GRCh37', 'GRCh38'} reference sequence assembly chroms : list of str reference sequence chromosomes Returns ------- assembly : str reference sequence assembly chroms : list of str reference sequence chromosomes urls : list of str urls to Ensembl reference sequences paths : list of str paths to local reference sequences References ---------- 1. Daniel R. Zerbino, Premanand Achuthan, Wasiu Akanni, M. Ridwan Amode, Daniel Barrell, Jyothish Bhai, Konstantinos Billis, Carla Cummins, Astrid Gall, Carlos García Giro´n, Laurent Gil, Leo Gordon, Leanne Haggerty, Erin Haskell, Thibaut Hourlier, Osagie G. Izuogu, Sophie H. Janacek, Thomas Juettemann, Jimmy Kiang To, Matthew R. Laird, Ilias Lavidas, Zhicheng Liu, Jane E. Loveland, Thomas Maurel, William McLaren, Benjamin Moore, Jonathan Mudge, Daniel N. Murphy, Victoria Newman, Michael Nuhn, Denye Ogeh, Chuang Kee Ong, Anne Parker, Mateus Patricio, Harpreet Singh Riat, Helen Schuilenburg, Dan Sheppard, Helen Sparrow, Kieron Taylor, Anja Thormann, Alessandro Vullo, Brandon Walts, Amonida Zadissa, Adam Frankish, Sarah E. Hunt, Myrto Kostadima, Nicholas Langridge, Fergal J. Martin, Matthieu Muffato, Emily Perry, Magali Ruffier, Dan M. Staines, Stephen J. Trevanion, Bronwen L. Aken, Fiona Cunningham, Andrew Yates, Paul Flicek Ensembl 2018. PubMed PMID: 29155950. doi:10.1093/nar/gkx1098 2. NCBI 36, Oct 2005, Ensembl release 54, Database version: 54.36p 3. GRCh37.p13 (Genome Reference Consortium Human Reference 37), INSDC Assembly GCA_000001405.14, Feb 2009, Ensembl GRCh37 release 96, Database version: 96.37 4. GRCh38.p12 (Genome Reference Consortium Human Build 38), INSDC Assembly GCA_000001405.27, Dec 2013, Ensembl release 96, Database version: 96.38 """ release = "" # https://www.biostars.org/p/374149/#374219 if assembly == "GRCh37": base = "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/" elif assembly == "NCBI36": base = "ftp://ftp.ensembl.org/pub/release-54/fasta/homo_sapiens/dna/" release = "54." elif assembly == "GRCh38": base = "ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/" else: return ("", [], [], []) filenames = [ f"Homo_sapiens.{assembly}.{release}dna.chromosome.{chrom}.fa.gz" for chrom in chroms ] urls = [f"{base}{filename}" for filename in filenames] local_filenames = [ f"{sub_dir}{os.sep}{assembly}{os.sep}{filename}" for filename in filenames ] return ( assembly, chroms, urls, list(map(self._download_file, urls, local_filenames)), ) def _create_reference_sequences(self, assembly, chroms, urls, paths): # https://samtools.github.io/hts-specs/VCFv4.2.pdf seqs = {} for i, path in enumerate(paths): if not path: continue d = {} d["ID"] = chroms[i] d["url"] = urls[i] d["path"] = os.path.relpath(path) d["assembly"] = assembly d["species"] = "Homo sapiens" d["taxonomy"] = "x" seqs[chroms[i]] = ReferenceSequence(**d) return seqs def _get_path_assembly_mapping_data( self, source_assembly, target_assembly, retries=10 ): """Get local path to assembly mapping data, downloading if necessary. Parameters ---------- source_assembly : {'NCBI36', 'GRCh37', 'GRCh38'} assembly to remap from target_assembly : {'NCBI36', 'GRCh37', 'GRCh38'} assembly to remap to retries : int number of retries per chromosome to download assembly mapping data Returns ------- str path to <source_assembly>_<target_assembly>.tar.gz References ---------- 1. Ensembl, Assembly Information Endpoint, https://rest.ensembl.org/documentation/info/assembly_info 2. Ensembl, Assembly Map Endpoint, http://rest.ensembl.org/documentation/info/assembly_map """ if not create_dir(self._resources_dir): return "" chroms = [str(i) for i in range(1, 23)] chroms.extend(["X", "Y", "MT"]) assembly_mapping_data = source_assembly + "_" + target_assembly destination = os.path.join( self._resources_dir, assembly_mapping_data + ".tar.gz" ) if not os.path.exists(destination): logger.info(f"Downloading {os.path.relpath(destination)}") self._download_assembly_mapping_data( destination, chroms, source_assembly, target_assembly, retries ) return destination def _download_assembly_mapping_data( self, destination, chroms, source_assembly, target_assembly, retries ): with atomic_write(destination, mode="wb", overwrite=True) as f: with tarfile.open(fileobj=f, mode="w:gz") as out_tar: for chrom in chroms: file = chrom + ".json" map_endpoint = ( f"/map/human/{source_assembly}/{chrom}/{target_assembly}?" ) # get assembly mapping data response = None retry = 0 while response is None and retry < retries: response = self._ensembl_rest_client.perform_rest_action( map_endpoint ) retry += 1 if response is not None: # open temp file, save json response to file, close temp file with tempfile.NamedTemporaryFile( delete=False, mode="w" ) as f_tmp: json.dump(response, f_tmp) # add temp file to archive out_tar.add(f_tmp.name, arcname=file) # remove temp file os.remove(f_tmp.name)
[docs] def get_gsa_rsid(self): """Get and load GSA RSID map. https://support.illumina.com/downloads/infinium-global-screening-array-v2-0-product-files.html Returns ------- pandas.DataFrame """ if self._gsa_rsid_map is None: # download the file from the cloud, if not done already rsid_path = self._download_file( "https://sano-public.s3.eu-west-2.amazonaws.com/gsa_rsid_map.txt.gz", "gsa_rsid_map.txt.gz", ) # load into pandas rsids = pd.read_csv( rsid_path, sep=r"\s+", # whitespace separators header=0, # dont infer header as there isn't one names=("gsaname_rsid", "gsarsid"), dtype={"gsaname_rsid": "string", "gsarsid": "string"}, engine="c", # force c engine for performance ) self._gsa_rsid_map = rsids return self._gsa_rsid_map
[docs] def get_gsa_chrpos(self): """Get and load GSA chromosome position map. https://support.illumina.com/downloads/infinium-global-screening-array-v2-0-product-files.html Returns ------- pandas.DataFrame """ if self._gsa_chrpos_map is None: # download the file from the cloud, if not done already chrpos_path = self._download_file( "https://sano-public.s3.eu-west-2.amazonaws.com/gsa_chrpos_map.txt.gz", "gsa_chrpos_map.txt.gz", ) # load into pandas chrpos = pd.read_csv( chrpos_path, sep=r"\s+", # whitespace separators header=0, # dont infer header as there isn't one names=("gsaname_chrpos", "gsachr", "gsapos", "gsacm"), dtype={ "gsaname_chrpos": "string", "gsachr": "category", "gsapos": "uint32", "gsacm": "double", }, engine="c", # force c engine for performance ) self._gsa_chrpos_map = chrpos return self._gsa_chrpos_map
def _get_path_opensnp_datadump(self): return self._download_file( "https://opensnp.org/data/zip/opensnp_datadump.current.zip", "opensnp_datadump.current.zip", ) def _download_file(self, url, filename, compress=False, timeout=30): """Download a file to the resources folder. Download data from `url`, save as `filename`, and optionally compress with gzip. Parameters ---------- url : str URL to download data from filename : str name of file to save; if compress, ensure '.gz' is appended compress : bool compress with gzip timeout : int seconds for timeout of download request Returns ------- str path to downloaded file, empty str if error """ if compress and filename[-3:] != ".gz": filename += ".gz" destination = os.path.join(self._resources_dir, filename) if not create_dir(os.path.dirname(destination)): return "" if not os.path.exists(destination): try: # get file if it hasn't already been downloaded # http://stackoverflow.com/a/7244263 with urllib.request.urlopen( url, timeout=timeout ) as response, atomic_write( destination, mode="wb", overwrite=True ) as f: self._print_download_msg(destination) data = response.read() # a `bytes` object if compress: self._write_data_to_gzip(f, data) else: f.write(data) except urllib.error.URLError as err: logger.warning(err) destination = "" # try HTTP if an FTP error occurred if "ftp://" in url: destination = self._download_file( url.replace("ftp://", "http://"), filename, compress=compress, timeout=timeout, ) except socket.timeout: logger.warning(f"Timeout downloading {url}") destination = "" except FileExistsError: # if the file exists, another process has created it while it was # being downloaded # in such a case, the other copy is identical, so ignore this error pass return destination @staticmethod def _print_download_msg(path): """Print download message. Parameters ---------- path : str path to file being downloaded """ logger.info(f"Downloading {os.path.relpath(path)}")
[docs] class ReferenceSequence: """Object used to represent and interact with a reference sequence."""
[docs] def __init__(self, ID="", url="", path="", assembly="", species="", taxonomy=""): """Initialize a ``ReferenceSequence`` object. Parameters ---------- ID : str reference sequence chromosome url : str url to Ensembl reference sequence path : str path to local reference sequence assembly : str reference sequence assembly (e.g., "GRCh37") species : str reference sequence species taxonomy : str reference sequence taxonomy References ---------- 1. The Variant Call Format (VCF) Version 4.2 Specification, 8 Mar 2019, https://samtools.github.io/hts-specs/VCFv4.2.pdf """ self._ID = ID self._url = url self._path = path self._assembly = assembly self._species = species self._taxonomy = taxonomy self._sequence = np.array([], dtype=np.uint8) self._md5 = "" self._start = 0 self._end = 0 self._length = 0
def __repr__(self): return f"ReferenceSequence(assembly={self._assembly!r}, ID={self._ID!r})" @property def ID(self): """Get reference sequence chromosome. Returns ------- str """ return self._ID @property def chrom(self): """Get reference sequence chromosome. Returns ------- str """ return self._ID @property def url(self): """Get URL to Ensembl reference sequence. Returns ------- str """ return self._url @property def path(self): """Get path to local reference sequence. Returns ------- str """ return self._path @property def assembly(self): """Get reference sequence assembly. Returns ------- str """ return self._assembly @property def build(self): """Get reference sequence build. Returns ------- str e.g., "B37" """ return f"B{self._assembly[-2:]}" @property def species(self): """Get reference sequence species. Returns ------- str """ return self._species @property def taxonomy(self): """Get reference sequence taxonomy. Returns ------- str """ return self._taxonomy @property def sequence(self): """Get reference sequence. Returns ------- np.array(dtype=np.uint8) """ self._load_sequence() return self._sequence @property def md5(self): """Get reference sequence MD5 hash. Returns ------- str """ self._load_sequence() return self._md5 @property def start(self): """Get reference sequence start position (1-based). Returns ------- int """ self._load_sequence() return self._start @property def end(self): """Get reference sequence end position (1-based). Returns ------- int """ self._load_sequence() return self._end @property def length(self): """Get reference sequence length. Returns ------- int """ self._load_sequence() return self._sequence.size
[docs] def clear(self): """Clear reference sequence.""" self._sequence = np.array([], dtype=np.uint8) self._md5 = "" self._start = 0 self._end = 0 self._length = 0
def _load_sequence(self): if not self._sequence.size: # decompress and read file with gzip.open(self._path, "rb") as f: data = f.read() # convert bytes to str data = str(data, encoding="utf-8", errors="strict") data = data.splitlines() self._start, self._end = self._parse_first_line(data[0]) # convert str (FASTA sequence) to bytes data = bytearray("".join(data[1:]), encoding="utf-8", errors="strict") # get MD5 of FASTA sequence self._md5 = hashlib.md5(data).hexdigest() # store FASTA sequence as `np.uint8` array self._sequence = np.array(data, dtype=np.uint8) def _parse_first_line(self, first_line): items = first_line.split(":") return ( int(items[items.index(self._ID) + 1]), int(items[items.index(self._ID) + 2]), )