"""Class for downloading and loading required external resources.
References
----------
1. International Human Genome Sequencing Consortium. Initial sequencing and
analysis of the human genome. Nature. 2001 Feb 15;409(6822):860-921.
http://dx.doi.org/10.1038/35057062
2. hg19 (GRCh37): Hiram Clawson, Brooke Rhead, Pauline Fujita, Ann Zweig, Katrina
Learned, Donna Karolchik and Robert Kuhn, https://genome.ucsc.edu/cgi-bin/hgGateway?db=hg19
3. Yates et. al. (doi:10.1093/bioinformatics/btu613),
`<http://europepmc.org/search/?query=DOI:10.1093/bioinformatics/btu613>`_
4. Zerbino et. al. (doi.org/10.1093/nar/gkx1098), https://doi.org/10.1093/nar/gkx1098
"""
import gzip
import hashlib
import itertools
import json
import logging
import os
import socket
import tarfile
import tempfile
import urllib.error
import urllib.request
import numpy as np
import pandas as pd
from atomicwrites import atomic_write
from snps.constants import REFERENCE_SEQUENCE_CHROMS
from snps.ensembl import EnsemblRestClient
from snps.utils import Singleton, create_dir
logger = logging.getLogger(__name__)
[docs]
class Resources(metaclass=Singleton):
"""Object used to manage resources required by `snps`."""
[docs]
def __init__(self, resources_dir="resources"):
"""Initialize a ``Resources`` object.
Parameters
----------
resources_dir : str
name / path of resources directory
"""
self._resources_dir = os.path.abspath(resources_dir)
self._ensembl_rest_client = EnsemblRestClient()
self._init_resource_attributes()
def _init_resource_attributes(self):
self._reference_sequences = {}
self._gsa_rsid_map = None
self._gsa_chrpos_map = None
self._dbsnp_151_37_reverse = None
self._chip_clusters = None
self._low_quality_snps = None
[docs]
def get_reference_sequences(
self,
assembly="GRCh37",
chroms=REFERENCE_SEQUENCE_CHROMS,
):
"""Get Homo sapiens reference sequences for `chroms` of `assembly`.
Notes
-----
This function can download over 800MB of data for each assembly.
Parameters
----------
assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
reference sequence assembly
chroms : list of str
reference sequence chromosomes
Returns
-------
dict
dict of ReferenceSequence, else {}
"""
valid_assemblies = ["NCBI36", "GRCh37", "GRCh38"]
if assembly not in valid_assemblies:
logger.warning("Invalid assembly")
return {}
if not self._reference_chroms_available(assembly, chroms):
self._reference_sequences[assembly] = self._create_reference_sequences(
*self._get_paths_reference_sequences(assembly=assembly, chroms=chroms)
)
return self._reference_sequences[assembly]
def _reference_chroms_available(self, assembly, chroms):
if assembly in self._reference_sequences:
for chrom in chroms:
if chrom not in self._reference_sequences[assembly]:
return False
return True
else:
return False
[docs]
def get_assembly_mapping_data(self, source_assembly, target_assembly):
"""Get assembly mapping data.
Parameters
----------
source_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
assembly to remap from
target_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
assembly to remap to
Returns
-------
dict
dict of json assembly mapping data if loading was successful, else {}
"""
return self._load_assembly_mapping_data(
self._get_path_assembly_mapping_data(source_assembly, target_assembly)
)
[docs]
def create_example_datasets(self, output_dir=None):
"""Create synthetic example datasets for demonstrations.
Generates two correlated genotype files in different formats and builds,
suitable for demonstrating merging and remapping functionality. The files
share ~700K common SNPs with intentional discrepancies to demonstrate
merge conflict detection.
Parameters
----------
output_dir : str, optional
Directory for output files (default: resources directory)
Returns
-------
paths : list of str
Paths to created example datasets
Examples
--------
>>> from snps.resources import Resources
>>> r = Resources()
>>> paths = r.create_example_datasets()
Creating resources/sample1.23andme.txt.gz
Creating resources/sample2.ftdna.csv.gz
"""
from snps.io.generator import SyntheticSNPGenerator
if output_dir is None:
output_dir = self._resources_dir
# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)
# Create correlated dataset pair with realistic merge characteristics
gen = SyntheticSNPGenerator(build=37, seed=47)
path1, path2 = gen.create_example_dataset_pair(output_dir)
return [path1, path2]
[docs]
def get_all_resources(self):
"""Get / download all resources used throughout `snps`.
Notes
-----
This function does not download reference sequences due to their large sizes.
Returns
-------
dict
dict of resources
"""
resources = {}
for source, target in itertools.permutations(["NCBI36", "GRCh37", "GRCh38"], 2):
resources[source + "_" + target] = self.get_assembly_mapping_data(
source, target
)
resources["gsa_resources"] = self.get_gsa_resources()
resources["chip_clusters"] = self.get_chip_clusters()
resources["low_quality_snps"] = self.get_low_quality_snps()
return resources
[docs]
def get_all_reference_sequences(self, **kwargs):
"""Get Homo sapiens reference sequences for Builds 36, 37, and 38 from Ensembl.
Notes
-----
This function can download over 2.5GB of data.
Returns
-------
dict
dict of ReferenceSequence, else {}
"""
for assembly in ("NCBI36", "GRCh37", "GRCh38"):
self.get_reference_sequences(assembly=assembly, **kwargs)
return self._reference_sequences
[docs]
def get_gsa_resources(self):
"""Get resources for reading Global Screening Array files.
https://support.illumina.com/downloads/infinium-global-screening-array-v2-0-product-files.html
Returns
-------
dict
"""
return {
"rsid_map": self.get_gsa_rsid(),
"chrpos_map": self.get_gsa_chrpos(),
"dbsnp_151_37_reverse": self.get_dbsnp_151_37_reverse(),
}
[docs]
def get_chip_clusters(self):
"""Get resource for identifying deduced genotype / chip array based on chip clusters.
Returns
-------
pandas.DataFrame
References
----------
1. Chang Lu, Bastian Greshake Tzovaras, Julian Gough, A survey of
direct-to-consumer genotype data, and quality control tool
(GenomePrep) for research, Computational and Structural
Biotechnology Journal, Volume 19, 2021, Pages 3747-3754, ISSN
2001-0370, https://doi.org/10.1016/j.csbj.2021.06.040.
2. Lu, Tzovaras, & Gough. (2021). OpenSNP data-freeze of 5,393
(19.10.2020) [Data set]. In Computational and Structural
Biotechnology Journal. Zenodo.
https://doi.org/10.1016/j.csbj.2021.06.040
"""
if self._chip_clusters is None:
chip_clusters_path = self._download_file(
"https://zenodo.org/records/5047472/files/the_list.tsv.gz",
"chip_clusters.tsv.gz",
)
df = pd.read_csv(
chip_clusters_path,
sep="\t",
names=["locus", "clusters"],
dtype={"locus": object, "clusters": pd.CategoricalDtype(ordered=False)},
)
clusters = df.clusters
df = df.locus.str.split(":", expand=True)
df.rename({0: "chrom", 1: "pos"}, axis=1, inplace=True)
df.pos = df.pos.astype(np.uint32)
df.chrom = df.chrom.astype(pd.CategoricalDtype(ordered=False))
df["clusters"] = clusters
self._chip_clusters = df
return self._chip_clusters
[docs]
def get_low_quality_snps(self):
"""Get listing of low quality SNPs for quality control based on chip clusters.
Returns
-------
pandas.DataFrame
References
----------
1. Chang Lu, Bastian Greshake Tzovaras, Julian Gough, A survey of
direct-to-consumer genotype data, and quality control tool
(GenomePrep) for research, Computational and Structural
Biotechnology Journal, Volume 19, 2021, Pages 3747-3754, ISSN
2001-0370, https://doi.org/10.1016/j.csbj.2021.06.040.
2. Lu, Tzovaras, & Gough. (2021). OpenSNP data-freeze of 5,393
(19.10.2020) [Data set]. In Computational and Structural
Biotechnology Journal. Zenodo.
https://doi.org/10.1016/j.csbj.2021.06.040
"""
if self._low_quality_snps is None:
low_quality_snps_path = self._download_file(
"https://zenodo.org/records/5047472/files/badalleles.tsv.gz",
"low_quality_snps.tsv.gz",
)
df = pd.read_csv(
low_quality_snps_path,
sep="\t",
names=["cluster", "loci"],
)
cluster_dfs = []
for row in df.itertuples():
loci = row.loci.split(",")
cluster_dfs.append(
pd.DataFrame({"cluster": [row.cluster] * len(loci), "locus": loci})
)
df = pd.concat(cluster_dfs)
df.reset_index(inplace=True, drop=True)
cluster = df.cluster.astype(pd.CategoricalDtype(ordered=False))
df = df.locus.str.split(":", expand=True)
df.rename({0: "chrom", 1: "pos"}, axis=1, inplace=True)
df.pos = df.pos.astype(np.uint32)
df.chrom = df.chrom.astype(pd.CategoricalDtype(ordered=False))
df["cluster"] = cluster
self._low_quality_snps = df
return self._low_quality_snps
[docs]
def get_dbsnp_151_37_reverse(self):
"""Get and load RSIDs that are on the reference reverse (-) strand in dbSNP 151 and lower.
Returns
-------
pandas.DataFrame
References
----------
1. Sherry ST, Ward MH, Kholodov M, Baker J, Phan L, Smigielski EM, Sirotkin K.
dbSNP: the NCBI database of genetic variation. Nucleic Acids Res. 2001 Jan 1;
29(1):308-11.
2. Database of Single Nucleotide Polymorphisms (dbSNP). Bethesda (MD): National Center
for Biotechnology Information, National Library of Medicine. (dbSNP Build ID: 151).
Available from: http://www.ncbi.nlm.nih.gov/SNP/
"""
if self._dbsnp_151_37_reverse is None:
# download the file from the cloud, if not done already
dbsnp_rev_path = self._download_file(
"https://sano-public.s3.eu-west-2.amazonaws.com/dbsnp151.b37.snps_reverse.txt.gz",
"dbsnp_151_37_reverse.txt.gz",
)
# load into pandas
rsids = pd.read_csv(
dbsnp_rev_path,
sep=" ",
header=None, # dont infer header as there isn't one
names=(
"dbsnp151revrsid",
"dbsnp151freqa",
"dbsnp151freqt",
"dbsnp151freqc",
"dbsnp151freqg",
),
dtype={
"dbsnp151revrsid": "string",
"dbsnp151freqa": "double",
"dbsnp151freqt": "double",
"dbsnp151freqc": "double",
"dbsnp151freqg": "double",
},
engine="c", # force c engine for performance
comment="#", # skip the first row
)
# store in memory so we don't have to load again
self._dbsnp_151_37_reverse = rsids
return self._dbsnp_151_37_reverse
@staticmethod
def _write_data_to_gzip(f, data):
"""Write `data` to `f` in `gzip` format.
Parameters
----------
f : file object opened with `mode="wb"`
data : `bytes` object
"""
with gzip.open(f, "wb") as f_gzip:
f_gzip.write(data)
@staticmethod
def _load_assembly_mapping_data(filename):
"""Load assembly mapping data.
Parameters
----------
filename : str
path to compressed archive with assembly mapping data
Returns
-------
assembly_mapping_data : dict
dict of assembly maps
Notes
-----
Keys of returned dict are chromosomes and values are the corresponding assembly map.
"""
assembly_mapping_data = {}
with tarfile.open(filename, "r") as tar:
# http://stackoverflow.com/a/2018576
for member in tar.getmembers():
if ".json" in member.name:
with tar.extractfile(member) as tar_file:
tar_bytes = tar_file.read()
# https://stackoverflow.com/a/42683509/4727627
assembly_mapping_data[member.name.split(".")[0]] = json.loads(
tar_bytes.decode("utf-8")
)
return assembly_mapping_data
def _get_paths_reference_sequences(
self, sub_dir="fasta", assembly="GRCh37", chroms=()
):
"""Get local paths to Homo sapiens reference sequences from Ensembl.
Notes
-----
This function can download over 800MB of data for each assembly.
Parameters
----------
sub_dir : str
directory under resources to store reference sequence data
assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
reference sequence assembly
chroms : list of str
reference sequence chromosomes
Returns
-------
assembly : str
reference sequence assembly
chroms : list of str
reference sequence chromosomes
urls : list of str
urls to Ensembl reference sequences
paths : list of str
paths to local reference sequences
References
----------
1. Daniel R. Zerbino, Premanand Achuthan, Wasiu Akanni, M. Ridwan Amode,
Daniel Barrell, Jyothish Bhai, Konstantinos Billis, Carla Cummins, Astrid Gall,
Carlos García Giro´n, Laurent Gil, Leo Gordon, Leanne Haggerty, Erin Haskell,
Thibaut Hourlier, Osagie G. Izuogu, Sophie H. Janacek, Thomas Juettemann,
Jimmy Kiang To, Matthew R. Laird, Ilias Lavidas, Zhicheng Liu, Jane E. Loveland,
Thomas Maurel, William McLaren, Benjamin Moore, Jonathan Mudge, Daniel N. Murphy,
Victoria Newman, Michael Nuhn, Denye Ogeh, Chuang Kee Ong, Anne Parker,
Mateus Patricio, Harpreet Singh Riat, Helen Schuilenburg, Dan Sheppard,
Helen Sparrow, Kieron Taylor, Anja Thormann, Alessandro Vullo, Brandon Walts,
Amonida Zadissa, Adam Frankish, Sarah E. Hunt, Myrto Kostadima, Nicholas Langridge,
Fergal J. Martin, Matthieu Muffato, Emily Perry, Magali Ruffier, Dan M. Staines,
Stephen J. Trevanion, Bronwen L. Aken, Fiona Cunningham, Andrew Yates, Paul Flicek
Ensembl 2018.
PubMed PMID: 29155950.
doi:10.1093/nar/gkx1098
2. NCBI 36, Oct 2005, Ensembl release 54, Database version: 54.36p
3. GRCh37.p13 (Genome Reference Consortium Human Reference 37),
INSDC Assembly GCA_000001405.14, Feb 2009, Ensembl GRCh37 release 96, Database
version: 96.37
4. GRCh38.p12 (Genome Reference Consortium Human Build 38),
INSDC Assembly GCA_000001405.27, Dec 2013, Ensembl release 96, Database
version: 96.38
"""
release = ""
# https://www.biostars.org/p/374149/#374219
if assembly == "GRCh37":
base = "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/"
elif assembly == "NCBI36":
base = "ftp://ftp.ensembl.org/pub/release-54/fasta/homo_sapiens/dna/"
release = "54."
elif assembly == "GRCh38":
base = "ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/"
else:
return ("", [], [], [])
filenames = [
f"Homo_sapiens.{assembly}.{release}dna.chromosome.{chrom}.fa.gz"
for chrom in chroms
]
urls = [f"{base}{filename}" for filename in filenames]
local_filenames = [
f"{sub_dir}{os.sep}{assembly}{os.sep}{filename}" for filename in filenames
]
return (
assembly,
chroms,
urls,
list(map(self._download_file, urls, local_filenames)),
)
def _create_reference_sequences(self, assembly, chroms, urls, paths):
# https://samtools.github.io/hts-specs/VCFv4.3.pdf
seqs = {}
for i, path in enumerate(paths):
if not path:
continue
d = {}
d["ID"] = chroms[i]
d["url"] = urls[i]
d["path"] = os.path.relpath(path)
d["assembly"] = assembly
d["species"] = "Homo sapiens"
d["taxonomy"] = "x"
seqs[chroms[i]] = ReferenceSequence(**d)
return seqs
def _get_path_assembly_mapping_data(
self, source_assembly, target_assembly, retries=10
):
"""Get local path to assembly mapping data, downloading if necessary.
Parameters
----------
source_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
assembly to remap from
target_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
assembly to remap to
retries : int
number of retries per chromosome to download assembly mapping data
Returns
-------
str
path to <source_assembly>_<target_assembly>.tar.gz
References
----------
1. Ensembl, Assembly Information Endpoint,
https://rest.ensembl.org/documentation/info/assembly_info
2. Ensembl, Assembly Map Endpoint,
http://rest.ensembl.org/documentation/info/assembly_map
"""
if not create_dir(self._resources_dir):
return ""
chroms = [str(i) for i in range(1, 23)]
chroms.extend(["X", "Y", "MT"])
assembly_mapping_data = source_assembly + "_" + target_assembly
destination = os.path.join(
self._resources_dir, assembly_mapping_data + ".tar.gz"
)
if not os.path.exists(destination):
logger.info(f"Downloading {os.path.relpath(destination)}")
self._download_assembly_mapping_data(
destination, chroms, source_assembly, target_assembly, retries
)
return destination
def _download_assembly_mapping_data(
self, destination, chroms, source_assembly, target_assembly, retries
):
with atomic_write(destination, mode="wb", overwrite=True) as f:
with tarfile.open(fileobj=f, mode="w:gz") as out_tar:
for chrom in chroms:
file = chrom + ".json"
map_endpoint = (
f"/map/human/{source_assembly}/{chrom}/{target_assembly}?"
)
# get assembly mapping data
response = None
retry = 0
while response is None and retry < retries:
response = self._ensembl_rest_client.perform_rest_action(
map_endpoint
)
retry += 1
if response is not None:
# open temp file, save json response to file, close temp file
with tempfile.NamedTemporaryFile(
delete=False, mode="w"
) as f_tmp:
json.dump(response, f_tmp)
# add temp file to archive
out_tar.add(f_tmp.name, arcname=file)
# remove temp file
os.remove(f_tmp.name)
[docs]
def get_gsa_rsid(self):
"""Get and load GSA RSID map.
https://support.illumina.com/downloads/infinium-global-screening-array-v2-0-product-files.html
Returns
-------
pandas.DataFrame
"""
if self._gsa_rsid_map is None:
# download the file from the cloud, if not done already
rsid_path = self._download_file(
"https://sano-public.s3.eu-west-2.amazonaws.com/gsa_rsid_map.txt.gz",
"gsa_rsid_map.txt.gz",
)
# load into pandas
rsids = pd.read_csv(
rsid_path,
sep=r"\s+", # whitespace separators
header=0, # dont infer header as there isn't one
names=("gsaname_rsid", "gsarsid"),
dtype={"gsaname_rsid": "string", "gsarsid": "string"},
engine="c", # force c engine for performance
)
self._gsa_rsid_map = rsids
return self._gsa_rsid_map
[docs]
def get_gsa_chrpos(self):
"""Get and load GSA chromosome position map.
https://support.illumina.com/downloads/infinium-global-screening-array-v2-0-product-files.html
Returns
-------
pandas.DataFrame
"""
if self._gsa_chrpos_map is None:
# download the file from the cloud, if not done already
chrpos_path = self._download_file(
"https://sano-public.s3.eu-west-2.amazonaws.com/gsa_chrpos_map.txt.gz",
"gsa_chrpos_map.txt.gz",
)
# load into pandas
chrpos = pd.read_csv(
chrpos_path,
sep=r"\s+", # whitespace separators
header=0, # dont infer header as there isn't one
names=("gsaname_chrpos", "gsachr", "gsapos", "gsacm"),
dtype={
"gsaname_chrpos": "string",
"gsachr": "category",
"gsapos": "uint32",
"gsacm": "double",
},
engine="c", # force c engine for performance
)
self._gsa_chrpos_map = chrpos
return self._gsa_chrpos_map
def _download_file(self, url, filename, compress=False, timeout=30):
"""Download a file to the resources folder.
Download data from `url`, save as `filename`, and optionally compress with gzip.
Parameters
----------
url : str
URL to download data from
filename : str
name of file to save; if compress, ensure '.gz' is appended
compress : bool
compress with gzip
timeout : int
seconds for timeout of download request
Returns
-------
str
path to downloaded file, empty str if error
"""
if compress and filename[-3:] != ".gz":
filename += ".gz"
destination = os.path.join(self._resources_dir, filename)
if not create_dir(os.path.dirname(destination)):
return ""
if not os.path.exists(destination):
try:
# get file if it hasn't already been downloaded
# http://stackoverflow.com/a/7244263
with urllib.request.urlopen(url, timeout=timeout) as response:
with atomic_write(destination, mode="wb", overwrite=True) as f:
self._print_download_msg(destination)
data = response.read() # a `bytes` object
if compress:
self._write_data_to_gzip(f, data)
else:
f.write(data)
except urllib.error.URLError as err:
logger.warning(err)
destination = ""
# try HTTP if an FTP error occurred
if "ftp://" in url:
destination = self._download_file(
url.replace("ftp://", "http://"),
filename,
compress=compress,
timeout=timeout,
)
except socket.timeout:
logger.warning(f"Timeout downloading {url}")
destination = ""
except FileExistsError:
# if the file exists, another process has created it while it was
# being downloaded
# in such a case, the other copy is identical, so ignore this error
pass
return destination
@staticmethod
def _print_download_msg(path):
"""Print download message.
Parameters
----------
path : str
path to file being downloaded
"""
logger.info(f"Downloading {os.path.relpath(path)}")
[docs]
class ReferenceSequence:
"""Object used to represent and interact with a reference sequence."""
[docs]
def __init__(self, ID="", url="", path="", assembly="", species="", taxonomy=""):
"""Initialize a ``ReferenceSequence`` object.
Parameters
----------
ID : str
reference sequence chromosome
url : str
url to Ensembl reference sequence
path : str
path to local reference sequence
assembly : str
reference sequence assembly (e.g., "GRCh37")
species : str
reference sequence species
taxonomy : str
reference sequence taxonomy
References
----------
1. The Variant Call Format (VCF) Version 4.3 Specification, 27 Nov 2022,
https://samtools.github.io/hts-specs/VCFv4.3.pdf
"""
self._ID = ID
self._url = url
self._path = path
self._assembly = assembly
self._species = species
self._taxonomy = taxonomy
self._sequence = np.array([], dtype=np.uint8)
self._md5 = ""
self._start = 0
self._end = 0
self._length = 0
def __repr__(self):
return f"ReferenceSequence(assembly={self._assembly!r}, ID={self._ID!r})"
@property
def ID(self):
"""Get reference sequence chromosome.
Returns
-------
str
"""
return self._ID
@property
def chrom(self):
"""Get reference sequence chromosome.
Returns
-------
str
"""
return self._ID
@property
def url(self):
"""Get URL to Ensembl reference sequence.
Returns
-------
str
"""
return self._url
@property
def path(self):
"""Get path to local reference sequence.
Returns
-------
str
"""
return self._path
@property
def assembly(self):
"""Get reference sequence assembly.
Returns
-------
str
"""
return self._assembly
@property
def build(self):
"""Get reference sequence build.
Returns
-------
str
e.g., "B37"
"""
return f"B{self._assembly[-2:]}"
@property
def species(self):
"""Get reference sequence species.
Returns
-------
str
"""
return self._species
@property
def taxonomy(self):
"""Get reference sequence taxonomy.
Returns
-------
str
"""
return self._taxonomy
@property
def sequence(self):
"""Get reference sequence.
Returns
-------
np.array(dtype=np.uint8)
"""
self._load_sequence()
return self._sequence
@property
def md5(self):
"""Get reference sequence MD5 hash.
Returns
-------
str
"""
self._load_sequence()
return self._md5
@property
def start(self):
"""Get reference sequence start position (1-based).
Returns
-------
int
"""
self._load_sequence()
return self._start
@property
def end(self):
"""Get reference sequence end position (1-based).
Returns
-------
int
"""
self._load_sequence()
return self._end
@property
def length(self):
"""Get reference sequence length.
Returns
-------
int
"""
self._load_sequence()
return self._sequence.size
[docs]
def clear(self):
"""Clear reference sequence."""
self._sequence = np.array([], dtype=np.uint8)
self._md5 = ""
self._start = 0
self._end = 0
self._length = 0
def _load_sequence(self):
if not self._sequence.size:
# decompress and read file
with gzip.open(self._path, "rb") as f:
data = f.read()
# convert bytes to str
data = str(data, encoding="utf-8", errors="strict")
data = data.splitlines()
self._start, self._end = self._parse_first_line(data[0])
# convert str (FASTA sequence) to bytes
data = bytearray("".join(data[1:]), encoding="utf-8", errors="strict")
# get MD5 of FASTA sequence
self._md5 = hashlib.md5(data).hexdigest()
# store FASTA sequence as `np.uint8` array
self._sequence = np.array(data, dtype=np.uint8)
def _parse_first_line(self, first_line):
items = first_line.split(":")
return (
int(items[items.index(self._ID) + 1]),
int(items[items.index(self._ID) + 2]),
)