seqmat

SeqMat — fast, vectorized genomic sequences with first-class mutation tracking.

View Source

 1"""SeqMat — fast, vectorized genomic sequences with first-class mutation tracking."""
 2
 3__version__ = "1.5.1"
 4__author__ = "Nicolas Lynn Vila"
 5__email__ = "nicolasalynn@gmail.com"
 6
 7# Core classes
 8from .seqmat import SeqMat
 9from .gene import Gene
10from .transcript import Transcript
11
12# Config / paths
13from .config import (
14    get_config_dir,
15    get_config_file,
16    get_data_base,
17    get_data_dir,
18    get_default_organism,
19    load_config,
20    save_config,
21)
22
23# Optional LMDB backend
24from .lmdb_store import build_lmdb
25
26# Position → gene lookup
27from .locator import build_location_index, gene_names_at_position
28
29# Data setup
30from .data_setup import set_fasta_path, setup_genomics_data
31
32# Discovery
33from .discovery import (
34    available_genes,
35    count_genes,
36    data_summary,
37    get_all_genes,
38    get_gene_list,
39    get_organism_info,
40    list_available_organisms,
41    list_gene_biotypes,
42    list_supported_organisms,
43    print_data_summary,
44    search_genes,
45)
46
47# Errors
48from .errors import (
49    DataUnavailableError,
50    GeneNotFoundError,
51    OrganismNotConfiguredError,
52    PrebuiltDataUnavailableError,
53    SeqMatError,
54)
55
56__all__ = [
57    # Core
58    "SeqMat", "Gene", "Transcript",
59    # Config / paths
60    "get_default_organism", "get_data_dir", "get_config_dir",
61    "get_config_file", "get_data_base", "load_config", "save_config",
62    # Setup
63    "setup_genomics_data", "set_fasta_path",
64    # Discovery
65    "list_available_organisms", "list_supported_organisms", "get_organism_info",
66    "list_gene_biotypes", "count_genes", "get_gene_list", "get_all_genes",
67    "data_summary", "print_data_summary", "search_genes", "available_genes",
68    # Position lookup
69    "gene_names_at_position", "build_location_index",
70    # LMDB
71    "build_lmdb",
72    # Errors
73    "SeqMatError", "GeneNotFoundError", "OrganismNotConfiguredError",
74    "DataUnavailableError", "PrebuiltDataUnavailableError",
75]

class Gene: View Source

 17class Gene:
 18    """
 19    A class representing a Gene, with associated transcripts and metadata.
 20
 21    Attributes:
 22        organism (str): The organism build (e.g. 'hg38').
 23        transcripts (dict): A dictionary of transcript annotations keyed by transcript ID.
 24        gene_name (str): The name of the gene.
 25        gene_id (str): The unique identifier for the gene.
 26        chrm (str): The chromosome on which the gene resides.
 27        rev (bool): Whether the gene is on the reverse strand.
 28    """
 29
 30    def __init__(self, gene_name: str, gene_id: str, rev: bool, chrm: str, 
 31                 transcripts: Optional[Dict[str, Any]] = None, organism: Optional[str] = None):
 32        """
 33        Initialize a Gene instance.
 34
 35        Args:
 36            gene_name: Name of the gene
 37            gene_id: Unique identifier for the gene
 38            rev: Whether gene is on reverse strand
 39            chrm: Chromosome identifier
 40            transcripts: Dictionary of transcript annotations
 41            organism: Organism reference build (default from config)
 42        """
 43        self.gene_name = gene_name
 44        self.gene_id = gene_id
 45        self.rev = rev
 46        self.chrm = chrm
 47        self.organism = organism if organism is not None else get_default_organism()
 48        self.transcripts = transcripts if transcripts is not None else {}
 49
 50    def __repr__(self) -> str:
 51        """Official string representation of the Gene object."""
 52        return f"Gene({self.gene_name})"
 53
 54    def __str__(self) -> str:
 55        """User-friendly string representation of the Gene object."""
 56        return f"Gene: {self.gene_name}, ID: {self.gene_id}, Chr: {self.chrm}, Transcripts: {len(self.transcripts)}"
 57
 58    def __len__(self) -> int:
 59        """Returns the number of transcripts associated with this gene."""
 60        return len(self.transcripts)
 61
 62    def __iter__(self) -> Iterator[Transcript]:
 63        """Allow iteration over the gene's transcripts, yielding Transcript objects."""
 64        for tid, annotations in self.transcripts.items():
 65            yield Transcript(annotations, organism=self.organism)
 66
 67    def __getitem__(self, item: str) -> Optional[Transcript]:
 68        """Get a transcript by ID. Returns ``None`` if the ID isn't an annotated transcript."""
 69        if item not in self.transcripts:
 70            _log.warning("%s is not an annotated transcript of %s", item, self.gene_name)
 71            return None
 72        return Transcript(self.transcripts[item], organism=self.organism)
 73
 74    @classmethod
 75    def _load_dict(cls, gene_name: str, organism: str) -> Optional[Dict[str, Any]]:
 76        """Load the raw gene dict from LMDB → SQLite → per-gene pickle, in that order.
 77
 78        Returns ``None`` when the gene isn't found in any backend.
 79        """
 80        try:
 81            from .lmdb_store import load_gene_from_lmdb
 82            data = load_gene_from_lmdb(gene_name, organism)
 83            if data is not None:
 84                return data
 85        except ImportError:
 86            pass
 87        try:
 88            from .sqlite_store import load_gene_from_sqlite
 89            data = load_gene_from_sqlite(gene_name, organism)
 90            if data is not None:
 91                return data
 92        except ImportError:
 93            pass
 94        try:
 95            config = get_organism_config(organism)
 96        except ValueError as exc:
 97            raise OrganismNotConfiguredError(
 98                f"Organism '{organism}' not configured. Run setup_genomics_data() first."
 99            ) from exc
100        mrna_path = Path(config["MRNA_PATH"])
101        if mrna_path.exists():
102            for biotype_dir in mrna_path.iterdir():
103                if biotype_dir.is_dir():
104                    for pkl in biotype_dir.glob(f"*_{gene_name}.pkl"):
105                        return unload_pickle(pkl)
106        return None
107
108    @classmethod
109    def get(cls, gene_name: str, organism: Optional[str] = None) -> 'Gene':
110        """Load a gene, raising on failure.
111
112        Raises:
113            OrganismNotConfiguredError: when the organism has no data set up.
114            GeneNotFoundError: when the organism is configured but the gene is missing.
115        """
116        if organism is None:
117            organism = get_default_organism()
118        data = cls._load_dict(gene_name, organism)
119        if data is None:
120            raise GeneNotFoundError(
121                f"Gene '{gene_name}' not found in organism '{organism}'."
122            )
123        return cls(
124            gene_name=data.get('gene_name'),
125            gene_id=data.get('gene_id'),
126            rev=data.get('rev'),
127            chrm=data.get('chrm'),
128            transcripts=data.get('transcripts', {}),
129            organism=organism,
130        )
131
132    @classmethod
133    def from_file(cls, gene_name: str, organism: Optional[str] = None) -> Optional['Gene']:
134        """Load a gene by name, returning ``None`` if not found.
135
136        Load order: LMDB (if installed) → SQLite (``genes.db``) → per-gene pickle files.
137        Prefer :meth:`get` if you'd rather have a typed exception than ``None``.
138        """
139        if organism is None:
140            organism = get_default_organism()
141        try:
142            return cls.get(gene_name, organism=organism)
143        except OrganismNotConfiguredError as exc:
144            _log.warning("%s", exc)
145            return None
146        except GeneNotFoundError as exc:
147            _log.warning("%s", exc)
148            return None
149
150    @classmethod
151    def from_position(
152        cls,
153        chrm: str,
154        pos: PosArg,
155        organism: Optional[str] = None,
156    ) -> List['Gene']:
157        """Return all genes overlapping a point or range on a chromosome.
158
159        Args:
160            chrm: Chromosome (e.g. "12", "chr12", "X"). Leading 'chr' is stripped.
161            pos: Either an int position or a (start, end) tuple (inclusive).
162            organism: Organism build (uses default if None).
163
164        Returns:
165            List of Gene objects, possibly empty. Returned in ascending start order.
166        """
167        if organism is None:
168            organism = get_default_organism()
169        names = gene_names_at_position(chrm, pos, organism=organism)
170        genes: List['Gene'] = []
171        for name in names:
172            g = cls.from_file(name, organism=organism)
173            if g is not None:
174                genes.append(g)
175        return genes
176
177    def splice_sites(self) -> Tuple[Counter, Counter]:
178        """Return (Counter of acceptors, Counter of donors) across all transcripts."""
179        acceptors: List[Any] = []
180        donors: List[Any] = []
181        for transcript in self.transcripts.values():
182            acceptors.extend(transcript.get('acceptors', []))
183            donors.extend(transcript.get('donors', []))
184
185        return Counter(acceptors), Counter(donors)
186
187    def transcript(self, tid: Optional[str] = None) -> Optional[Transcript]:
188        """Return Transcript by ID, or primary transcript if tid is None."""
189        if tid is None:
190            tid = self.primary_transcript
191            
192        if tid is None or tid not in self.transcripts:
193            return None
194
195        return Transcript(self.transcripts[tid], organism=self.organism)
196
197    @property
198    def primary_transcript(self) -> Optional[str]:
199        """Primary transcript ID, or first protein-coding transcript, or None."""
200        if hasattr(self, "_primary_transcript"):
201            return self._primary_transcript
202        primary = [k for k, v in self.transcripts.items() if v.get("primary_transcript")]
203        if primary:
204            self._primary_transcript = primary[0]
205            return self._primary_transcript
206        protein_coding = [k for k, v in self.transcripts.items()
207                         if v.get("transcript_biotype") == "protein_coding"]
208        if protein_coding:
209            self._primary_transcript = protein_coding[0]
210            return self._primary_transcript
211        self._primary_transcript = None
212        return None

A class representing a Gene, with associated transcripts and metadata.

Attributes:

organism (str): The organism build (e.g. 'hg38').
transcripts (dict): A dictionary of transcript annotations keyed by transcript ID.
gene_name (str): The name of the gene.
gene_id (str): The unique identifier for the gene.
chrm (str): The chromosome on which the gene resides.
rev (bool): Whether the gene is on the reverse strand.

Gene( gene_name: str, gene_id: str, rev: bool, chrm: str, transcripts: Optional[Dict[str, Any]] = None, organism: Optional[str] = None) View Source

30    def __init__(self, gene_name: str, gene_id: str, rev: bool, chrm: str, 
31                 transcripts: Optional[Dict[str, Any]] = None, organism: Optional[str] = None):
32        """
33        Initialize a Gene instance.
34
35        Args:
36            gene_name: Name of the gene
37            gene_id: Unique identifier for the gene
38            rev: Whether gene is on reverse strand
39            chrm: Chromosome identifier
40            transcripts: Dictionary of transcript annotations
41            organism: Organism reference build (default from config)
42        """
43        self.gene_name = gene_name
44        self.gene_id = gene_id
45        self.rev = rev
46        self.chrm = chrm
47        self.organism = organism if organism is not None else get_default_organism()
48        self.transcripts = transcripts if transcripts is not None else {}

Initialize a Gene instance.

Arguments:

gene_name: Name of the gene
gene_id: Unique identifier for the gene
rev: Whether gene is on reverse strand
chrm: Chromosome identifier
transcripts: Dictionary of transcript annotations
organism: Organism reference build (default from config)

gene_name

gene_id

rev

chrm

organism

transcripts

@classmethod

def get(cls, gene_name: str, organism: Optional[str] = None) -> Gene: View Source

108    @classmethod
109    def get(cls, gene_name: str, organism: Optional[str] = None) -> 'Gene':
110        """Load a gene, raising on failure.
111
112        Raises:
113            OrganismNotConfiguredError: when the organism has no data set up.
114            GeneNotFoundError: when the organism is configured but the gene is missing.
115        """
116        if organism is None:
117            organism = get_default_organism()
118        data = cls._load_dict(gene_name, organism)
119        if data is None:
120            raise GeneNotFoundError(
121                f"Gene '{gene_name}' not found in organism '{organism}'."
122            )
123        return cls(
124            gene_name=data.get('gene_name'),
125            gene_id=data.get('gene_id'),
126            rev=data.get('rev'),
127            chrm=data.get('chrm'),
128            transcripts=data.get('transcripts', {}),
129            organism=organism,
130        )

Load a gene, raising on failure.

Raises:

OrganismNotConfiguredError: when the organism has no data set up.
GeneNotFoundError: when the organism is configured but the gene is missing.

@classmethod

def from_file( cls, gene_name: str, organism: Optional[str] = None) -> Optional[Gene]: View Source

132    @classmethod
133    def from_file(cls, gene_name: str, organism: Optional[str] = None) -> Optional['Gene']:
134        """Load a gene by name, returning ``None`` if not found.
135
136        Load order: LMDB (if installed) → SQLite (``genes.db``) → per-gene pickle files.
137        Prefer :meth:`get` if you'd rather have a typed exception than ``None``.
138        """
139        if organism is None:
140            organism = get_default_organism()
141        try:
142            return cls.get(gene_name, organism=organism)
143        except OrganismNotConfiguredError as exc:
144            _log.warning("%s", exc)
145            return None
146        except GeneNotFoundError as exc:
147            _log.warning("%s", exc)
148            return None

Load a gene by name, returning None if not found.

Load order: LMDB (if installed) → SQLite (genes.db) → per-gene pickle files. Prefer get() if you'd rather have a typed exception than None.

@classmethod

def from_position( cls, chrm: str, pos: Union[int, Tuple[int, int], List[int]], organism: Optional[str] = None) -> List[Gene]: View Source

150    @classmethod
151    def from_position(
152        cls,
153        chrm: str,
154        pos: PosArg,
155        organism: Optional[str] = None,
156    ) -> List['Gene']:
157        """Return all genes overlapping a point or range on a chromosome.
158
159        Args:
160            chrm: Chromosome (e.g. "12", "chr12", "X"). Leading 'chr' is stripped.
161            pos: Either an int position or a (start, end) tuple (inclusive).
162            organism: Organism build (uses default if None).
163
164        Returns:
165            List of Gene objects, possibly empty. Returned in ascending start order.
166        """
167        if organism is None:
168            organism = get_default_organism()
169        names = gene_names_at_position(chrm, pos, organism=organism)
170        genes: List['Gene'] = []
171        for name in names:
172            g = cls.from_file(name, organism=organism)
173            if g is not None:
174                genes.append(g)
175        return genes

Return all genes overlapping a point or range on a chromosome.

Arguments:

chrm: Chromosome (e.g. "12", "chr12", "X"). Leading 'chr' is stripped.
pos: Either an int position or a (start, end) tuple (inclusive).
organism: Organism build (uses default if None).

Returns:

List of Gene objects, possibly empty. Returned in ascending start order.

def splice_sites(self) -> Tuple[collections.Counter, collections.Counter]: View Source

177    def splice_sites(self) -> Tuple[Counter, Counter]:
178        """Return (Counter of acceptors, Counter of donors) across all transcripts."""
179        acceptors: List[Any] = []
180        donors: List[Any] = []
181        for transcript in self.transcripts.values():
182            acceptors.extend(transcript.get('acceptors', []))
183            donors.extend(transcript.get('donors', []))
184
185        return Counter(acceptors), Counter(donors)

Return (Counter of acceptors, Counter of donors) across all transcripts.

def transcript( self, tid: Optional[str] = None) -> Optional[Transcript]: View Source

187    def transcript(self, tid: Optional[str] = None) -> Optional[Transcript]:
188        """Return Transcript by ID, or primary transcript if tid is None."""
189        if tid is None:
190            tid = self.primary_transcript
191            
192        if tid is None or tid not in self.transcripts:
193            return None
194
195        return Transcript(self.transcripts[tid], organism=self.organism)

Return Transcript by ID, or primary transcript if tid is None.

primary_transcript: Optional[str] View Source

197    @property
198    def primary_transcript(self) -> Optional[str]:
199        """Primary transcript ID, or first protein-coding transcript, or None."""
200        if hasattr(self, "_primary_transcript"):
201            return self._primary_transcript
202        primary = [k for k, v in self.transcripts.items() if v.get("primary_transcript")]
203        if primary:
204            self._primary_transcript = primary[0]
205            return self._primary_transcript
206        protein_coding = [k for k, v in self.transcripts.items()
207                         if v.get("transcript_biotype") == "protein_coding"]
208        if protein_coding:
209            self._primary_transcript = protein_coding[0]
210            return self._primary_transcript
211        self._primary_transcript = None
212        return None

Primary transcript ID, or first protein-coding transcript, or None.

def get_default_organism() -> str: View Source

def get_default_organism() -> str:
    """Resolve the default organism build.

    In config-less mode (``get_data_base()`` returns a path) the value comes
    from the ``SEQMAT_DEFAULT_ORGANISM`` environment variable, falling back to
    ``"hg38"`` when it is unset or blank. Otherwise it comes from the
    ``default_organism`` entry of the loaded config, falling back to
    ``DEFAULT_SETTINGS``.
    """
    if get_data_base() is None:
        settings = load_config()
        return settings.get('default_organism', DEFAULT_SETTINGS['default_organism'])

    from_env = os.environ.get("SEQMAT_DEFAULT_ORGANISM", "").strip()
    return from_env if from_env else "hg38"

Default organism: from SEQMAT_DEFAULT_ORGANISM in config-less mode, else from config file.

def get_data_dir() -> pathlib.Path: View Source

def get_data_dir() -> Path:
    """Return the seqmat user data directory (the module-level ``DEFAULT_DATA_DIR``)."""
    return DEFAULT_DATA_DIR

Return user data directory for seqmat (platformdirs).

def get_config_dir() -> pathlib.Path: View Source

def get_config_dir() -> Path:
    """Return the directory holding the active config file (``DEFAULT_CONFIG_DIR``)."""
    return DEFAULT_CONFIG_DIR

Return the directory containing the active config file.

def get_config_file() -> pathlib.Path: View Source

def get_config_file() -> Path:
    """Return the path of the active config file (``CONFIG_FILE``), e.g. for display."""
    return CONFIG_FILE

Return the active config file path (for display or override).

def get_data_base() -> Optional[pathlib.Path]: View Source

def get_data_base() -> Optional[Path]:
    """Return the config-less data root taken from ``SEQMAT_DATA_DIR``, if usable.

    The variable's value is stripped, ``~``-expanded and resolved. The resulting
    path is returned only when it exists; an unset or blank variable, or a path
    that does not exist, yields ``None``.
    """
    raw = os.environ.get("SEQMAT_DATA_DIR", "").strip()
    if raw:
        candidate = Path(raw).expanduser().resolve()
        if candidate.exists():
            return candidate
    return None

If SEQMAT_DATA_DIR is set and exists, return that path (config-less mode). Else None.

def load_config() -> Dict[str, Any]: View Source

def load_config() -> Dict[str, Any]:
    """Load config from CONFIG_FILE, merged with DEFAULT_SETTINGS.

    Returns:
        A new dict that starts as a shallow copy of ``DEFAULT_SETTINGS`` and is
        overlaid with the top-level keys stored in ``CONFIG_FILE`` when that
        file exists.

    Raises:
        json.JSONDecodeError: when the file exists but is not valid JSON.
    """
    merged_config = DEFAULT_SETTINGS.copy()
    if CONFIG_FILE.exists():
        # JSON is UTF-8 by specification; relying on the platform default
        # encoding would mis-decode non-ASCII paths on non-UTF-8 locales.
        with open(CONFIG_FILE, "r", encoding="utf-8") as f:
            merged_config.update(json.load(f))
    return merged_config

Load config from CONFIG_FILE, merged with DEFAULT_SETTINGS.

def save_config(config: Dict[str, Any]) -> None: View Source

def save_config(config: Dict[str, Any]) -> None:
    """Save config to CONFIG_FILE (creates parent dir if needed).

    The config is serialised to a string before the file is opened, so a value
    that cannot be encoded as JSON raises without truncating an existing
    config file.

    Raises:
        TypeError: when ``config`` contains a value ``json`` cannot serialise.
    """
    payload = json.dumps(config, indent=2)
    CONFIG_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(CONFIG_FILE, "w", encoding="utf-8") as f:
        f.write(payload)

Save config to CONFIG_FILE (creates parent dir if needed).

def setup_genomics_data( basepath: str, organism: Optional[str] = None, force: bool = False, pickup: bool = False, n_jobs: Optional[int] = None, from_prebuilt: bool = True) -> None: View Source

 72def setup_genomics_data(
 73    basepath: str,
 74    organism: Optional[str] = None,
 75    force: bool = False,
 76    pickup: bool = False,
 77    n_jobs: Optional[int] = None,
 78    from_prebuilt: bool = True,
 79) -> None:
 80    """Set up genomics data for a specific organism.
 81
 82    Args:
 83        basepath: Base directory for storing genomic data.
 84        organism: Organism identifier (e.g. ``"hg38"`` or ``"mm39"``). Uses the configured
 85            default if omitted.
 86        force: Overwrite existing data.
 87        pickup: Resume an interrupted setup, reusing any already-downloaded files.
 88        n_jobs: Workers for GTF parsing (default: CPU count - 1). ``1`` is serial.
 89        from_prebuilt: When ``True`` (default), download prebuilt ``genes.db`` + FASTA from S3.
 90            When ``False``, download GTF/FASTA from Ensembl/UCSC and build ``genes.db`` locally.
 91    """
 92    if organism is None:
 93        organism = get_default_organism()
 94    base_path = Path(basepath) / organism
 95
 96    config_paths = {
 97        "CHROM_SOURCE": str(base_path),
 98        "MRNA_PATH": str(base_path),
 99        "MISSPLICING_PATH": str(base_path / "missplicing"),
100        "ONCOSPLICE_PATH": str(base_path / "oncosplice"),
101        "BASE": str(base_path),
102        "TEMP": str(base_path / "temp"),
103    }
104
105    config = load_config()
106    if organism in config and not force and not pickup:
107        _log.warning(
108            "Organism %s already configured. Use force=True to overwrite or pickup=True to resume.",
109            organism,
110        )
111        return
112
113    if base_path.exists() and any(base_path.iterdir()) and not force and not pickup:
114        _log.warning(
115            "Directory %s not empty. Use force=True to overwrite or pickup=True to resume.",
116            base_path,
117        )
118        return
119
120    _log.info("Setting up genomics data in %s", base_path)
121    base_path.mkdir(parents=True, exist_ok=True)
122
123    if from_prebuilt:
124        _log.info("Downloading prebuilt data for %s from S3...", organism)
125        try:
126            files = download_prebuilt_data(organism, base_path, skip_existing=pickup)
127        except PrebuiltDataUnavailableError as e:
128            _log.error("%s", e)
129            raise
130        config_paths["fasta_full_genome"] = str(files["fasta_file"])
131        config_paths["genes_db"] = str(files["genes_db"])
132    else:
133        _log.info("Downloading source data and building genes.db for %s...", organism)
134        files = download_genome_data(organism, base_path, skip_existing=pickup)
135        config_paths["fasta_full_genome"] = str(files["fasta_file"])
136        cons_data = None
137        if files["cons_file"] is not None:
138            try:
139                cons_data = load_conservation(files["cons_file"])
140            except Exception:
141                cons_data = None
142        annotation_jobs = n_jobs if n_jobs is not None else max(1, cpu_count() - 1)
143        retrieve_and_parse_ensembl_annotations(
144            base_path,
145            files["ensembl_file"],
146            cons_data,
147            gtex_file=files["gtex_file"],
148            n_jobs=annotation_jobs,
149        )
150        config_paths["genes_db"] = str(base_path / "genes.db")
151        if not pickup:
152            for key, file_path in files.items():
153                if key == "fasta_file":
154                    continue
155                if file_path and file_path.exists():
156                    file_path.unlink()
157
158    for path_key in ("MISSPLICING_PATH", "ONCOSPLICE_PATH", "TEMP"):
159        Path(config_paths[path_key]).mkdir(parents=True, exist_ok=True)
160
161    if get_data_base() is None:
162        config[organism] = config_paths
163        save_config(config)
164        _log.info("Configuration saved to: %s", CONFIG_FILE)
165    else:
166        _log.info("SEQMAT_DATA_DIR is set; no config file written.")
167    _append_seqmat_env_to_shell(Path(basepath).resolve(), organism)
168    _log.info("Successfully set up genomics data for %s in %s", organism, basepath)

Set up genomics data for a specific organism.

Arguments:

basepath: Base directory for storing genomic data.
organism: Organism identifier (e.g. "hg38" or "mm39"). Uses the configured default if omitted.
force: Overwrite existing data.
pickup: Resume an interrupted setup, reusing any already-downloaded files.
n_jobs: Workers for GTF parsing (default: CPU count - 1). 1 is serial.
from_prebuilt: When True (default), download prebuilt genes.db + FASTA from S3. When False, download GTF/FASTA from Ensembl/UCSC and build genes.db locally.

def set_fasta_path(fasta_path: str, organism: Optional[str] = None) -> None: View Source

def set_fasta_path(fasta_path: str, organism: Optional[str] = None) -> None:
    """Record the full-genome FASTA location for an organism in the saved config.

    Handy when a FASTA is already on disk but the full setup has not been run,
    or when SeqMat should use a different reference build.

    Args:
        fasta_path: Path to an existing FASTA file.
        organism: Organism whose config entry is updated (default organism if None).

    Raises:
        ValueError: when ``fasta_path`` does not exist.
    """
    target = organism if organism is not None else get_default_organism()

    fasta = Path(fasta_path)
    if not fasta.exists():
        raise ValueError(f"FASTA file not found: {fasta}")

    settings = load_config()
    # Create the organism's section on first use, then set the one key.
    section = settings.setdefault(target, {})
    section["fasta_full_genome"] = str(fasta)
    save_config(settings)
    _log.info("Set fasta_full_genome for %s to: %s (config: %s)", target, fasta, CONFIG_FILE)

Set the full genome FASTA path for an organism in the saved config.

Useful when you have a FASTA but haven't run the full setup, or need to point SeqMat at a different reference build.

def list_available_organisms() -> List[str]: View Source

def list_available_organisms() -> List[str]:
    """Return the IDs of the organisms that are currently configured/installed."""
    organisms = get_available_organisms()
    return organisms

Organism IDs that are currently configured/installed.

def list_supported_organisms() -> List[str]: View Source

def list_supported_organisms() -> List[str]:
    """Back-compat alias of :func:`list_available_organisms`; returns the same IDs."""
    organisms = get_available_organisms()
    return organisms

Alias of list_available_organisms() (kept for back-compat).

def get_organism_info(organism: str) -> Dict[str, Any]: View Source

def get_organism_info(organism: str) -> Dict[str, Any]:
    """Describe one organism: its configured paths and the data found for it.

    Returns:
        ``{"error": ...}`` when the organism is not configured. Otherwise a dict
        with ``organism``, ``configured``, ``paths`` (config values as strings)
        and ``data_available``, which may hold ``biotypes``, ``gene_counts``
        and ``chromosomes`` (stems of ``*.fasta`` files under ``CHROM_SOURCE``).
    """
    try:
        org_config = get_organism_config(organism)
    except ValueError:
        return {"error": f"Organism '{organism}' not configured"}

    path_strings = {key: str(value) for key, value in org_config.items()}
    available: Dict[str, Any] = {}

    gene_counts = count_genes(organism)
    if gene_counts:
        available["biotypes"] = sorted(gene_counts.keys())
        available["gene_counts"] = gene_counts

    chrom_source = org_config.get("CHROM_SOURCE")
    if chrom_source and Path(chrom_source).exists():
        stems = [fasta.stem for fasta in Path(chrom_source).glob("*.fasta")]
        if stems:
            available["chromosomes"] = stems

    return {
        "organism": organism,
        "configured": True,
        "paths": path_strings,
        "data_available": available,
    }

Detailed information about an organism: paths + biotype-keyed gene counts.

def list_gene_biotypes(organism: Optional[str] = None) -> List[str]: View Source

def list_gene_biotypes(organism: Optional[str] = None) -> List[str]:
    """Return the distinct, non-empty ``biotype`` values of the ``genes`` table, sorted.

    An empty list is returned when no database connection is available.
    """
    db = _connect(organism)
    if db is None:
        return []
    try:
        cursor = db.execute("SELECT DISTINCT biotype FROM genes ORDER BY biotype")
        found = cursor.fetchall()
    finally:
        # Always release the connection, even if the query fails.
        db.close()

    biotypes: List[str] = []
    for record in found:
        value = record[0]
        if value:  # skip NULL / empty biotypes
            biotypes.append(value)
    return biotypes

List distinct gene_biotype values present in genes.db.

def count_genes( organism: Optional[str] = None, biotype: Optional[str] = None) -> Dict[str, int]: View Source

def count_genes(organism: Optional[str] = None, biotype: Optional[str] = None) -> Dict[str, int]:
    """Count genes, either for one biotype or grouped over all of them.

    With ``biotype`` given the result is ``{biotype: count}`` for just that
    biotype. Without it, every non-empty biotype maps to its gene count. An
    empty dict is returned when no database connection is available.
    """
    db = _connect(organism)
    if db is None:
        return {}
    try:
        if not biotype:
            grouped = db.execute(
                "SELECT biotype, COUNT(*) FROM genes GROUP BY biotype ORDER BY biotype"
            ).fetchall()
        else:
            hit = db.execute(
                "SELECT COUNT(*) FROM genes WHERE biotype = ?", (biotype,)
            ).fetchone()
            if hit:
                return {biotype: int(hit[0])}
            return {biotype: 0}
    finally:
        # Runs on every path, including the early returns above.
        db.close()

    totals: Dict[str, int] = {}
    for name, total in grouped:
        if name:  # rows with a NULL / empty biotype are left out
            totals[name] = int(total)
    return totals

Count genes per biotype.

If biotype is given, returns {biotype: count} for that one biotype. Otherwise returns a dict of all biotypes -> counts.

def get_gene_list( organism: Optional[str] = None, biotype: Optional[str] = None, limit: Optional[int] = None) -> List[str]: View Source

 87def get_gene_list(
 88    organism: Optional[str] = None,
 89    biotype: Optional[str] = None,
 90    limit: Optional[int] = None,
 91) -> List[str]:
 92    """List gene names (skipping empty Ensembl symbols), optionally filtered by biotype."""
 93    conn = _connect(organism)
 94    if conn is None:
 95        return []
 96    sql = "SELECT gene_name FROM genes WHERE gene_name != ''"
 97    params: tuple = ()
 98    if biotype:
 99        sql += " AND biotype = ?"
100        params = (biotype,)
101    sql += " ORDER BY gene_name"
102    if limit:
103        sql += " LIMIT ?"
104        params = (*params, int(limit))
105    try:
106        rows = conn.execute(sql, params).fetchall()
107    finally:
108        conn.close()
109    return [r[0] for r in rows]

List gene names (skipping empty Ensembl symbols), optionally filtered by biotype.

def get_all_genes( organism: Optional[str] = None, biotype: Optional[str] = None) -> List[Dict[str, str]]: View Source

def get_all_genes(
    organism: Optional[str] = None,
    biotype: Optional[str] = None,
) -> List[Dict[str, str]]:
    """List every gene as ``{organism, biotype, gene_name, gene_id}``, ordered by name.

    When ``organism`` is falsy the connection is opened for the default
    organism and each record's ``organism`` field carries the literal label
    ``"<default>"``. An empty list is returned when no database connection is
    available.
    """
    label = organism or "<default>"
    db = _connect(None if label == "<default>" else label)
    if db is None:
        return []

    if biotype:
        statement = "SELECT gene_name, gene_id, biotype FROM genes" + " WHERE biotype = ?"
        bound: tuple = (biotype,)
    else:
        statement = "SELECT gene_name, gene_id, biotype FROM genes"
        bound = ()
    statement += " ORDER BY gene_name"

    try:
        found = db.execute(statement, bound).fetchall()
    finally:
        db.close()

    genes: List[Dict[str, str]] = []
    for gene_name, gene_id, gene_biotype in found:
        genes.append(
            {"organism": label, "biotype": gene_biotype, "gene_name": gene_name, "gene_id": gene_id}
        )
    return genes

Return all genes for an organism as [{organism, biotype, gene_name, gene_id}, ...].

def data_summary() -> Dict[str, Any]: View Source

def data_summary() -> Dict[str, Any]:
    """Collect an overview of the data for every configured organism.

    The result has four keys: ``supported_organisms`` and
    ``configured_organisms`` (both currently the list returned by
    ``list_available_organisms``), ``organisms`` (per-organism info, or an
    ``{"error": ...}`` entry when ``get_organism_info`` raises), and
    ``totals`` (organism, distinct-biotype, and gene counts).
    """
    configured = list_available_organisms()

    per_organism: Dict[str, Any] = {}
    for name in configured:
        try:
            per_organism[name] = get_organism_info(name)
        except Exception as exc:  # pragma: no cover - defensive
            per_organism[name] = {"error": f"Configuration error: {exc}"}

    # Aggregate gene counts; entries without gene counts contribute nothing.
    seen_biotypes: set = set()
    gene_total = 0
    for details in per_organism.values():
        counts = details.get("data_available", {}).get("gene_counts", {})
        gene_total = sum(counts.values(), gene_total)
        seen_biotypes.update(counts)

    return {
        "supported_organisms": configured,
        "configured_organisms": configured,
        "organisms": per_organism,
        "totals": {
            "organisms": len(configured),
            "biotypes": len(seen_biotypes),
            "genes": gene_total,
        },
    }

A complete data overview across every configured organism.

def print_data_summary() -> None: View Source

def print_data_summary() -> None:
    """Print a human-readable report of the installed genomics data to stdout.

    The report is built from ``data_summary()`` and has three parts: a totals
    line, one status line per organism (the union of ``DEFAULT_ORGANISM_DATA``
    keys and the configured organisms), and a per-organism section listing
    gene counts by biotype, a preview of up to five chromosomes, and whether
    each recorded path exists on disk.
    """
    from .config import DEFAULT_ORGANISM_DATA

    report = data_summary()
    print("SeqMat Genomics Data Summary")
    print("=" * 40)
    counts = report["totals"]
    print(
        f"Total: {counts['organisms']} organism(s), "
        f"{counts['biotypes']} biotype(s), {counts['genes']:,} gene(s)\n"
    )

    print("Configured organisms:")
    configured_names = set(report["configured_organisms"])
    for key in sorted(DEFAULT_ORGANISM_DATA.keys() | configured_names):
        if key in configured_names:
            state = "configured"
        else:
            state = "not configured"
        display = DEFAULT_ORGANISM_DATA.get(key, {}).get("name", key)
        print(f"  - {key}: {display}  [{state}]")
    print()

    for key, details in report["organisms"].items():
        print(f"{key.upper()} data:")
        if "error" in details:
            # Failed organisms get one error line and no trailing blank line.
            print(f"  error: {details['error']}")
            continue

        available = details.get("data_available", {})
        if "gene_counts" in available:
            print("  Gene types:")
            for kind, n_genes in sorted(available["gene_counts"].items()):
                print(f"    {kind}: {n_genes:,}")
        if "chromosomes" in available:
            chrom_names = available["chromosomes"]
            shown = ", ".join(sorted(chrom_names)[:5])
            if len(chrom_names) > 5:
                shown += "..."
            print(f"  Chromosomes ({len(chrom_names)}): {shown}")

        print("  Paths:")
        for label, location in details["paths"].items():
            mark = "✓" if Path(location).exists() else "✗"
            print(f"    {mark} {label}: {location}")
        print()

Print a formatted summary of all installed genomics data.

def search_genes( organism: Optional[str] = None, query: str = '', biotype: Optional[str] = None, limit: int = 10) -> List[Dict[str, str]]: View Source

def search_genes(
    organism: Optional[str] = None,
    query: str = "",
    biotype: Optional[str] = None,
    limit: int = 10,
) -> List[Dict[str, str]]:
    """Search genes by ``gene_name`` or ``gene_id`` substring (case-insensitive).

    ``query`` is matched literally: the SQL ``LIKE`` metacharacters ``%`` and
    ``_`` (and the escape character ``\\``) are escaped, so a query such as
    ``"_"`` only finds identifiers that actually contain an underscore.

    Returns at most ``limit`` records shaped like
    ``{organism, biotype, gene_name, gene_id}``, ordered by ``gene_name``.
    An empty ``query`` or an unavailable database yields ``[]``. When
    ``organism`` is falsy, records are labelled ``"<default>"``.
    """
    if not query:
        return []
    conn = _connect(organism)
    if conn is None:
        return []
    try:
        # Escape the backslash first so the escapes added for % and _ are not
        # themselves doubled.
        escaped = (
            query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        )
        pattern = f"%{escaped}%"
        sql = (
            "SELECT gene_name, gene_id, biotype FROM genes "
            "WHERE (UPPER(gene_name) LIKE UPPER(?) ESCAPE '\\' "
            "OR UPPER(gene_id) LIKE UPPER(?) ESCAPE '\\')"
        )
        params: tuple = (pattern, pattern)
        if biotype:
            sql += " AND biotype = ?"
            params = (*params, biotype)
        sql += " ORDER BY gene_name LIMIT ?"
        # int() runs inside the try so a bad limit cannot leak the connection.
        params = (*params, int(limit))
        rows = conn.execute(sql, params).fetchall()
    finally:
        conn.close()
    org_label = organism or "<default>"
    return [
        {"organism": org_label, "biotype": bt, "gene_name": gn, "gene_id": gid}
        for gn, gid, bt in rows
    ]

Search genes by gene_name or gene_id substring (case-insensitive).

def available_genes( organism: str = 'hg38', biotype: Optional[str] = 'protein_coding') -> Iterator[str]: View Source

def available_genes(organism: str = "hg38", biotype: Optional[str] = "protein_coding") -> Iterator[str]:
    """Lazily yield the distinct, non-empty gene symbols stored for an organism.

    Symbols come back sorted by name. The default restricts output to
    protein-coding genes (matching the legacy filesystem-backed behavior);
    pass ``biotype=None`` to stream every biotype. Nothing is yielded when
    ``_connect`` returns no connection. The connection is closed once the
    generator is exhausted or closed.
    """
    db = _connect(organism)
    if db is None:
        return

    condition = "gene_name != ''"
    bindings: tuple = ()
    if biotype is not None:
        condition += " AND biotype = ?"
        bindings = (biotype,)
    statement = (
        f"SELECT DISTINCT gene_name FROM genes WHERE {condition}"
        " ORDER BY gene_name"
    )

    try:
        cursor = db.execute(statement, bindings)
        for (symbol,) in cursor:
            yield symbol
    finally:
        db.close()

Yield distinct gene symbols installed for an organism.

Defaults to protein-coding genes (matches the legacy filesystem-backed behavior). Pass biotype=None to stream every biotype. Empty Ensembl symbols are skipped.

def gene_names_at_position( chrm: str, pos: Union[int, Tuple[int, int], List[int]], organism: Optional[str] = None) -> List[str]: View Source

def gene_names_at_position(
    chrm: str, pos: PosArg, organism: Optional[str] = None
) -> List[str]:
    """Return the names of genes overlapping a point or range on ``chrm``.

    Name-only lookup: the query is answered by the organism's location index
    via ``query_names``, with no BLOB load.
    """
    location_index = _get_index(organism)
    return location_index.query_names(chrm, pos)

Fast name-only lookup: gene names overlapping a point or range. No BLOB load.

def build_location_index(organism: Optional[str] = None, force: bool = False) -> pathlib.Path: View Source

def build_location_index(organism: Optional[str] = None, force: bool = False) -> Path:
    """Ensure the on-disk location index for an organism exists and return its path.

    If the sidecar file already exists and ``force`` is false, the index is
    loaded through ``_get_index``; otherwise ``_get_index`` is called with
    ``rebuild=True``. In both cases any cached index for the organism is
    dropped first. ``organism`` defaults to ``get_default_organism()``.
    """
    if organism is None:
        organism = get_default_organism()
    sidecar = _locations_path(organism)
    reuse_existing = sidecar.exists() and not force

    # Drop the cached entry so _get_index cannot return a stale instance.
    _INDEX_CACHE.pop(organism, None)
    if reuse_existing:
        _get_index(organism)
    else:
        _get_index(organism, rebuild=True)
    return sidecar

Build (or rebuild) the on-disk location index for an organism.

Returns the sidecar path. Use force=True to overwrite an existing index.

def build_lmdb( annotations_dir: Optional[str] = None, output_path: Optional[str] = None, organism: Optional[str] = None) -> str: View Source

def build_lmdb(
    annotations_dir: Optional[str] = None,
    output_path: Optional[str] = None,
    organism: Optional[str] = None,
) -> str:
    """Build LMDB from per-gene pickle files. Requires pip install seqmat[lmdb].

    Every ``*.pkl`` file found recursively under ``annotations_dir`` is stored
    as raw bytes under the gene name derived from its file stem. Files that
    fail to read or write are logged and skipped rather than aborting the
    build.

    Args:
        annotations_dir: Directory holding the pickles. Defaults to the
            organism's configured ``MRNA_PATH``.
        output_path: Destination of the LMDB environment. Defaults to
            ``<annotations_dir>/genes.lmdb``. An existing directory at this
            path is removed first.
        organism: Organism key; defaults to ``get_default_organism()``.

    Returns:
        The output path as a string.

    Raises:
        FileNotFoundError: If the annotations directory does not exist or
            contains no ``.pkl`` files.
    """
    _require_lmdb()
    if organism is None:
        organism = get_default_organism()
    if annotations_dir is None:
        config = get_organism_config(organism)
        annotations_dir = str(config["MRNA_PATH"])
    ann_path = Path(annotations_dir)
    if not ann_path.exists():
        raise FileNotFoundError(f"Annotations directory not found: {ann_path}")
    if output_path is None:
        output_path = str(ann_path / "genes.lmdb")
    out = Path(output_path)
    if out.exists():
        shutil.rmtree(str(out))
    pkl_files = sorted(ann_path.glob("**/*.pkl"))
    if not pkl_files:
        raise FileNotFoundError(f"No .pkl files found under {ann_path}")
    # Size the map at 1.5x the raw payload plus 10 MiB of headroom.
    total_bytes = sum(f.stat().st_size for f in pkl_files)
    map_size = int(total_bytes * 1.5) + 10 * 1024 * 1024
    env = _lmdb.open(str(out), map_size=map_size, max_dbs=0)
    genes_written = 0
    skipped = 0
    total_size = 0
    try:
        with env.begin(write=True) as txn:
            for pkl_file in pkl_files:
                try:
                    raw_bytes = pkl_file.read_bytes()
                    gene_name = _gene_name_from_pkl_stem(pkl_file.stem)
                    txn.put(gene_name.encode("utf-8"), raw_bytes)
                    genes_written += 1
                    total_size += len(raw_bytes)
                except Exception as exc:
                    # Deliberate best-effort: one bad file must not sink the build.
                    _log.warning("Skipped %s: %s", pkl_file.name, exc)
                    skipped += 1
    finally:
        # Release the environment even if the transaction or its commit fails.
        env.close()
    _log.info(
        "LMDB built at %s — %s genes, %.1f MB%s",
        out, f"{genes_written:,}", total_size / (1024 * 1024),
        f", {skipped} skipped" if skipped else "",
    )
    return str(out)

Build LMDB from per-gene pickle files. Requires pip install seqmat[lmdb].

class SeqMatError(builtins.Exception): View Source

class SeqMatError(Exception):
    """Root of the SeqMat exception hierarchy.

    Every SeqMat-specific exception derives from this class.
    """

Base class for all SeqMat exceptions.

class GeneNotFoundError(seqmat.SeqMatError, builtins.LookupError): View Source

class GeneNotFoundError(SeqMatError, LookupError):
    """A gene could not be located in the configured organism's database.

    Also a ``LookupError``, so ``except LookupError`` handlers catch it.
    """

Raised when a gene cannot be located in the configured organism's database.

class OrganismNotConfiguredError(seqmat.SeqMatError, builtins.ValueError): View Source

class OrganismNotConfiguredError(SeqMatError, ValueError):
    """An organism has no data set up (run ``seqmat setup``).

    Also a ``ValueError``, so ``except ValueError`` handlers catch it.
    """

Raised when an organism has no data set up (run seqmat setup).

class DataUnavailableError(seqmat.SeqMatError, builtins.RuntimeError): View Source

class DataUnavailableError(SeqMatError, RuntimeError):
    """Prebuilt data (genes.db, FASTA, etc.) could not be obtained.

    Also a ``RuntimeError``, so ``except RuntimeError`` handlers catch it.
    """

Raised when prebuilt data (genes.db, FASTA, etc.) cannot be obtained.

PrebuiltDataUnavailableError = <class 'DataUnavailableError'>