Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions chorus/core/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,22 @@ def sequence(self) -> str:

def slop(self, extension_needed: int, how: str = 'both') -> 'GenomeRef':
with pysam.FastaFile(self.fasta) as fasta:
# Get chromosome length
chrom_length = fasta.get_reference_length(self.chrom)
# Get chromosome length. pysam raises KeyError(chrom) if the
# reference doesn't contain this contig — catch and re-raise
# as IntervalException so users get an actionable message
# (naming the bad chromosome and the fasta path) instead of
# a low-level pysam traceback. Without this, calling
# oracle.predict(('chrZZ', ...)) crashes deep inside the
# oracle's one-hot encoder with KeyError: 'H' (or similar),
# which tells the user nothing about what went wrong.
try:
chrom_length = fasta.get_reference_length(self.chrom)
except KeyError:
raise IntervalException(
f"Chromosome {self.chrom!r} not found in {self.fasta}. "
f"Check that the chromosome name matches the reference "
f"(hg38 uses 'chr1'..'chr22', 'chrX', 'chrY', 'chrM')."
)
if how == 'left':
extend_left = extension_needed
elif how == 'right':
Expand Down
2 changes: 1 addition & 1 deletion scripts/build_backgrounds_borzoi.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Build per-track background distributions for Borzoi.

Produces ``borzoi_pertrack.npz`` with three CDF matrices (effect, summary,
perbin) per track for all 7,612 Borzoi tracks: CAGE, RNA, DNASE, ATAC,
perbin) per track for all 7,611 Borzoi tracks: CAGE, RNA, DNASE, ATAC,
CHIP-TF, CHIP-Histone.

RNA-seq tracks use **exon-precise sampling**: only bins overlapping
Expand Down
34 changes: 34 additions & 0 deletions tests/test_prediction_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,40 @@ def test_variant_position_is_1_based(self, caplog):

shutil.rmtree(tmp)

def test_bad_chromosome_gives_actionable_error(self):
    """A chromosome missing from the reference FASTA must fail clearly.

    The failure should be an error that names the bad chromosome, not
    a bare ``KeyError`` bubbling up from pysam or a downstream
    one-hot-encoder ``KeyError('H')``.

    Regression for v20 §14.4 finding:
    oracle.predict(('chrZZ', 100, 300), [...]) used to crash
    deep in LegNet's transforms with KeyError: 'H'.

    MockOracle._predict short-circuits the input to random data, so
    we exercise the chokepoint (GenomeRef.slop → pysam) directly,
    plus the predict_variant_effect path, which does go through
    real region_interval[...] indexing.

    Two paths are checked:

    * Path A: ``GenomeRef.slop`` must raise ``IntervalException`` whose
      message matches ``Chromosome 'chrZZ' not found``.
    * Path B: ``predict_variant_effect`` with string coordinates must
      raise ``InvalidRegionError`` mentioning the chromosome and
      "not found".

    Only the chromosome name and the "not found" wording are asserted;
    the FASTA path in the message is not checked here.
    """
    # Imported locally so the rest of the test module does not depend
    # on these names.
    from chorus.core.interval import GenomeRef, IntervalException
    from chorus.core.exceptions import InvalidRegionError

    # Path A: GenomeRef.slop — the actual crash site before the fix
    gr = GenomeRef(chrom="chrZZ", start=100, end=300,
                   fasta=str(self.fasta_path))
    with pytest.raises(IntervalException, match="Chromosome 'chrZZ' not found"):
        gr.slop(extension_needed=1000, how="both")

    # Path B: predict_variant_effect(string) — goes through
    # extract_sequence → raises InvalidRegionError
    with pytest.raises(InvalidRegionError, match="[Cc]hromosome.*chrZZ.*not found"):
        self.oracle.predict_variant_effect(
            genomic_region="chrZZ:100-300",
            variant_position="chrZZ:150",
            alleles=["A", "C"],
            assay_ids=["DNase:K562"],
        )

def test_error_handling_model_not_loaded(self):
"""Test error when model not loaded."""
unloaded_oracle = MockOracle(reference_fasta=str(self.fasta_path))
Expand Down
Loading