diff --git a/chorus/core/interval.py b/chorus/core/interval.py index d6f1033..a15a547 100644 --- a/chorus/core/interval.py +++ b/chorus/core/interval.py @@ -49,8 +49,22 @@ def sequence(self) -> str: def slop(self, extension_needed: int, how: str = 'both') -> 'GenomeRef': with pysam.FastaFile(self.fasta) as fasta: - # Get chromosome length - chrom_length = fasta.get_reference_length(self.chrom) + # Get chromosome length. pysam raises KeyError(chrom) if the + # reference doesn't contain this contig — catch and re-raise + # as IntervalException so users get an actionable message + # (naming the bad chromosome and the fasta path) instead of + # a low-level pysam traceback. Without this, calling + # oracle.predict(('chrZZ', ...)) crashes deep inside the + # oracle's one-hot encoder with KeyError: 'H' (or similar), + # which tells the user nothing about what went wrong. + try: + chrom_length = fasta.get_reference_length(self.chrom) + except KeyError: + raise IntervalException( + f"Chromosome {self.chrom!r} not found in {self.fasta}. " + f"Check that the chromosome name matches the reference " + f"(hg38 uses 'chr1'..'chr22', 'chrX', 'chrY', 'chrM')." + ) if how == 'left': extend_left = extension_needed elif how == 'right': diff --git a/scripts/build_backgrounds_borzoi.py b/scripts/build_backgrounds_borzoi.py index 4e6200e..4967903 100644 --- a/scripts/build_backgrounds_borzoi.py +++ b/scripts/build_backgrounds_borzoi.py @@ -1,7 +1,7 @@ """Build per-track background distributions for Borzoi. Produces ``borzoi_pertrack.npz`` with three CDF matrices (effect, summary, -perbin) per track for all 7,612 Borzoi tracks: CAGE, RNA, DNASE, ATAC, +perbin) per track for all 7,611 Borzoi tracks: CAGE, RNA, DNASE, ATAC, CHIP-TF, CHIP-Histone. 
RNA-seq tracks use **exon-precise sampling**: only bins overlapping diff --git a/tests/test_prediction_methods.py b/tests/test_prediction_methods.py index cd1f60a..f5e550c 100644 --- a/tests/test_prediction_methods.py +++ b/tests/test_prediction_methods.py @@ -251,6 +251,40 @@ def test_variant_position_is_1_based(self, caplog): shutil.rmtree(tmp) + def test_bad_chromosome_gives_actionable_error(self): + """A chromosome not in the reference FASTA must fail with a + message that names the bad chrom and the FASTA path — not a + low-level pysam KeyError or a downstream one-hot-encoder + KeyError('H'). + + Regression for v20 §14.4 finding: + oracle.predict(('chrZZ', 100, 300), [...]) used to crash + deep in LegNet's transforms with KeyError: 'H'. + + MockOracle._predict short-circuits the input to random data so + we exercise the chokepoint (GenomeRef.slop → pysam) directly + plus the predict_variant_effect path which does go through + real region_interval[...] indexing. + """ + from chorus.core.interval import GenomeRef, IntervalException + from chorus.core.exceptions import InvalidRegionError + + # Path A: GenomeRef.slop — the actual crash site before the fix + gr = GenomeRef(chrom="chrZZ", start=100, end=300, + fasta=str(self.fasta_path)) + with pytest.raises(IntervalException, match="Chromosome 'chrZZ' not found"): + gr.slop(extension_needed=1000, how="both") + + # Path B: predict_variant_effect(string) — goes through + # extract_sequence → raises InvalidRegionError + with pytest.raises(InvalidRegionError, match="[Cc]hromosome.*chrZZ.*not found"): + self.oracle.predict_variant_effect( + genomic_region="chrZZ:100-300", + variant_position="chrZZ:150", + alleles=["A", "C"], + assay_ids=["DNase:K562"], + ) + def test_error_handling_model_not_loaded(self): """Test error when model not loaded.""" unloaded_oracle = MockOracle(reference_fasta=str(self.fasta_path))