diff --git a/modkit-core/src/pileup/base_mods_adapter.rs b/modkit-core/src/pileup/base_mods_adapter.rs index fa4aee1..702e257 100644 --- a/modkit-core/src/pileup/base_mods_adapter.rs +++ b/modkit-core/src/pileup/base_mods_adapter.rs @@ -74,7 +74,7 @@ impl<'a, const SIZE: usize> BaseModsAdapter<'a, SIZE> { b'C' | b'c' => agg[2] += 1u32, b'G' | b'g' => agg[1] += 1u32, b'T' | b't' => agg[0] += 1u32, - _ => unreachable!(), + _ => {} } agg }, @@ -155,7 +155,7 @@ impl<'a, const SIZE: usize> BaseModsAdapter<'a, SIZE> { b'C' => freq[1], b'G' => freq[2], b'T' => freq[3], - _ => unreachable!(), + _ => bail!("invalid canonical base in MM tag: {base}"), }; let remainder = total .checked_sub(count) @@ -278,7 +278,7 @@ impl<'a, const SIZE: usize> BaseModsAdapter<'a, SIZE> { } } let canonical_qual = 255u8.checked_sub(total_mod_qual).unwrap(); - let primary_base = DnaBase::parse(base as char).unwrap(); + let primary_base = DnaBase::parse(base as char)?; let threshold = filter_thresholds[primary_base as usize]; let mod_state = if canonical_qual > mod_qual { #[cfg(test)] @@ -475,7 +475,7 @@ fn base_complement(base: u8) -> u8 { b'C' => b'G', b'G' => b'C', b'T' => b'A', - _ => panic!("not allowed base"), + _ => base, } } @@ -885,4 +885,76 @@ mod base_mods_adapter_tests { // - [x] Empty // - [x] Multi-base // - [ ] Remaining edge cases from htslib + + #[test] + fn test_base_complement_n_base() { + assert_eq!(super::base_complement(b'A'), b'T'); + assert_eq!(super::base_complement(b'T'), b'A'); + assert_eq!(super::base_complement(b'C'), b'G'); + assert_eq!(super::base_complement(b'G'), b'C'); + // N bases should pass through without panicking + assert_eq!(super::base_complement(b'N'), b'N'); + } + + #[test] + fn test_forward_with_n_bases() { + // N bases interspersed in the sequence should not affect + // modification calls on valid positions + let seq = "ANTCATNCATTCCTACCGCTATAGCCT"; + // 0123456789... + // C positions: 3, 7, 11, 15, 16, 18, 24, 25 + // The MM tag skips 2 C's, then calls next C, then skips 1 + let mm = "C+mh,2,0,1;"; + let ml = vec![200, 10, 50, 170, 160, 20]; + let record = make_record(mm, &ml, seq, None, false); + let thresholds = [0f32; 4]; + let mut scanner = BaseModsAdapter::<2>::new(&record).unwrap(); + // Should still be able to iterate mod positions without panicking + let mod_positions: Vec = std::iter::from_fn(|| { + scanner.next_modified_position(thresholds, &[]).unwrap() + }) + .filter(|ms| !ms.inferred) + .map(|ms| ms.mod_position) + .collect(); + // Should find modification positions on C bases, not N bases + assert!(!mod_positions.is_empty()); + for pos in &mod_positions { + assert_ne!(seq.as_bytes()[*pos], b'N'); + } + } + + #[test] + fn test_reverse_with_n_bases() { + // Reverse strand with N bases in the sequence should not panic + // during frequency counting + let mm = "C+m.,0,0;C+h.,0,0;"; + let seq = "CGNGATGGANTC"; + let ml = vec![100, 25, 101, 24]; + let record = make_record(mm, &ml, seq, None, true); + + let mut scanner = BaseModsAdapter::<2>::new(&record).unwrap(); + let thresholds = [0f32; 4]; + // Should not panic; N bases are skipped in frequency counting + let mod_positions: Vec = std::iter::from_fn(|| { + scanner.next_modified_position(thresholds, &[]).unwrap() + }) + .map(|ms| ms.mod_position) + .collect(); + assert!(!mod_positions.is_empty()); + } + + #[test] + fn test_all_n_bases_reverse() { + // A sequence of all N bases on the reverse strand should not + // panic during frequency counting, and should produce no + // modification calls + let mm = "C+m?;"; + let seq = "NNNNNNNN"; + let ml = vec![]; + let record = make_record(mm, &ml, seq, None, true); + let mut scanner = BaseModsAdapter::<1>::new(&record).unwrap(); + let thresholds = [0f32; 4]; + let result = scanner.next_modified_position(thresholds, &[]).unwrap(); + assert!(result.is_none()); + } } diff --git a/modkit-core/src/pileup/pileup_processor.rs b/modkit-core/src/pileup/pileup_processor.rs index d9c2a86..a3b8882 100644 --- a/modkit-core/src/pileup/pileup_processor.rs +++ b/modkit-core/src/pileup/pileup_processor.rs @@ -343,21 +343,20 @@ impl< break 'overran; } else { assert!(pos > q); - let base = { - let tmp = DnaBase::try_from( - record.seq()[q], - ) - .unwrap(); - if record.is_reverse() { - tmp.complement() - } else { - tmp - } - }; - self.matrix.incr_diff_call( - rpos, base, ref_base, reverse, - hp, - ); + if let Ok(tmp) = DnaBase::try_from( + record.seq()[q], + ) { + let base = + if record.is_reverse() { + tmp.complement() + } else { + tmp + }; + self.matrix.incr_diff_call( + rpos, base, ref_base, + reverse, hp, + ); + } mod_state = Some(ms); break 'overran; } @@ -369,19 +368,18 @@ impl< continue 'records; } Ok(None) => { - let base = { - let tmp = - DnaBase::try_from(record.seq()[q]) - .unwrap(); - if record.is_reverse() { + if let Ok(tmp) = + DnaBase::try_from(record.seq()[q]) + { + let base = if record.is_reverse() { tmp.complement() } else { tmp - } - }; - self.matrix.incr_diff_call( - rpos, base, ref_base, reverse, hp, - ); + }; + self.matrix.incr_diff_call( + rpos, base, ref_base, reverse, hp, + ); + } mod_state = None; break 'overran; } @@ -393,14 +391,13 @@ impl< self.matrix.incr_delete(rpos, reverse, hp); } (Some(q), None) => { - let base = { - let tmp = - DnaBase::try_from(record.seq()[q]).unwrap(); - if record.is_reverse() { - tmp.complement() - } else { - tmp - } + let Ok(tmp) = DnaBase::try_from(record.seq()[q]) else { + continue 'pileup; + }; + let base = if record.is_reverse() { + tmp.complement() + } else { + tmp }; self.matrix .incr_diff_call(rpos, base, ref_base, reverse, hp); @@ -871,14 +868,13 @@ impl PileupWorker for GenericPileupWorker { ); } (Some(q), None) => { - let base = { - let tmp = - DnaBase::try_from(record.seq()[q]).unwrap(); - if record.is_reverse() { - tmp.complement() - } else { - tmp - } + let Ok(tmp) = DnaBase::try_from(record.seq()[q]) else { + continue 'pileup; + }; + let base = if record.is_reverse() { + tmp.complement() + } else { + tmp }; if implicit_bases.contains(&base) { add_to_tally(