Skip to content

Commit b2bb023

Browse files
authored
Errors out when the chain map validator finds an issue (#92)
The leviosam2 algorithm does not support the case when two or more intervals in the source genome overlap. Previously an error message was displayed, but the program could still proceed. In this PR, we update the program behavior to error out when validation fails. The below shows the functional change. Other changed lines are tests and docs. ``` if (!validate_intervals()) { std::cerr << "[E::chain::build] Interval validation failed\n"; exit(1); } ```
1 parent 93a192e commit b2bb023

3 files changed

Lines changed: 188 additions & 10 deletions

File tree

src/chain.cpp

Lines changed: 82 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,14 @@
1919
#include "leviosam_utils.hpp"
2020

2121
namespace chain {
22+
/**
23+
* @brief Default constructor for Interval.
24+
*
25+
* Creates an empty interval with default values:
26+
* - Empty target string
27+
* - Zero offset and coordinates
28+
* - Positive strand (true)
29+
*/
2230
Interval::Interval() {
2331
target = "";
2432
offset = 0;
@@ -27,6 +35,18 @@ Interval::Interval() {
2735
strand = true;
2836
}
2937

38+
/**
39+
* @brief Parameterized constructor for Interval.
40+
*
41+
* Creates an interval with specified alignment coordinates and properties.
42+
* An interval represents a gapless alignment segment between source and target references.
43+
*
44+
* @param t Target reference/contig name
45+
* @param so Source start position (0-based)
46+
* @param se Source end position (0-based, exclusive)
47+
* @param o Offset to convert source positions to target positions
48+
* @param ss Strand orientation (true = forward/+, false = reverse/-)
49+
*/
3050
Interval::Interval(std::string t, int32_t so, int32_t se, int32_t o, bool ss) {
3151
target = t;
3252
offset = o;
@@ -35,6 +55,14 @@ Interval::Interval(std::string t, int32_t so, int32_t se, int32_t o, bool ss) {
3555
strand = ss;
3656
}
3757

58+
/**
59+
* @brief Constructs an Interval by loading data from a binary input stream.
60+
*
61+
* This constructor loads a previously serialized Interval from a binary stream.
62+
* The data must have been written using the serialize() method.
63+
*
64+
* @param in Input stream containing the serialized Interval data
65+
*/
3866
Interval::Interval(std::ifstream &in) { load(in); }
3967

4068
void Interval::debug_print_interval() {
@@ -84,11 +112,39 @@ void Interval::load(std::istream &in) {
84112
strand = load_strand;
85113
}
86114

115+
/**
116+
* @brief Constructs a ChainMap by loading a pre-built index from an input stream.
117+
*
118+
* This constructor loads a previously serialized ChainMap index from a binary stream.
119+
* The index must have been created using the serialize() method.
120+
*
121+
* @param in Input stream containing the serialized ChainMap data
122+
* @param verbose Verbosity level (0=quiet, higher=more verbose)
123+
* @param allowed_intvl_gaps Maximum allowed gaps between intervals for lift-over
124+
*/
87125
ChainMap::ChainMap(std::ifstream &in, int verbose, int allowed_intvl_gaps)
88126
: verbose(verbose), allowed_intvl_gaps(allowed_intvl_gaps) {
89127
load(in);
90128
}
91129

130+
/**
131+
* @brief Constructs a ChainMap by parsing a chain file and building the index.
132+
*
133+
* This constructor reads a chain file (typically from UCSC's liftOver tool) and
134+
* builds an efficient index for performing lift-over operations. The chain file
135+
* contains pairwise alignments between source and target reference sequences.
136+
*
137+
* The constructor:
138+
* - Parses chain file format and extracts alignment intervals
139+
* - Builds bit vectors for efficient interval queries
140+
* - Sorts intervals and validates for overlaps
141+
* - Exits with error if interval sanity check fails
142+
*
143+
* @param fname Path to the chain file to parse
144+
* @param verbose Verbosity level (0=quiet, higher=more verbose)
145+
* @param allowed_intvl_gaps Maximum allowed gaps between intervals for lift-over
146+
* @param lm Length map containing chromosome/contig lengths for validation
147+
*/
92148
ChainMap::ChainMap(std::string fname, int verbose, int allowed_intvl_gaps,
93149
LengthMap &lm)
94150
: verbose(verbose), allowed_intvl_gaps(allowed_intvl_gaps), length_map(lm) {
@@ -132,8 +188,10 @@ ChainMap::ChainMap(std::string fname, int verbose, int allowed_intvl_gaps,
132188
if (verbose > 2) {
133189
debug_print_interval_map();
134190
}
135-
// TEMP
136-
interval_map_sanity_check();
191+
if (!validate_intervals()) {
192+
std::cerr << "[E::chain::build] Interval validation failed\n";
193+
exit(1);
194+
}
137195
}
138196

139197
/* Create start and end bitvectors when see a new `source`.
@@ -187,26 +245,41 @@ void ChainMap::debug_print_intervals(std::string contig, const int n) {
187245
std::cerr << "\n";
188246
}
189247

190-
/* Check if the interval map contains any overlaps in the source reference
191-
* Logic: for each interval, its ending position <= next starting position
248+
/**
249+
* @brief Adds an interval to the specified contig for testing purposes.
250+
*
251+
* This method allows adding intervals to the interval map, primarily intended
252+
* for unit testing scenarios where controlled interval data is needed.
253+
*
254+
* @param contig The contig/chromosome name to add the interval to
255+
* @param interval The interval to add
256+
*/
257+
void ChainMap::add_interval(const std::string& contig, const Interval& interval) {
258+
interval_map[contig].push_back(interval);
259+
}
260+
261+
/**
262+
* @brief Validates that intervals in the interval map do not overlap in the source reference.
263+
*
264+
* For each interval in each contig, this function ensures that the end position
265+
* of the current interval is less than or equal to the start position of the next interval.
192266
*
193-
* Return true if pass; false otherwise.
267+
* @return true if validation passes (no overlaps found), false otherwise.
194268
*/
195-
bool ChainMap::interval_map_sanity_check() {
269+
bool ChainMap::validate_intervals() {
196270
for (auto &itr : interval_map) {
197271
std::vector<chain::Interval> v = interval_map[itr.first];
198272
for (int i = 0; i < v.size() - 1; i++) {
199273
if (v[i].source_end > v[i + 1].source_start) {
200-
std::cerr << "[E::chain::interval_map_sanity_check] "
274+
std::cerr << "[E::chain::validate_intervals] "
201275
<< itr.first << "\n";
202276
v[i].debug_print_interval();
203277
v[i + 1].debug_print_interval();
204278
return false;
205279
}
206280
}
207281
}
208-
std::cerr << "[I::chain::interval_map_sanity_check] Interval_map sanity "
209-
"check: passed (no overlaps)\n";
282+
std::cerr << "[I::chain::validate_intervals] Interval validation: passed (no overlaps)\n";
210283
return true;
211284
}
212285

src/chain.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ class ChainMap {
6262
void sort_intervals(std::string contig);
6363
void debug_print_interval_map();
6464
void debug_print_intervals(std::string contig, const int n);
65-
bool interval_map_sanity_check();
65+
bool validate_intervals();
6666
void log_index_size(size_t bytes = 0) const;
6767
int get_start_rank(std::string contig, int pos);
6868
int get_end_rank(std::string contig, int pos);
@@ -115,6 +115,8 @@ class ChainMap {
115115
size_t serialize(std::ofstream &out);
116116
void load(std::ifstream &in);
117117
std::vector<std::pair<std::string, int32_t>> length_map;
118+
119+
void add_interval(const std::string& contig, const Interval& interval);
118120

119121
private:
120122
void init_rs();

src/chain_test.cpp

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,109 @@ TEST(ChainTest, UpdateIntervalIndexes) {
539539
EXPECT_EQ(eidx, 784); // TODO: why not 785
540540
}
541541

542+
TEST(ChainTest, IntervalMapSanityCheckEmpty) {
543+
chain::ChainMap cmap;
544+
// Empty interval map should pass sanity check
545+
EXPECT_EQ(cmap.validate_intervals(), true);
546+
}
547+
548+
TEST(ChainTest, IntervalMapSanityCheckSingleInterval) {
549+
chain::ChainMap cmap;
550+
// Single interval should pass sanity check
551+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
552+
EXPECT_EQ(cmap.validate_intervals(), true);
553+
}
554+
555+
TEST(ChainTest, IntervalMapSanityCheckNoOverlaps) {
556+
chain::ChainMap cmap;
557+
// Add non-overlapping intervals
558+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
559+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 250, 350, 50, true));
560+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 400, 500, 50, true));
561+
562+
EXPECT_EQ(cmap.validate_intervals(), true);
563+
}
564+
565+
TEST(ChainTest, IntervalMapSanityCheckWithOverlaps) {
566+
chain::ChainMap cmap;
567+
// Add overlapping intervals - first interval ends at 200, second starts at 150
568+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
569+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 150, 250, 50, true));
570+
571+
EXPECT_EQ(cmap.validate_intervals(), false);
572+
}
573+
574+
TEST(ChainTest, IntervalMapSanityCheckAdjacentIntervals) {
575+
chain::ChainMap cmap;
576+
// Add adjacent intervals (no gap, no overlap) - should pass
577+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
578+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 200, 300, 50, true));
579+
580+
EXPECT_EQ(cmap.validate_intervals(), true);
581+
}
582+
583+
TEST(ChainTest, IntervalMapSanityCheckMultipleContigs) {
584+
chain::ChainMap cmap;
585+
// Add intervals to multiple contigs - some with overlaps, some without
586+
// chr1: no overlaps (should pass)
587+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
588+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 250, 350, 50, true));
589+
590+
// chr2: has overlaps (should fail)
591+
cmap.add_interval("chr2", chain::Interval("chr2_dest", 100, 200, 50, true));
592+
cmap.add_interval("chr2", chain::Interval("chr2_dest", 150, 250, 50, true));
593+
594+
// Should fail because chr2 has overlaps
595+
EXPECT_EQ(cmap.validate_intervals(), false);
596+
}
597+
598+
TEST(ChainTest, IntervalMapSanityCheckAllContigsValid) {
599+
chain::ChainMap cmap;
600+
// Add intervals to multiple contigs - all valid (no overlaps)
601+
// chr1: no overlaps
602+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
603+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 250, 350, 50, true));
604+
605+
// chr2: no overlaps
606+
cmap.add_interval("chr2", chain::Interval("chr2_dest", 100, 200, 50, true));
607+
cmap.add_interval("chr2", chain::Interval("chr2_dest", 300, 400, 50, true));
608+
609+
// chr3: single interval
610+
cmap.add_interval("chr3", chain::Interval("chr3_dest", 500, 600, 50, true));
611+
612+
// Should pass because all contigs have valid intervals
613+
EXPECT_EQ(cmap.validate_intervals(), true);
614+
}
615+
616+
TEST(ChainTest, IntervalMapSanityCheckExactOverlap) {
617+
chain::ChainMap cmap;
618+
// Add intervals with exact overlap (same start/end positions)
619+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
620+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
621+
622+
EXPECT_EQ(cmap.validate_intervals(), false);
623+
}
624+
625+
TEST(ChainTest, IntervalMapSanityCheckPartialOverlap) {
626+
chain::ChainMap cmap;
627+
// Add intervals with partial overlap
628+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 300, 50, true));
629+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 250, 400, 50, true));
630+
631+
EXPECT_EQ(cmap.validate_intervals(), false);
632+
}
633+
634+
TEST(ChainTest, IntervalMapSanityCheckDestOverlap) {
635+
chain::ChainMap cmap;
636+
// The dest intervals overlap, but the source intervals do not
637+
// Dest intervals: chr1_dest: [100, 300), chr1_dest: [100, 200)
638+
// Source intervals: chr1: [100, 300), chr1: [300, 400)
639+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 300, 0, true));
640+
cmap.add_interval("chr1", chain::Interval("chr1_dest", 300, 400, -200, true));
641+
642+
EXPECT_EQ(cmap.validate_intervals(), true);
643+
}
644+
542645
int main(int argc, char **argv) {
543646
testing::InitGoogleTest(&argc, argv);
544647
return RUN_ALL_TESTS();

0 commit comments

Comments
 (0)