diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 7ec3c003a..735784e89 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1502,17 +1502,20 @@ int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos, // Ensure ref and hist are large enough. static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos, - hts_pos_t ref_start, hts_pos_t *ref_end) { + hts_pos_t ref_start, hts_pos_t *ref_end, + hts_pos_t *ref_end_alloc) { + if (*ref_end < pos) + *ref_end = pos; if (pos < ref_start) return -1; - if (pos < *ref_end) + if (pos < *ref_end_alloc) return 0; // realloc if (pos - ref_start > UINT_MAX) return -2; // protect overflow in new_end calculation - hts_pos_t old_end = *ref_end ? *ref_end : ref_start; + hts_pos_t old_end = *ref_end_alloc ? *ref_end_alloc : ref_start; hts_pos_t new_end = ref_start + 1000 + (pos-ref_start)*1.5; // Refuse to work on excessively large blocks. @@ -1531,7 +1534,7 @@ static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos, if (!tmp5) return -1; *hist = tmp5; - *ref_end = new_end; + *ref_end_alloc = new_end; // initialise old_end -= ref_start; @@ -1546,7 +1549,7 @@ static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos, // Returns 1 on success, <0 on failure static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], hts_pos_t ref_start, hts_pos_t *ref_end, - const uint8_t *MD) { + hts_pos_t *ref_end_alloc, const uint8_t *MD) { uint8_t *seq = bam_get_seq(b); uint32_t *cigar = bam_get_cigar(b); uint32_t ncigar = b->core.n_cigar; @@ -1558,7 +1561,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], // No sequence means extend based on CIGAR instead if (!b->core.l_qseq && extend_ref(ref, hist, rseq_end, - ref_start, ref_end) < 0) + ref_start, ref_end, ref_end_alloc) < 0) return -1; int iseq = 0, next_op; @@ -1573,7 +1576,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow); if (overflow || extend_ref(ref, hist, iref+ref_start + len, - ref_start, ref_end) < 0) + ref_start, ref_end, ref_end_alloc) < 0) return -1; while (iseq < b->core.l_qseq && len) { // rewrite to have internal loops? @@ -1605,7 +1608,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], MD++; while (isalpha(*MD)) { if (extend_ref(ref, hist, iref+ref_start, ref_start, - ref_end) < 0) + ref_end, ref_end_alloc) < 0) return -1; if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, &iseq, &cig_ind, &cig_op, @@ -1621,7 +1624,8 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], } } else { // substitution - if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0) + if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end, + ref_end_alloc) < 0) return -1; if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, &iseq, &cig_ind, &cig_op, @@ -1650,12 +1654,14 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], // Returns >=0 on success, // -1 on failure (eg inconsistent data) static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5], - hts_pos_t ref_start, hts_pos_t *ref_end) { + hts_pos_t ref_start, hts_pos_t *ref_end, + hts_pos_t *ref_end_alloc) { const uint8_t *MD = bam_aux_get(b, "MD"); int ret = 0; if (MD && *MD == 'Z') { // We can use MD to directly compute the reference - int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1); + int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, + ref_end_alloc, MD+1); if (ret > 0) return ret; @@ -1683,7 +1689,7 @@ static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5], static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4}; if (extend_ref(ref, hist, iref+ref_start + len, - ref_start, ref_end) < 0) + ref_start, ref_end, ref_end_alloc) < 0) return -1; if (iseq + len <= b->core.l_qseq) { // Nullify failed MD:Z if appropriate @@ -1726,7 +1732,8 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { // user told us to do embed_ref=2. char *ref = NULL; uint32_t (*hist)[5] = NULL; - hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0; + hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0, + ref_end_alloc = 0; if (ref_start < 0) return -1; // cannot build consensus from unmapped data @@ -1734,7 +1741,7 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { if (extend_ref(&ref, &hist, c->bams[r1 + s->hdr->num_records-1]->core.pos + c->bams[r1 + s->hdr->num_records-1]->core.l_qseq, - ref_start, &ref_end) < 0) + ref_start, &ref_end, &ref_end_alloc) < 0) return -1; // Add each bam file to the reference/consensus arrays @@ -1746,7 +1753,8 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { goto err; } last_pos = c->bams[r1]->core.pos; - if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0) + if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end, + &ref_end_alloc) < 0) goto err; } @@ -1770,6 +1778,7 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { c->ref_start = ref_start+1; c->ref_end = ref_end+1; c->ref_free = 1; + return 0; err: @@ -2012,8 +2021,10 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { pthread_mutex_unlock(&fd->ref_lock); } - if (c->ref_end > fd->refs->ref_id[c->ref_id]->LN_length) - c->ref_end = fd->refs->ref_id[c->ref_id]->LN_length; + hts_pos_t rlen = MAX(fd->refs->ref_id[c->ref_id]->LN_length, + fd->refs->ref_id[c->ref_id]->length); + if (c->ref_end > rlen && rlen) + c->ref_end = rlen; } // Iterate through records creating the cram blocks for some @@ -3927,7 +3938,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (p->cram_flags & CRAM_FLAG_STATS_ADDED) { cram_stats_del(c->stats[DS_NP], p->mate_pos); cram_stats_del(c->stats[DS_MF], p->mate_flags); - if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN)) + if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) + && !explicit_tlen) cram_stats_del(c->stats[DS_TS], p->tlen); cram_stats_del(c->stats[DS_NS], p->mate_ref_id); } diff --git a/test/test.pl b/test/test.pl index 8a68a658c..eaa65ea30 100755 --- a/test/test.pl +++ b/test/test.pl @@ -894,6 +894,16 @@ sub test_view testv $opts, "./test_view $tv_args -p $ersam2 $ercram"; testv $opts, "./compare_sam.pl $ersam $ersam2"; + # Embed_ref=2 with CRAM v4 uses explicit_len if it has to instead of + # breaking pairs with detached mode. + # Oddly this bug was only triggered when also specifying a reference. + $ersam = "xx#pair.sam"; + $ercram = "xx#pair.tmp.cram"; + $ersam2 = "${ercram}.sam"; + testv $opts, "./test_view $tv_args -o version=4.0 -o embed_ref=2 -t xx.fa -C -p $ercram $ersam"; + testv $opts, "./test_view $tv_args -p $ersam2 $ercram"; + testv $opts, "./compare_sam.pl $ersam $ersam2"; + if ($test_view_failures == 0) { passed($opts, "embed_ref=2 tests"); } else {