Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 30 additions & 18 deletions cram/cram_encode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1502,17 +1502,20 @@ int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos,

// Ensure ref and hist are large enough.
static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
hts_pos_t ref_start, hts_pos_t *ref_end) {
hts_pos_t ref_start, hts_pos_t *ref_end,
hts_pos_t *ref_end_alloc) {
if (*ref_end < pos)
*ref_end = pos;
if (pos < ref_start)
return -1;
if (pos < *ref_end)
if (pos < *ref_end_alloc)
return 0;

// realloc
if (pos - ref_start > UINT_MAX)
return -2; // protect overflow in new_end calculation

hts_pos_t old_end = *ref_end ? *ref_end : ref_start;
hts_pos_t old_end = *ref_end_alloc ? *ref_end_alloc : ref_start;
hts_pos_t new_end = ref_start + 1000 + (pos-ref_start)*1.5;

// Refuse to work on excessively large blocks.
Expand All @@ -1531,7 +1534,7 @@ static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
if (!tmp5)
return -1;
*hist = tmp5;
*ref_end = new_end;
*ref_end_alloc = new_end;

// initialise
old_end -= ref_start;
Expand All @@ -1546,7 +1549,7 @@ static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
// Returns 1 on success, <0 on failure
static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
hts_pos_t ref_start, hts_pos_t *ref_end,
const uint8_t *MD) {
hts_pos_t *ref_end_alloc, const uint8_t *MD) {
uint8_t *seq = bam_get_seq(b);
uint32_t *cigar = bam_get_cigar(b);
uint32_t ncigar = b->core.n_cigar;
Expand All @@ -1558,7 +1561,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],

// No sequence means extend based on CIGAR instead
if (!b->core.l_qseq && extend_ref(ref, hist, rseq_end,
ref_start, ref_end) < 0)
ref_start, ref_end, ref_end_alloc) < 0)
return -1;

int iseq = 0, next_op;
Expand All @@ -1573,7 +1576,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow);
if (overflow ||
extend_ref(ref, hist, iref+ref_start + len,
ref_start, ref_end) < 0)
ref_start, ref_end, ref_end_alloc) < 0)
return -1;
while (iseq < b->core.l_qseq && len) {
// rewrite to have internal loops?
Expand Down Expand Up @@ -1605,7 +1608,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
MD++;
while (isalpha(*MD)) {
if (extend_ref(ref, hist, iref+ref_start, ref_start,
ref_end) < 0)
ref_end, ref_end_alloc) < 0)
return -1;
if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
&iseq, &cig_ind, &cig_op,
Expand All @@ -1621,7 +1624,8 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
}
} else {
// substitution
if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0)
if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end,
ref_end_alloc) < 0)
return -1;
if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
&iseq, &cig_ind, &cig_op,
Expand Down Expand Up @@ -1650,12 +1654,14 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
// Returns >=0 on success,
// -1 on failure (eg inconsistent data)
static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
hts_pos_t ref_start, hts_pos_t *ref_end) {
hts_pos_t ref_start, hts_pos_t *ref_end,
hts_pos_t *ref_end_alloc) {
const uint8_t *MD = bam_aux_get(b, "MD");
int ret = 0;
if (MD && *MD == 'Z') {
// We can use MD to directly compute the reference
int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1);
int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end,
ref_end_alloc, MD+1);

if (ret > 0)
return ret;
Expand Down Expand Up @@ -1683,7 +1689,7 @@ static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4};

if (extend_ref(ref, hist, iref+ref_start + len,
ref_start, ref_end) < 0)
ref_start, ref_end, ref_end_alloc) < 0)
return -1;
if (iseq + len <= b->core.l_qseq) {
// Nullify failed MD:Z if appropriate
Expand Down Expand Up @@ -1726,15 +1732,16 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
// user told us to do embed_ref=2.
char *ref = NULL;
uint32_t (*hist)[5] = NULL;
hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0;
hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0,
ref_end_alloc = 0;
if (ref_start < 0)
return -1; // cannot build consensus from unmapped data

// initial allocation
if (extend_ref(&ref, &hist,
c->bams[r1 + s->hdr->num_records-1]->core.pos +
c->bams[r1 + s->hdr->num_records-1]->core.l_qseq,
ref_start, &ref_end) < 0)
ref_start, &ref_end, &ref_end_alloc) < 0)
return -1;

// Add each bam file to the reference/consensus arrays
Expand All @@ -1746,7 +1753,8 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
goto err;
}
last_pos = c->bams[r1]->core.pos;
if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0)
if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end,
&ref_end_alloc) < 0)
goto err;
}

Expand All @@ -1770,6 +1778,7 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
c->ref_start = ref_start+1;
c->ref_end = ref_end+1;
c->ref_free = 1;

return 0;

err:
Expand Down Expand Up @@ -2012,8 +2021,10 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
pthread_mutex_unlock(&fd->ref_lock);
}

if (c->ref_end > fd->refs->ref_id[c->ref_id]->LN_length)
c->ref_end = fd->refs->ref_id[c->ref_id]->LN_length;
hts_pos_t rlen = MAX(fd->refs->ref_id[c->ref_id]->LN_length,
fd->refs->ref_id[c->ref_id]->length);
if (c->ref_end > rlen && rlen)
c->ref_end = rlen;
}

// Iterate through records creating the cram blocks for some
Expand Down Expand Up @@ -3927,7 +3938,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,
if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
cram_stats_del(c->stats[DS_NP], p->mate_pos);
cram_stats_del(c->stats[DS_MF], p->mate_flags);
if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN))
if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN)
&& !explicit_tlen)
cram_stats_del(c->stats[DS_TS], p->tlen);
cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
}
Expand Down
10 changes: 10 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -894,6 +894,16 @@ sub test_view
testv $opts, "./test_view $tv_args -p $ersam2 $ercram";
testv $opts, "./compare_sam.pl $ersam $ersam2";

# Embed_ref=2 with CRAM v4 uses explicit_len if it has to instead of
# breaking pairs with detached mode.
# Oddly this bug was only triggered when also specifying a reference.
$ersam = "xx#pair.sam";
$ercram = "xx#pair.tmp.cram";
$ersam2 = "${ercram}.sam";
testv $opts, "./test_view $tv_args -o version=4.0 -o embed_ref=2 -t xx.fa -C -p $ercram $ersam";
testv $opts, "./test_view $tv_args -p $ersam2 $ercram";
testv $opts, "./compare_sam.pl $ersam $ersam2";

if ($test_view_failures == 0) {
passed($opts, "embed_ref=2 tests");
} else {
Expand Down
Loading