From 418a183ef842a966afb00e783c09ac7fd7eaeb62 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 31 May 2018 16:28:24 +0100 Subject: [PATCH 01/23] First stage of supporting large chromosomes. The in-memory data structures are 64-bit for pos, mate-pos and insert size, along with iterators. The code that fills these out is still all 32-bit so this is basically a place-holder for ABI purposes. The exception to this is SAM support, which being purely textual has the minimal changes necessary to read and write 64-bit values. Split the hts_parse_reg API to 32-bit and 64-bit variants (although 64-bit version is only used internally at the moment). Too much code uses this with addresses of 32-bit quantities, so for compatibility hts_parse_reg() cannot change. 64-bit parse_reg uses a slightly tweaked value for the end for chromosomes with no range (e.g. "chr1"). Using INT64_MAX would yield -1 when cast into int. We now have nearly 64-bit max which when truncated to 32-bit is still INT_MAX. The only change needed in samtools to pass tests is fixing cur5 and pre5 in bam_mate.c. --- cram/cram_encode.c | 2 +- cram/cram_structs.h | 4 +-- hts.c | 82 ++++++++++++++++++++++++++++++------------- htslib/hts.h | 19 +++++----- htslib/kstring.h | 8 +++-- htslib/sam.h | 30 ++++++++++++---- htslib/tbx.h | 2 +- htslib/vcf.h | 6 ++-- region.c | 2 ++ sam.c | 49 +++++++++++++++----------- synced_bcf_reader.c | 3 +- tbx.c | 2 +- test/sam.c | 6 ++-- test/test-bcf-sr.c | 3 +- test/test-parse-reg.c | 24 +++++++------ vcf.c | 59 +++++++++++++++++++------------ vcfutils.c | 53 ++++++++++++++-------------- 17 files changed, 220 insertions(+), 134 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 35b701a80..42d82f6b9 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -3065,7 +3065,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/- // a small amount, if appropriate options set. 
if ((bam_ins_size(b) && - abs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) || + llabs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) || (!bam_ins_size(b) && !fd->tlen_zero)) goto detached; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 2cde6cfef..a1b7d8e7b 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -661,8 +661,8 @@ typedef struct cram_index { typedef struct { int refid; - int start; - int end; + int64_t start; + int64_t end; } cram_range; /*----------------------------------------------------------------------------- diff --git a/hts.c b/hts.c index 6133f893b..ecd46ee48 100644 --- a/hts.c +++ b/hts.c @@ -32,6 +32,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -1517,7 +1518,7 @@ KHASH_MAP_INIT_INT(bin, bins_t) typedef khash_t(bin) bidx_t; typedef struct { - int32_t n, m; + int64_t n, m; uint64_t *offset; } lidx_t; @@ -1532,7 +1533,8 @@ struct __hts_idx_t { int tbi_n, last_tbi_tid; struct { uint32_t last_bin, save_bin; - int last_coor, last_tid, save_tid, finished; + int64_t last_coor; + int last_tid, save_tid, finished; uint64_t last_off, save_off; uint64_t off_beg, off_end; uint64_t n_mapped, n_unmapped; @@ -1578,7 +1580,8 @@ static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end) static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift) { - int i, beg, end; + int i; + int64_t beg, end; beg = _beg >> min_shift; end = (_end - 1) >> min_shift; if (l->m < end + 1) { @@ -1732,7 +1735,7 @@ int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) return ret; } -int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped) +int hts_idx_push(hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t offset, int is_mapped) { int bin; int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3); @@ -1773,12 +1776,12 @@ int hts_idx_push(hts_idx_t *idx, int tid, 
int beg, int end, uint64_t offset, int idx->z.last_tid = tid; idx->z.last_bin = 0xffffffffu; } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order - hts_log_error("Unsorted positions on sequence #%d: %d followed by %d", tid+1, idx->z.last_coor+1, beg+1); + hts_log_error("Unsorted positions on sequence #%d: %"PRId64" followed by %"PRId64, tid+1, idx->z.last_coor+1, beg+1); return -1; } else if (end < beg) { // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.) - hts_log_error("Invalid record on sequence #%d: end %d < begin %d", tid+1, end, beg+1); + hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1); return -1; } if ( tid>=0 ) @@ -1828,14 +1831,14 @@ int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int } if (idx->fmt == HTS_FMT_CSI) { - hts_log_error("Region %d..%d cannot be stored in a csi index " + hts_log_error("Region %"PRId64"..%"PRId64" cannot be stored in a csi index " "with min_shift = %d, n_lvls = %d. Try using " "min_shift = 14, n_lvls >= %d", beg, end, idx->min_shift, idx->n_lvls, n_lvls); } else { - hts_log_error("Region %d..%d cannot be stored in a %s index. " + hts_log_error("Region %"PRId64"..%"PRId64" cannot be stored in a %s index. 
" "Try using a csi index with min_shift = 14, " "n_lvls >= %d", beg, end, idx_format_name(idx->fmt), @@ -2275,7 +2278,8 @@ static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shi if (beg >= end) return 0; if (end >= 1LL<>s); e = t + (end>>s); n = e - b + 1; if (itr->bins.n + n > itr->bins.m) { itr->bins.m = itr->bins.n + n; @@ -2396,7 +2400,7 @@ uint64_t hts_itr_off(const hts_idx_t* idx, int tid) { return off0; } -hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) +hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec) { int i, n_off, l, bin; hts_pair64_max_t *off; @@ -2527,7 +2531,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) khint_t k; bidx_t *bidx; uint64_t min_off, max_off, t_off = (uint64_t)-1; - int tid, beg, end; + int tid; + int64_t beg, end; hts_reglist_t *curr_reg; if (!idx || !iter || !iter->multi) @@ -2646,7 +2651,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; - int tid, beg, end, i, j, l, n_off = 0; + int tid, i, j, l, n_off = 0; + int64_t beg, end; hts_reglist_t *curr_reg; hts_pair32_t *curr_intv; hts_pair64_max_t *off = NULL; @@ -2700,10 +2706,10 @@ int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) off[n_off].max = (uint64_t)tid<<32 | end; n_off++; } else { - hts_log_warning("Could not set offset end for region %d:%d-%d. Skipping", tid, beg, end); + hts_log_warning("Could not set offset end for region %d:%"PRId64"-%"PRId64". 
Skipping", tid, beg, end); } } else { - hts_log_warning("No index entry for region %d:%d-%d", tid, beg, end); + hts_log_warning("No index entry for region %d:%"PRId64"-%"PRId64"", tid, beg, end); } } } else { @@ -2856,6 +2862,11 @@ static void *hts_memrchr(const void *s, int c, size_t n) { return NULL; } +// Almost INT64_MAX, but when cast into a 32-bit int it's +// also INT_MAX instead of -1. This avoids bugs with old code +// using the new data types. +#define INT64_32_MAX ((((int64_t)INT_MAX)<<32)|INT_MAX) + /* * A variant of hts_parse_reg which is reference-id aware. It uses * the iterator name2id callbacks to validate the region tokenisation works. @@ -2957,7 +2968,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end // No colon is simplest case; just check and return. if (colon == NULL) { - *beg = 0; *end = INT64_MAX; + *beg = 0; *end = INT64_32_MAX; kputsn(s, s_len-quoted, &ks); // convert to nul terminated string if (!ks.s) { *tid = -2; @@ -2972,7 +2983,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end // Has a colon, but check whole name first. if (!quoted) { - *beg = 0; *end = INT64_MAX; + *beg = 0; *end = INT64_32_MAX; kputsn(s, s_len, &ks); // convert to nul terminated string if (!ks.s) { *tid = -2; @@ -3023,7 +3034,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end if (*beg < 0) { if (isdigit(*hyphen) || *hyphen == '\0' || *hyphen == ',') { // interpret chr:-100 as chr:1-100 - *end = *beg==-1 ? INT64_MAX : -(*beg+1); + *end = *beg==-1 ? INT64_32_MAX : -(*beg+1); *beg = 0; return s_end; } else if (*hyphen == '-') { @@ -3035,7 +3046,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end } if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) { - *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : INT64_MAX; + *end = flags & HTS_PARSE_ONE_COORD ? 
*beg+1 : INT64_32_MAX; } else if (*hyphen == '-') { *end = hts_parse_decimal(hyphen+1, &hyphen, flags); if (*hyphen != '\0' && *hyphen != ',') { @@ -3048,7 +3059,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end } if (*end == 0) - *end = INT64_MAX; // interpret chr:100- as chr:100- + *end = INT64_32_MAX; // interpret chr:100- as chr:100- if (*beg >= *end) return NULL; @@ -3057,19 +3068,19 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end // Next release we should mark this as deprecated? // Use hts_parse_region above instead. -const char *hts_parse_reg(const char *s, int *beg, int *end) +const char *hts_parse_reg_(const char *s, int64_t *beg, int64_t *end) { char *hyphen; const char *colon = strrchr(s, ':'); if (colon == NULL) { - *beg = 0; *end = INT_MAX; + *beg = 0; *end = INT64_32_MAX; return s + strlen(s); } *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1; if (*beg < 0) *beg = 0; - if (*hyphen == '\0') *end = INT_MAX; + if (*hyphen == '\0') *end = INT64_32_MAX; else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP); else return NULL; @@ -3077,6 +3088,27 @@ const char *hts_parse_reg(const char *s, int *beg, int *end) return colon; } +const char *hts_parse_reg(const char *s, int *beg, int *end) +{ + int64_t beg64 = 0, end64 = 0; + const char *colon = hts_parse_reg_(s, &beg64, &end64); + if (beg64 > INT_MAX) { + hts_log_error("Position %"PRId64" too large", beg64); + return NULL; + } + if (end64 > INT_MAX) { + if (end64 == INT64_32_MAX) { + end64 = INT_MAX; + } else { + hts_log_error("Position %"PRId64" too large", end64); + return NULL; + } + } + *beg = beg64; + *end = end64; + return colon; +} + hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) { int tid; @@ -3152,7 +3184,8 @@ hts_itr_t *hts_itr_regions(const hts_idx_t *idx, 
hts_reglist_t *reglist, int cou int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) { - int ret, tid, beg, end; + int ret, tid; + int64_t beg, end; if (iter == NULL || iter->finished) return -1; if (iter->read_rest) { if (iter->curr_off) { // seek to the start @@ -3196,7 +3229,8 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r) { void *fp; - int ret, tid, beg, end, i, cr, ci; + int ret, tid, i, cr, ci; + int64_t beg, end; hts_reglist_t *found_reg; if (iter == NULL || iter->finished) return -1; diff --git a/htslib/hts.h b/htslib/hts.h index 85847a8fe..08c357815 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -578,7 +578,8 @@ When REST or NONE is used, idx is also ignored and may be NULL. #define HTS_FMT_CRAI 3 typedef struct { - uint32_t beg, end; + //uint32_t beg, end; + uint64_t beg, end; // sorry for the bad naming: FIXME! } hts_pair32_t; typedef struct { @@ -595,18 +596,20 @@ typedef struct { hts_pair32_t *intervals; int tid; uint32_t count; - uint32_t min_beg, max_end; + uint64_t min_beg, max_end; } hts_reglist_t; -typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end); +typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int64_t *beg, int64_t *end); typedef int hts_seek_func(void *fp, int64_t offset, int where); typedef int64_t hts_tell_func(void *fp); typedef struct { uint32_t read_rest:1, finished:1, is_cram:1, nocoor:1, multi:1, dummy:27; - int tid, beg, end, n_off, i, n_reg; + int tid, n_off, i, n_reg; + int64_t beg, end; hts_reglist_t *reg_list; - int curr_tid, curr_beg, curr_end, curr_reg, curr_intv; + int curr_tid, curr_reg, curr_intv; + int64_t curr_beg, curr_end; uint64_t curr_off, nocoor_off; hts_pair64_max_t *off; hts_readrec_func *readrec; @@ -658,7 +661,7 @@ void hts_idx_destroy(hts_idx_t *idx); The @p is_mapped parameter is used to update the n_mapped / n_unmapped counts stored in the meta-data bin. 
*/ -int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped); +int hts_idx_push(hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t offset, int is_mapped); /// Finish building an index /** @param idx Index @@ -940,14 +943,14 @@ const char *hts_parse_region(const char *str, int *tid, int64_t *beg, int64_t *e @param readrec Callback to read a record from the input file @return An iterator on success; NULL on failure */ -hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec); +hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec); /// Free an iterator /** @param iter Iterator to free */ void hts_itr_destroy(hts_itr_t *iter); -typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec); +typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec); /// Create a single-region iterator from a text region specification /** @param idx Index diff --git a/htslib/kstring.h b/htslib/kstring.h index c440cd5e9..f817ed5dc 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -354,11 +354,11 @@ static inline int kputw(int c, kstring_t *s) return kputuw(x, s); } -static inline int kputl(long c, kstring_t *s) +static inline int kputll(long long c, kstring_t *s) { char buf[32]; int i, l = 0; - unsigned long x = c; + unsigned long long x = c; if (c < 0) x = -x; do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); if (c < 0) buf[l++] = '-'; @@ -369,6 +369,10 @@ static inline int kputl(long c, kstring_t *s) return 0; } +static inline int kputl(long c, kstring_t *s) { + return kputll(c, s); +} + /* * Returns 's' split by delimiter, with *n being the number of components; * NULL on failue. 
diff --git a/htslib/sam.h b/htslib/sam.h index 8e1615e61..a4086ca1a 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -169,6 +169,22 @@ extern const int8_t bam_cigar_table[256]; *** Alignment records *** *************************/ +/* + * Assumptions made here. While pos can be 64-bit, no sequence + * itself is that long, but due to ref skip CIGAR fields it + * may span more than that. (CIGAR itself is 28-bit len + 4 bit + * type, but in theory we can combine multiples together.) + * + * Mate position and insert size also need to be 64-bit, but + * we won't accept more than 32-bit for tid. + * + * The bam_core_t structure is the *in memory* layout and not + * the same as the on-disk format. 64-bit changes here permit + * SAM to work with very long chromosomes and permit BAM and CRAM + * to seamlessly update in the future without further API/ABI + * revisions. + */ + /*! @typedef @abstract Structure for core alignment information. @field tid chromosome ID, defined by sam_hdr_t @@ -185,8 +201,8 @@ extern const int8_t bam_cigar_table[256]; */ typedef struct { int32_t tid; - int32_t pos; - uint16_t bin; + int64_t pos; + uint16_t bin; // NB: invalid on 64-bit pos uint8_t qual; uint8_t l_extranul; uint16_t flag; @@ -194,8 +210,8 @@ typedef struct { uint32_t n_cigar; int32_t l_qseq; int32_t mtid; - int32_t mpos; - int32_t isize; + int64_t mpos; + int64_t isize; } bam1_core_t; /*! @typedef @@ -946,7 +962,7 @@ int bam_cigar2qlen(int n_cigar, const uint32_t *cigar); operations in @p cigar (these are the operations that "consume" reference bases). All other operations (including invalid ones) are ignored. */ -int bam_cigar2rlen(int n_cigar, const uint32_t *cigar); +int64_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar); /*! 
@abstract Calculate the rightmost base position of an alignment on the @@ -959,7 +975,7 @@ int bam_cigar2rlen(int n_cigar, const uint32_t *cigar); For an unmapped read (either according to its flags or if it has no cigar string), we return b->core.pos + 1 by convention. */ -int32_t bam_endpos(const bam1_t *b); +int64_t bam_endpos(const bam1_t *b); int bam_str2flag(const char *str); /** returns negative value on error */ char *bam_flag2str(int flag); /** The string must be freed by the user */ @@ -1084,7 +1100,7 @@ When using one of these values, @p beg and @p end are ignored. When using HTS_IDX_REST or HTS_IDX_NONE, NULL can be passed in to @p idx. */ -hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end); +hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int64_t beg, int64_t end); /// Create a SAM/BAM/CRAM iterator /** @param idx Index diff --git a/htslib/tbx.h b/htslib/tbx.h index 9119ab8a3..52f103b11 100644 --- a/htslib/tbx.h +++ b/htslib/tbx.h @@ -64,7 +64,7 @@ extern const tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sa /* Internal helper function used by tbx_itr_next() */ BGZF *hts_get_bgzfp(htsFile *fp); - int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end); + int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int64_t *beg, int64_t *end); tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf); /* diff --git a/htslib/vcf.h b/htslib/vcf.h index 31720d7f1..7116a1229 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -209,8 +209,8 @@ typedef struct { */ typedef struct { int32_t rid; // CHROM - int32_t pos; // POS - int32_t rlen; // length of REF + int64_t pos; // POS + int64_t rlen; // length of REF float qual; // QUAL uint32_t n_info:16, n_allele:16; uint32_t n_fmt:8, n_sample:24; @@ -427,7 +427,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). 
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) HTS_RESULT_USED; /** Helper function for the bcf_itr_next() macro; internal use, ignore it */ - int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, int *beg, int *end); + int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, int64_t *beg, int64_t *end); diff --git a/region.c b/region.c index d9679f79f..d6680b8a0 100644 --- a/region.c +++ b/region.c @@ -37,6 +37,8 @@ typedef struct reglist KHASH_MAP_INIT_INT(reg, reglist_t) typedef kh_reg_t reghash_t; +const char *hts_parse_reg_(const char *s, int64_t *beg, int64_t *end); + static int compare_uint64 (const void * a, const void * b) { if (*(uint64_t *)a < *(uint64_t *)b) return -1; diff --git a/sam.c b/sam.c index 5d794de68..7948d1724 100644 --- a/sam.c +++ b/sam.c @@ -455,16 +455,17 @@ int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) return l; } -int bam_cigar2rlen(int n_cigar, const uint32_t *cigar) +int64_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar) { - int k, l; + int k; + int64_t l; for (k = l = 0; k < n_cigar; ++k) if (bam_cigar_type(bam_cigar_op(cigar[k]))&2) l += bam_cigar_oplen(cigar[k]); return l; } -int32_t bam_endpos(const bam1_t *b) +int64_t bam_endpos(const bam1_t *b) { if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0) return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); @@ -556,12 +557,12 @@ int bam_read1(BGZF *fp, bam1_t *b) if (fp->is_be) { for (i = 0; i < 8; ++i) ed_swap_4p(x + i); } - c->tid = x[0]; c->pos = x[1]; + c->tid = x[0]; c->pos = (int32_t)x[1]; c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; c->l_extranul = (c->l_qname%4 != 0)? 
(4 - c->l_qname%4) : 0; c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; c->l_qseq = x[4]; - c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; + c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7]; new_l_data = block_len - 32 + c->l_extranul; if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4; @@ -608,6 +609,12 @@ int bam_write1(BGZF *fp, const bam1_t *b) return -1; } if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR + if (c->pos > INT_MAX || + c->mpos > INT_MAX || + c->isize < INT_MIN || c->isize > INT_MAX) { + hts_log_error("Positional data is too large for BAM format"); + return -1; + } x[0] = c->tid; x[1] = c->pos; x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); @@ -828,7 +835,7 @@ int sam_idx_save(htsFile *fp) { return 0; } -static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end) +static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int64_t *beg, int64_t *end) { htsFile *fp = (htsFile *)fpv; bam1_t *b = bv; @@ -843,7 +850,7 @@ static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, i } // This is used only with read_rest=1 iterators, so need not set tid/beg/end. 
-static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end) +static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, int64_t *beg, int64_t *end) { htsFile *fp = (htsFile *)fpv; bam1_t *b = bv; @@ -852,7 +859,7 @@ static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, int *b return ret; } -static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end) +static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int64_t *beg, int64_t *end) { htsFile *fp = fpv; bam1_t *b = bv; @@ -975,7 +982,7 @@ hts_idx_t *sam_index_load(htsFile *fp, const char *fn) return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE); } -static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) +static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t)); @@ -1032,7 +1039,7 @@ static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end return iter; } -hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) +hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int64_t beg, int64_t end) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; if (idx == NULL) @@ -2759,7 +2766,7 @@ static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *st r |= kputs(h->target_name[c->tid] , str); r |= kputc_('\t', str); } else r |= kputsn_("*\t", 2, str); - r |= kputw(c->pos + 1, str); r |= kputc_('\t', str); // pos + r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual if (c->n_cigar) { // cigar uint32_t *cigar = bam_get_cigar(b); @@ -2775,8 +2782,8 @@ static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *st r |= 
kputs(h->target_name[c->mtid], str); r |= kputc_('\t', str); } - r |= kputw(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos - r |= kputw(c->isize, str); r |= kputc_('\t', str); // template len + r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos + r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len if (c->l_qseq) { // seq and qual uint8_t *s = bam_get_seq(b); if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err; @@ -3659,7 +3666,7 @@ static cstate_t g_cstate_null = { -1, 0, 0, 0 }; typedef struct __linkbuf_t { bam1_t b; - int32_t beg, end; + int64_t beg, end; cstate_t s; struct __linkbuf_t *next; bam_pileup_cd cd; @@ -3957,7 +3964,7 @@ void bam_plp_destructor(bam_plp_t plp, * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered, * or -2 on error. */ -static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref) +static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int64_t *iref) { int pos = *iref; if ( pos < 0 ) return -1; @@ -3992,7 +3999,7 @@ static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *iseq = -1; return -1; } -static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref) +static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int64_t *iref) { while ( *cigar < cigar_max ) { @@ -4026,16 +4033,16 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b) uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b); - int iref = b->core.pos; - int a_iref = iref - a->core.pos; - int b_iref = iref - b->core.pos; + int64_t iref = b->core.pos; + int64_t a_iref = iref - a->core.pos; + int64_t b_iref = iref - b->core.pos; int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, 
&a_iref); if ( a_ret<0 ) return a_ret<-1 ? -1:0; // no overlap or error int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); if ( b_ret<0 ) return b_ret<-1 ? -1:0; // no overlap or error #if DBG - fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, + fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %"PRId64"-%"PRId64"\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b))); #endif @@ -4105,7 +4112,7 @@ static int overlap_push(bam_plp_t iter, lbnode_t *node) // no overlap possible, unless some wild cigar if ( node->b.core.tid != node->b.core.mtid - || (abs(node->b.core.isize) >= 2*node->b.core.l_qseq + || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq && node->b.core.mpos >= node->end) // for those wild cigars ) return 0; diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 6b65e3133..425fae1ce 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include #include #include #include "htslib/synced_bcf_reader.h" @@ -383,7 +384,7 @@ void debug_buffer(FILE *fp, bcf_sr_t *reader) for (j=0; j<=reader->nbuffer; j++) { bcf1_t *line = reader->buffer[j]; - fprintf(fp,"\t%p\t%s%s\t%s:%d\t%s ", (void*)line,reader->fname,j==0?"*":" ",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:""); + fprintf(fp,"\t%p\t%s%s\t%s:%"PRId64"\t%s ", (void*)line,reader->fname,j==0?"*":" ",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:""); int k; for (k=1; kn_allele; k++) fprintf(fp," %s", line->d.allele[k]); fprintf(fp,"\n"); diff --git a/tbx.c b/tbx.c index 2e0f3499e..d5e3c3be5 100644 --- a/tbx.c +++ b/tbx.c @@ -172,7 +172,7 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_ * -1 on EOF * <= -2 on error */ -int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end) +int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int64_t *beg, int64_t *end) { tbx_t *tbx = (tbx_t *) tbxv; kstring_t *s = (kstring_t *) sv; diff --git a/test/sam.c b/test/sam.c index 7cbfd179e..bf429bf98 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1148,12 +1148,12 @@ static void samrecord_layout(void) size_t bam1_t_size, bam1_t_size2; - bam1_t_size = (36 + sizeof(int) + 4 + sizeof (char *) + sizeof(uint64_t) + bam1_t_size = (56 + sizeof(int) + 4 + sizeof (char *) + sizeof(uint64_t) + sizeof(uint32_t)); bam1_t_size2 = bam1_t_size + 4; // Account for padding on some platforms - if (sizeof (bam1_core_t) != 36) - fail("sizeof bam1_core_t is %zu, expected 36", sizeof (bam1_core_t)); + if (sizeof (bam1_core_t) != 56) + fail("sizeof bam1_core_t is %zu, expected 56", sizeof (bam1_core_t)); if (sizeof (bam1_t) != bam1_t_size && sizeof (bam1_t) != bam1_t_size2) fail("sizeof bam1_t is %zu, expected either %zu or %zu", diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c index ebe93904a..23ee1d3e0 100644 --- 
a/test/test-bcf-sr.c +++ b/test/test-bcf-sr.c @@ -31,6 +31,7 @@ #include #include #include +#include #include void error(const char *format, ...) @@ -103,7 +104,7 @@ int main(int argc, char *argv[]) { if ( !bcf_sr_has_line(sr,i) ) continue; bcf1_t *rec = bcf_sr_get_line(sr, i); - printf("%s:%d", bcf_seqname(bcf_sr_get_header(sr,i),rec),rec->pos+1); + printf("%s:%"PRId64, bcf_seqname(bcf_sr_get_header(sr,i),rec),rec->pos+1); break; } diff --git a/test/test-parse-reg.c b/test/test-parse-reg.c index 404e98ddf..74bb3187f 100644 --- a/test/test-parse-reg.c +++ b/test/test-parse-reg.c @@ -47,6 +47,10 @@ #include #include +#ifndef INT64_32_MAX +#define INT64_32_MAX ((((int64_t)INT_MAX)<<32)|INT_MAX) +#endif + void reg_expected(sam_hdr_t *hdr, const char *reg, int flags, char *reg_exp, int tid_exp, int64_t beg_exp, int64_t end_exp) { const char *reg_out; @@ -87,26 +91,26 @@ int reg_test(char *fn) { // 5 chr1,chr3 // Check range extensions. - reg_expected(hdr, "chr1", 0, "", 0, 0, INT64_MAX); - reg_expected(hdr, "chr1:50", 0, "", 0, 49, INT64_MAX); + reg_expected(hdr, "chr1", 0, "", 0, 0, INT64_32_MAX); + reg_expected(hdr, "chr1:50", 0, "", 0, 49, INT64_32_MAX); reg_expected(hdr, "chr1:50", HTS_PARSE_ONE_COORD, "", 0, 49, 50); reg_expected(hdr, "chr1:50-100", 0, "", 0, 49, 100); - reg_expected(hdr, "chr1:50-", 0, "", 0, 49, INT64_MAX); + reg_expected(hdr, "chr1:50-", 0, "", 0, 49, INT64_32_MAX); reg_expected(hdr, "chr1:-50", 0, "", 0, 0, 50); // Check quoting fprintf(stderr, "Expected error: "); reg_expected(hdr, "chr1:100-200", 0, NULL, 0, 0, 0); // ambiguous reg_expected(hdr, "{chr1}:100-200", 0, "", 0, 99, 200); - reg_expected(hdr, "{chr1:100-200}", 0, "", 2, 0, INT64_MAX); + reg_expected(hdr, "{chr1:100-200}", 0, "", 2, 0, INT64_32_MAX); reg_expected(hdr, "{chr1:100-200}:100-200", 0, "", 2, 99, 200); reg_expected(hdr, "{chr2:100-200}:100-200", 0, "", 3, 99, 200); reg_expected(hdr, "chr2:100-200:100-200", 0, "", 3, 99, 200); - reg_expected(hdr, "chr2:100-200", 0, "", 3, 
0, INT64_MAX); + reg_expected(hdr, "chr2:100-200", 0, "", 3, 0, INT64_32_MAX); // Check numerics - reg_expected(hdr, "chr3", 0, "", 4, 0, INT64_MAX); - reg_expected(hdr, "chr3:", 0, "", 4, 0, INT64_MAX); + reg_expected(hdr, "chr3", 0, "", 4, 0, INT64_32_MAX); + reg_expected(hdr, "chr3:", 0, "", 4, 0, INT64_32_MAX); reg_expected(hdr, "chr3:1000-1500", 0, "", 4, 999, 1500); reg_expected(hdr, "chr3:1,000-1,500", 0, "", 4, 999, 1500); reg_expected(hdr, "chr3:1k-1.5K", 0, "", 4, 999, 1500); @@ -114,11 +118,11 @@ int reg_test(char *fn) { reg_expected(hdr, "chr3:1e3-15e2", 0, "", 4, 999, 1500); // Check list mode - reg_expected(hdr, "chr1,chr3", HTS_PARSE_LIST, "chr3", 0, 0, INT64_MAX); + reg_expected(hdr, "chr1,chr3", HTS_PARSE_LIST, "chr3", 0, 0, INT64_32_MAX); fprintf(stderr, "Expected error: "); reg_expected(hdr, "chr1:100-200,chr3", HTS_PARSE_LIST, NULL, 0, 0, 0); // ambiguous - reg_expected(hdr, "{chr1,chr3}", HTS_PARSE_LIST, "", 5, 0, INT64_MAX); - reg_expected(hdr, "{chr1,chr3},chr1", HTS_PARSE_LIST, "chr1", 5, 0, INT64_MAX); + reg_expected(hdr, "{chr1,chr3}", HTS_PARSE_LIST, "", 5, 0, INT64_32_MAX); + reg_expected(hdr, "{chr1,chr3},chr1", HTS_PARSE_LIST, "chr1", 5, 0, INT64_32_MAX); // incorrect usage; first reg is valid (but not what user expects). reg_expected(hdr, "chr3:1,000-1,500", HTS_PARSE_LIST | HTS_PARSE_ONE_COORD, "000-1,500", 4, 0, 1); diff --git a/vcf.c b/vcf.c index 1ace26b26..c53e3a2db 100644 --- a/vcf.c +++ b/vcf.c @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include #include #include "htslib/vcf.h" @@ -1180,21 +1181,27 @@ void bcf_destroy(bcf1_t *v) static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) { - uint32_t x[8]; + union { + uint32_t i; + float f; + } x[8]; ssize_t ret; if ((ret = bgzf_read(fp, x, 32)) != 32) { if (ret == 0) return -1; return -2; } bcf_clear1(v); - if (x[0] < 24) return -2; - x[0] -= 24; // to exclude six 32-bit integers - if (ks_resize(&v->shared, x[0]) != 0) return -2; - if (ks_resize(&v->indiv, x[1]) != 0) return -2; - memcpy(v, x + 2, 16); - v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff; - v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff; - v->shared.l = x[0], v->indiv.l = x[1]; + if (x[0].i < 24) return -2; + x[0].i -= 24; // to exclude six 32-bit integers + if (ks_resize(&v->shared, x[0].i) != 0) return -2; + if (ks_resize(&v->indiv, x[1].i) != 0) return -2; + v->rid = x[2].i; + v->pos = x[3].i; + v->rlen = x[4].i; + v->qual = x[5].f; + v->n_allele = x[6].i>>16; v->n_info = x[6].i&0xffff; + v->n_fmt = x[7].i>>24; v->n_sample = x[7].i&0xffffff; + v->shared.l = x[0].i, v->indiv.l = x[1].i; // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4 if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0; @@ -1436,7 +1443,7 @@ int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) return bcf_subset_format(h,v); } -int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end) +int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int64_t *beg, int64_t *end) { bcf1_t *v = (bcf1_t *) vv; int ret; @@ -1684,7 +1691,7 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) } if ( bcf_hdr_nsamples(h)!=v->n_sample ) { - hts_log_error("Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d)", + hts_log_error("Broken VCF record, the number of columns at %s:%"PRId64" does not match the number of samples (%d vs %d)", bcf_seqname(h,v), v->pos+1, v->n_sample, 
bcf_hdr_nsamples(h)); return -1; } @@ -1704,12 +1711,18 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) bcf1_sync(v); // check if the BCF record was modified BGZF *fp = hfp->fp.bgzf; - uint32_t x[8]; - x[0] = v->shared.l + 24; // to include six 32-bit integers - x[1] = v->indiv.l; - memcpy(x + 2, v, 16); - x[6] = (uint32_t)v->n_allele<<16 | v->n_info; - x[7] = (uint32_t)v->n_fmt<<24 | v->n_sample; + union { + uint32_t i; + float f; + } x[8]; + x[0].i = v->shared.l + 24; // to include six 32-bit integers + x[1].i = v->indiv.l; + x[2].i = v->rid; + x[3].i = v->pos; + x[4].i = v->rlen; + x[5].f = v->qual; + x[6].i = (uint32_t)v->n_allele<<16 | v->n_info; + x[7].i = (uint32_t)v->n_fmt<<24 | v->n_sample; if ( bgzf_write(fp, x, 32) != 32 ) return -1; if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1; if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1; @@ -2132,7 +2145,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p char *end = s->s + s->l; if ( q>=end ) { - hts_log_error("FORMAT column with no sample columns starting at %s:%d", s->s, v->pos+1); + hts_log_error("FORMAT column with no sample columns starting at %s:%"PRId64"", s->s, v->pos+1); v->errcode |= BCF_ERR_NCOLS; return -1; } @@ -2148,7 +2161,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { if (j >= MAX_N_FMT) { v->errcode |= BCF_ERR_LIMITS; - hts_log_error("FORMAT column at %s:%d lists more identifiers than htslib can handle", + hts_log_error("FORMAT column at %s:%"PRId64" lists more identifiers than htslib can handle", bcf_seqname(h,v), v->pos+1); return -1; } @@ -2220,7 +2233,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p j++; if ( j>=v->n_fmt ) { - hts_log_error("Incorrect number of FORMAT fields at %s:%d", + hts_log_error("Incorrect number of FORMAT fields at %s:%"PRId64"", 
h->id[BCF_DT_CTG][v->rid].key, v->pos+1); v->errcode |= BCF_ERR_NCOLS; return -1; @@ -2327,7 +2340,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } else { char buffer[8]; - hts_log_error("Invalid character '%s' in '%s' FORMAT field at %s:%d", + hts_log_error("Invalid character '%s' in '%s' FORMAT field at %s:%"PRId64"", dump_char(buffer, *t), h->id[BCF_DT_ID][z->key].key, bcf_seqname(h,v), v->pos+1); v->errcode |= BCF_ERR_CHAR; return -1; @@ -2386,14 +2399,14 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p if ( v->n_sample!=bcf_hdr_nsamples(h) ) { - hts_log_error("Number of columns at %s:%d does not match the number of samples (%d vs %d)", + hts_log_error("Number of columns at %s:%"PRId64" does not match the number of samples (%d vs %d)", bcf_seqname(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h)); v->errcode |= BCF_ERR_NCOLS; return -1; } if ( v->indiv.l > 0xffffffff ) { - hts_log_error("The FORMAT at %s:%d is too long", bcf_seqname(h,v), v->pos+1); + hts_log_error("The FORMAT at %s:%"PRId64" is too long", bcf_seqname(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed diff --git a/vcfutils.c b/vcfutils.c index 008dbe6f5..3e96c286a 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -23,6 +23,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include +#include #include "htslib/vcfutils.h" #include "htslib/kbitset.h" @@ -64,12 +65,12 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) case BCF_BT_INT8: BRANCH_INT(int8_t); break; case BCF_BT_INT16: BRANCH_INT(int16_t); break; case BCF_BT_INT32: BRANCH_INT(int32_t); break; - default: hts_log_error("Unexpected type %d at %s:%d", ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; + default: hts_log_error("Unexpected type %d at %s:%"PRId64, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT if ( anid[BCF_DT_CTG][line->rid].key, line->pos+1); + hts_log_error("Incorrect AN/AC counts at %s:%"PRId64, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); } ac[0] = an - nac; @@ -98,7 +99,7 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ if ( p[ial]>>1 > line->n_allele ) \ { \ - hts_log_error("Incorrect allele (\"%d\") in %s at %s:%d", (p[ial]>>1)-1, header->samples[i], header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ + hts_log_error("Incorrect allele (\"%d\") in %s at %s:%"PRId64, (p[ial]>>1)-1, header->samples[i], header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ exit(1); \ } \ ac[(p[ial]>>1)-1]++; \ @@ -109,7 +110,7 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; - default: hts_log_error("Unexpected type %d at %s:%d", fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; + default: hts_log_error("Unexpected type %d at %s:%"PRId64, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT return 1; @@ -188,7 +189,7 @@ int bcf_trim_alleles(const 
bcf_hdr_t *header, bcf1_t *line) if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ if ( (p[ial]>>1)-1 >= line->n_allele ) { \ - hts_log_error("Allele index is out of bounds at %s:%d", header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ + hts_log_error("Allele index is out of bounds at %s:%"PRId64, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ ret = -1; \ goto clean; \ } \ @@ -200,7 +201,7 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break; - default: hts_log_error("Unexpected GT %d at %s:%d", + default: hts_log_error("Unexpected GT %d at %s:%"PRId64, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos + 1); goto clean; } @@ -265,7 +266,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int nR_new = line->n_allele-nrm; if ( nR_new<=0 ) // should not be able to remove reference allele { - hts_log_error("Cannot remove reference allele at %s:%d [%d]", + hts_log_error("Cannot remove reference allele at %s:%"PRId64" [%d]", bcf_seqname(header,line), line->pos+1, nR_new); goto err; } @@ -296,7 +297,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb mdat_bytes = mdat * size; if ( nret<0 ) { - hts_log_error("Could not access INFO/%s at %s:%d [%d]", + hts_log_error("Could not access INFO/%s at %s:%"PRId64" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -334,7 +335,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( j==1 && s == '.' 
) continue; // missing if ( j!=nexp ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=%c=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=%c=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, j); goto err; } @@ -365,7 +366,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( n==1 && s == '.' ) continue; // missing if ( n!=nG_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=G=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=G=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, n); goto err; } @@ -374,7 +375,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type); if ( nret<0 ) { - hts_log_error("Could not update INFO/%s at %s:%d [%d]", + hts_log_error("Could not update INFO/%s at %s:%"PRId64" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -406,7 +407,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nA_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=A=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=A=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nA_ori, nret); goto err; } @@ -418,7 +419,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nR_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d", 
+ hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nR_ori, nret); goto err; } @@ -450,7 +451,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nG_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, nret); goto err; } @@ -484,7 +485,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type); if ( nret<0 ) { - hts_log_error("Could not update INFO/%s at %s:%d [%d]", + hts_log_error("Could not update INFO/%s at %s:%"PRId64" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -510,7 +511,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int al = bcf_gt_allele(ptr[j]); if ( !( al=0 ) ) { - hts_log_error("Problem updating genotypes at %s:%d [ al=0 :: al=%d,nR_ori=%d,map[al]=%d ]", + hts_log_error("Problem updating genotypes at %s:%"PRId64" [ al=0 :: al=%d,nR_ori=%d,map[al]=%d ]", bcf_seqname(header,line), line->pos+1, al, nR_ori, map[al]); goto err; } @@ -521,7 +522,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/GT at %s:%d [%d]", + hts_log_error("Could not update FORMAT/GT at %s:%"PRId64" [%d]", bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -548,7 +549,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, 
const struct kb mdat_bytes = mdat * size; if ( nret<0 ) { - hts_log_error("Could not access FORMAT/%s at %s:%d [%d]", + hts_log_error("Could not access FORMAT/%s at %s:%"PRId64" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -589,7 +590,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( k_src==1 && s == '.' ) continue; // missing if ( k_src!=nexp ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=%c=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=%c=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, k_src); goto err; } @@ -614,7 +615,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( nexp==1 && s == '.' ) continue; // missing if ( nexp!=nG_ori && nexp!=nR_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(diploid) or %d(haploid), but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=G=%d(diploid) or %d(haploid), but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nR_ori, nexp); goto err; } @@ -659,7 +660,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb } if ( k_src!=nR_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(haploid), but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=G=%d(haploid), but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, k_src); goto err; } @@ -671,7 +672,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_format(header, line, 
bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/%s at %s:%d [%d]", + hts_log_error("Could not update FORMAT/%s at %s:%"PRId64" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -707,7 +708,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nA_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=A=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=A=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nA_ori, nori); goto err; } @@ -719,7 +720,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nR_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, nori); goto err; } @@ -755,7 +756,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nG_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=G=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nori); goto err; } @@ -808,7 +809,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/%s at %s:%d [%d]", + hts_log_error("Could not update FORMAT/%s at 
%s:%"PRId64" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; } From 9e2984a9925197e962ad89d49d972b9dce7154f1 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 1 Jun 2018 15:06:59 +0100 Subject: [PATCH 02/23] ABI/API placeholder for 64-bit positions in CRAM. This upgrades all the internal data types to 64-bit and adds I/O functions for encoding and decoding, but doesn't change the format itself. There is also code with #ifdef LARGE_POS **which should not be used** in production. This is there simply to act as a test for the 64-bit API in htslib iterators. The code is mainly copied from io_lib CRAM4 experimental branch: https://github.com/jkbonfield/io_lib/commit/1150b9c3669055723276236ffd1f720575ed36cc --- cram/cram_codecs.c | 120 ++++++++++++++++++++++++++++++++++++++++---- cram/cram_codecs.h | 3 +- cram/cram_decode.c | 86 +++++++++++++++++++++++++++---- cram/cram_encode.c | 80 ++++++++++++++++++++++++----- cram/cram_index.c | 11 ++-- cram/cram_io.c | 41 +++++++++++++-- cram/cram_io.h | 3 +- cram/cram_stats.c | 9 ++-- cram/cram_stats.h | 4 +- cram/cram_structs.h | 28 +++++------ 10 files changed, 323 insertions(+), 62 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index a991766c6..5c574ed03 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -331,6 +331,26 @@ int cram_external_decode_int(cram_slice *slice, cram_codec *c, return l > 0 ? 0 : -1; } +int cram_external_decode_long(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int64_t l; + char *cp; + cram_block *b; + + /* Find the external block */ + b = cram_get_block_by_id(slice, c->u.external.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + l = safe_ltf8_get(cp, (char *)b->data + b->uncomp_size, (int64_t *)out); + b->idx += l; + *out_size = 1; + + return l > 0 ? 
0 : -1; +} + int cram_external_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { @@ -392,8 +412,10 @@ cram_codec *cram_external_decode_init(char *data, int size, return NULL; c->codec = E_EXTERNAL; - if (option == E_INT || option == E_LONG) + if (option == E_INT) c->decode = cram_external_decode_int; + else if (option == E_LONG) + c->decode = cram_external_decode_long; else if (option == E_BYTE_ARRAY || option == E_BYTE) c->decode = cram_external_decode_char; else @@ -422,6 +444,14 @@ int cram_external_encode_int(cram_slice *slice, cram_codec *c, return itf8_put_blk(c->out, *i32) >= 0 ? 0 : -1; } +int cram_external_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + uint64_t *i64 = (uint64_t *)in; + + ltf8_put_blk(c->out, *i64); + return 0; +} + int cram_external_encode_char(cram_slice *slice, cram_codec *c, char *in, int in_size) { BLOCK_APPEND(c->out, in, in_size); @@ -472,8 +502,10 @@ cram_codec *cram_external_encode_init(cram_stats *st, return NULL; c->codec = E_EXTERNAL; c->free = cram_external_encode_free; - if (option == E_INT || option == E_LONG) + if (option == E_INT) c->encode = cram_external_encode_int; + else if (option == E_LONG) + c->encode = cram_external_encode_long; else if (option == E_BYTE_ARRAY || option == E_BYTE) c->encode = cram_external_encode_char; else @@ -962,6 +994,56 @@ int cram_huffman_decode_int(cram_slice *slice, cram_codec *c, return 0; } +int cram_huffman_decode_long0(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n; + const cram_huffman_code * const codes = c->u.huffman.codes; + + /* Special case of 0 length codes */ + for (i = 0, n = *out_size; i < n; i++) { + out_i[i] = codes[0].symbol; + } + return 0; +} + +int cram_huffman_decode_long(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n, ncodes = c->u.huffman.ncodes; 
+ const cram_huffman_code * const codes = c->u.huffman.codes; + + for (i = 0, n = *out_size; i < n; i++) { + int idx = 0; + int val = 0, len = 0, last_len = 0; + + // Now one bit at a time for remaining checks + for (;;) { + int dlen = codes[idx].len - last_len; + if (cram_not_enough_bits(in, dlen)) + return -1; + + //val <<= dlen; + //val |= get_bits_MSB(in, dlen); + //last_len = (len += dlen); + + last_len = (len += dlen); + for (; dlen; dlen--) GET_BIT_MSB(in, val); + + idx = val - codes[idx].p; + if (idx >= ncodes || idx < 0) + return -1; + + if (codes[idx].code == val && codes[idx].len == len) { + out_i[i] = codes[idx].symbol; + break; + } + } + } + + return 0; +} + /* * Initialises a huffman decoder from an encoding data stream. */ @@ -1011,8 +1093,16 @@ cram_codec *cram_huffman_decode_init(char *data, int size, } /* Read symbols and bit-lengths */ - for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { - l = safe_itf8_get(cp, data_end, &codes[i].symbol); + if (option == E_LONG) { + for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { + l = safe_ltf8_get(cp, data_end, &codes[i].symbol); + } + } else { + for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { + int32_t i32; + l = safe_itf8_get(cp, data_end, &i32); + codes[i].symbol = i32; + } } if (l < 1) @@ -1100,13 +1190,18 @@ cram_codec *cram_huffman_decode_init(char *data, int size, h->decode = cram_huffman_decode_char0; else h->decode = cram_huffman_decode_char; - } else if (option == E_BYTE_ARRAY_BLOCK) { - abort(); - } else { + } else if (option == E_LONG) { + if (h->u.huffman.codes[0].len == 0) + h->decode = cram_huffman_decode_long0; + else + h->decode = cram_huffman_decode_long; + } else if (option == E_INT) { if (h->u.huffman.codes[0].len == 0) h->decode = cram_huffman_decode_int0; else h->decode = cram_huffman_decode_int; + } else { + return NULL; } return (cram_codec *)h; @@ -1230,8 +1325,14 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, } tp += itf8_put(tp, 
c->u.e_huffman.nvals); - for (i = 0; i < c->u.e_huffman.nvals; i++) { - tp += itf8_put(tp, codes[i].symbol); + if (c->u.e_huffman.option == E_LONG) { + for (i = 0; i < c->u.e_huffman.nvals; i++) { + tp += ltf8_put(tp, codes[i].symbol); + } + } else { + for (i = 0; i < c->u.e_huffman.nvals; i++) { + tp += itf8_put(tp, codes[i].symbol); + } } tp += itf8_put(tp, c->u.e_huffman.nvals); @@ -1409,6 +1510,7 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, c->u.e_huffman.codes = codes; c->u.e_huffman.nvals = nvals; + c->u.e_huffman.option = option; c->free = cram_huffman_encode_free; if (option == E_BYTE || option == E_BYTE_ARRAY) { diff --git a/cram/cram_codecs.h b/cram/cram_codecs.h index 59ce1313f..02b9f8e85 100644 --- a/cram/cram_codecs.h +++ b/cram/cram_codecs.h @@ -49,7 +49,7 @@ struct cram_codec; * appears. */ typedef struct { - int32_t symbol; + int64_t symbol; int32_t p; // next code start value, minus index to codes[] int32_t code; int32_t len; @@ -65,6 +65,7 @@ typedef struct { cram_huffman_code *codes; int nvals; int val2code[MAX_HUFF+1]; // value to code lookup for small values + int option; } cram_huffman_encoder; typedef struct { diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 7dcc24470..5aefe761c 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "cram/cram.h" #include "cram/os.h" @@ -158,9 +159,25 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, endp = cp + b->uncomp_size; if (CRAM_MAJOR_VERS(fd->version) == 1) { + int32_t i32; cp += safe_itf8_get(cp, endp, &hdr->ref_seq_id); - cp += safe_itf8_get(cp, endp, &hdr->ref_seq_start); - cp += safe_itf8_get(cp, endp, &hdr->ref_seq_span); +/* + * LARGE_POS used in this code is purely a debugging mechanism for testing + * whether the htslib API can cope with 64-bit quantities. 
These are + * possible in SAM, but not *yet* in BAM or CRAM. + * + * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. + * + * At some point it is expected these ifdefs will become a version check + * instead. + */ +#ifdef LARGE_POS + cp += safe_ltf8_get(cp, endp, &hdr->ref_seq_start); + cp += safe_ltf8_get(cp, endp, &hdr->ref_seq_span); +#else + cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_start=i32; + cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_span=i32; +#endif cp += safe_itf8_get(cp, endp, &hdr->num_records); cp += safe_itf8_get(cp, endp, &hdr->num_landmarks); if (hdr->num_landmarks < 0 || @@ -409,7 +426,12 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } else if (key[0] == 'R' && key[1] == 'L') { ds_id = DS_RL; type = E_INT; } else if (key[0] == 'A' && key[1] == 'P') { - ds_id = DS_AP; type = E_INT; + ds_id = DS_AP; +#ifdef LARGE_POS + type = E_LONG, +#else + type = E_INT; +#endif } else if (key[0] == 'R' && key[1] == 'G') { ds_id = DS_RG; type = E_INT; } else if (key[0] == 'M' && key[1] == 'F') { @@ -417,9 +439,19 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } else if (key[0] == 'N' && key[1] == 'S') { ds_id = DS_NS; type = E_INT; } else if (key[0] == 'N' && key[1] == 'P') { - ds_id = DS_NP; type = E_INT; + ds_id = DS_NP; +#ifdef LARGE_POS + type = E_LONG, +#else + type = E_INT; +#endif } else if (key[0] == 'T' && key[1] == 'S') { - ds_id = DS_TS; type = E_INT; + ds_id = DS_TS; +#ifdef LARGE_POS + type = E_LONG, +#else + type = E_INT; +#endif } else if (key[0] == 'N' && key[1] == 'F') { ds_id = DS_NF; type = E_INT; } else if (key[0] == 'T' && key[1] == 'C') { @@ -978,8 +1010,16 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { if (b->content_type == MAPPED_SLICE) { cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_id); - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_start); - cp += safe_itf8_get((char *)cp, (char 
*)cp_end, &hdr->ref_seq_span); +#ifdef LARGE_POS + cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_start); + cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_span); +#else + int32_t i32; + cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); + hdr->ref_seq_start = i32; + cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); + hdr->ref_seq_span = i32; +#endif } cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_records); hdr->record_counter = 0; @@ -1090,7 +1130,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int has_MD, int has_NM) { int prev_pos = 0, f, r = 0, out_sz = 1; int seq_pos = 1; - int cig_len = 0, ref_pos = cr->apos; + int cig_len = 0; + int64_t ref_pos = cr->apos; int32_t fn, i32; enum cigar_op cig_op = BAM_CMATCH; uint32_t *cigar = s->cigar; @@ -2235,7 +2276,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if ((fd->required_fields & SAM_SEQ) && s->ref == NULL && s->hdr->ref_seq_id >= 0 && !c->comp_hdr->no_ref) { - hts_log_error("Unable to fetch reference #%d %d..%d", + hts_log_error("Unable to fetch reference #%d %"PRId64"..%"PRId64"\n", s->hdr->ref_seq_id, s->hdr->ref_seq_start, s->hdr->ref_seq_start + s->hdr->ref_seq_span-1); return -1; @@ -2429,9 +2470,17 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (ds & CRAM_AP) { if (!c->comp_hdr->codecs[DS_AP]) return -1; +#ifdef LARGE_POS r |= c->comp_hdr->codecs[DS_AP] ->decode(s, c->comp_hdr->codecs[DS_AP], blk, (char *)&cr->apos, &out_sz); +#else + int32_t i32; + r |= c->comp_hdr->codecs[DS_AP] + ->decode(s, c->comp_hdr->codecs[DS_AP], blk, + (char *)&i32, &out_sz); + cr->apos = i32; +#endif if (r) return r; if (c->comp_hdr->AP_delta) cr->apos += s->last_apos; @@ -2528,17 +2577,33 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (ds & CRAM_NP) { if (!c->comp_hdr->codecs[DS_NP]) return -1; +#ifdef LARGE_POS r |= c->comp_hdr->codecs[DS_NP] ->decode(s, 
c->comp_hdr->codecs[DS_NP], blk, (char *)&cr->mate_pos, &out_sz); +#else + int32_t i32; + r |= c->comp_hdr->codecs[DS_NP] + ->decode(s, c->comp_hdr->codecs[DS_NP], blk, + (char *)&i32, &out_sz); + cr->mate_pos = i32; +#endif if (r) return r; } if (ds & CRAM_TS) { if (!c->comp_hdr->codecs[DS_TS]) return -1; +#ifdef LARGE_POS r |= c->comp_hdr->codecs[DS_TS] ->decode(s, c->comp_hdr->codecs[DS_TS], blk, (char *)&cr->tlen, &out_sz); +#else + int32_t i32; + r |= c->comp_hdr->codecs[DS_TS] + ->decode(s, c->comp_hdr->codecs[DS_TS], blk, + (char *)&i32, &out_sz); + cr->tlen = i32; +#endif if (r) return r; } else { cr->tlen = INT_MIN; @@ -2609,7 +2674,8 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (!(bf & BAM_FUNMAP)) { if ((ds & CRAM_AP) && cr->apos <= 0) { - hts_log_error("Read has alignment position %d but no unmapped flag", + hts_log_error("Read has alignment position %"PRId64 + " but no unmapped flag", cr->apos); return -1; } diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 42d82f6b9..72d64b140 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "cram/cram.h" #include "cram/os.h" @@ -92,11 +93,27 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, * the total size (stored as a variable length string). */ +/* + * LARGE_POS used in this code is purely a debugging mechanism for testing + * whether the htslib API can cope with 64-bit quantities. These are + * possible in SAM, but not *yet* in BAM or CRAM. + * + * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. + * + * At some point it is expected these ifdefs will become a version check + * instead. 
+ */ + // Duplicated from container itself, and removed in 1.1 if (CRAM_MAJOR_VERS(fd->version) == 1) { r |= itf8_put_blk(cb, h->ref_seq_id); +#ifdef LARGE_POS + r |= ltf8_put_blk(cb, h->ref_seq_start); + r |= ltf8_put_blk(cb, h->ref_seq_span); +#else r |= itf8_put_blk(cb, h->ref_seq_start); r |= itf8_put_blk(cb, h->ref_seq_span); +#endif r |= itf8_put_blk(cb, h->num_records); r |= itf8_put_blk(cb, h->num_landmarks); for (i = 0; i < h->num_landmarks; i++) { @@ -574,7 +591,7 @@ static int cram_encode_slice_read(cram_fd *fd, cram_block_compression_hdr *h, cram_slice *s, cram_record *cr, - int *last_pos) { + int64_t *last_pos) { int r = 0; int32_t i32; unsigned char uc; @@ -595,12 +612,24 @@ static int cram_encode_slice_read(cram_fd *fd, r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1); if (c->pos_sorted) { +#ifdef LARGE_POS + int64_t i64; + i64 = cr->apos - *last_pos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); +#else i32 = cr->apos - *last_pos; r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); +#endif *last_pos = cr->apos; } else { +#ifdef LARGE_POS + int64_t i64; + i64 = cr->apos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); +#else i32 = cr->apos; r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); +#endif } r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1); @@ -612,11 +641,20 @@ static int cram_encode_slice_read(cram_fd *fd, r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS], (char *)&cr->mate_ref_id, 1); +#ifdef LARGE_POS r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], (char *)&cr->mate_pos, 1); r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], (char *)&cr->tlen, 1); +#else + i32 = cr->mate_pos; + r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], + (char *)&i32, 1); + i32 = cr->tlen; + r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], + (char *)&i32, 1); +#endif } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { r |= 
h->codecs[DS_NF]->encode(s, h->codecs[DS_NF], (char *)&cr->mate_line, 1); @@ -910,7 +948,8 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { */ static int cram_encode_slice(cram_fd *fd, cram_container *c, cram_block_compression_hdr *h, cram_slice *s) { - int rec, r = 0, last_pos; + int rec, r = 0; + int64_t last_pos; int embed_ref; enum cram_DS_ID id; @@ -1312,7 +1351,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { /* Turn bams into cram_records and gather basic stats */ for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) { cram_slice *s = c->slices[sn]; - int first_base = INT_MAX, last_base = INT_MIN; + int64_t first_base = INT64_MAX, last_base = INT64_MIN; int r1_start = r1; @@ -1488,8 +1527,13 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== AP ===\n"); if (c->pos_sorted) { h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), - c->stats[DS_AP], E_INT, NULL, - fd->version); + c->stats[DS_AP], +#ifdef LARGE_POS + E_LONG, +#else + E_INT, +#endif + NULL, fd->version); } else { int p[2] = {0, c->max_apos}; h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p, @@ -1523,14 +1567,24 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== TS ===\n"); h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]), - c->stats[DS_TS], E_INT, NULL, - fd->version); + c->stats[DS_TS], +#ifdef LARGE_POS + E_LONG, +#else + E_INT, +#endif + NULL, fd->version); if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err; //fprintf(stderr, "=== NP ===\n"); h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]), - c->stats[DS_NP], E_INT, NULL, - fd->version); + c->stats[DS_NP], +#ifdef LARGE_POS + E_LONG, +#else + E_INT, +#endif + NULL, fd->version); if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err; //fprintf(stderr, "=== NF ===\n"); @@ -2569,7 +2623,7 @@ static cram_container 
*cram_next_container(cram_fd *fd, bam_seq_t *b) { if (c->curr_slice == c->max_slice || (bam_ref(b) != c->curr_ref && !c->multi_seq)) { c->ref_seq_span = fd->last_base - c->ref_seq_start + 1; - hts_log_info("Flush container %d/%d..%d", + hts_log_info("Flush container %d/%"PRId64"..%"PRId64, c->ref_seq_id, c->ref_seq_start, c->ref_seq_start + c->ref_seq_span -1); @@ -2751,8 +2805,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, /* Copy and parse */ if (!(cr->flags & BAM_FUNMAP)) { uint32_t *cig_to, *cig_from; - int apos = cr->apos-1, spos = 0; - int MD_last = apos; // last position of edit in MD tag + int64_t apos = cr->apos-1, spos = 0; + int64_t MD_last = apos; // last position of edit in MD tag cr->cigar = s->ncigar; cr->ncigar = bam_cigar_len(b); @@ -3087,7 +3141,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, !(fd->tlen_zero && p->ref_id == -1)) goto detached; - if ((p->tlen && abs(p->tlen - -sign*(aright-aleft+1)) > fd->tlen_approx) || + if ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1)) > fd->tlen_approx) || (!p->tlen && !fd->tlen_zero)) goto detached; diff --git a/cram/cram_index.c b/cram/cram_index.c index 3b8cef833..222bbee6d 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -544,7 +544,8 @@ static int cram_index_build_multiref(cram_fd *fd, off_t cpos, int32_t landmark, int sz) { - int i, ref = -2, ref_start = 0, ref_end; + int i, ref = -2; + int64_t ref_start = 0, ref_end; char buf[1024]; if (fd->mode != 'w') { @@ -571,7 +572,7 @@ static int cram_index_build_multiref(cram_fd *fd, } if (ref != -2) { - sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", + sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", ref, ref_start, ref_end - ref_start + 1, (int64_t)cpos, landmark, sz); if (bgzf_write(fp, buf, strlen(buf)) < 0) @@ -584,7 +585,7 @@ static int cram_index_build_multiref(cram_fd *fd, } if (ref != -2) { - sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", + sprintf(buf, 
"%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", ref, ref_start, ref_end - ref_start + 1, (int64_t)cpos, landmark, sz); if (bgzf_write(fp, buf, strlen(buf)) < 0) @@ -616,7 +617,7 @@ int cram_index_slice(cram_fd *fd, if (s->hdr->ref_seq_id == -2) { ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz); } else { - sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", + sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", s->hdr->ref_seq_id, s->hdr->ref_seq_start, s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz); ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4; @@ -684,7 +685,7 @@ int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) { off_t cpos, hpos; BGZF *fp; kstring_t fn_idx_str = {0}; - int32_t last_ref = -9, last_start = -9; + int64_t last_ref = -9, last_start = -9; // Useful for cram_index_build_multiref cram_set_option(fd, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_POS | SAM_CIGAR); diff --git a/cram/cram_io.c b/cram/cram_io.c index ea10eced8..0c3c10c58 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -490,7 +490,7 @@ int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) { * * Returns the number of bytes written */ -int itf8_put_blk(cram_block *blk, int val) { +int itf8_put_blk(cram_block *blk, int32_t val) { char buf[5]; int sz; @@ -502,6 +502,18 @@ int itf8_put_blk(cram_block *blk, int val) { return -1; } +int ltf8_put_blk(cram_block *blk, int64_t val) { + char buf[9]; + int sz; + + sz = ltf8_put(buf, val); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + /* * Decodes a 32-bit little endian value from fd and stores in val. 
* @@ -2944,8 +2956,26 @@ cram_container *cram_read_container(cram_fd *fd) { crc = crc32(0L, (unsigned char *)&len, 4); } if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; - if ((s = itf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s; - if ((s = itf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s; +/* + * LARGE_POS used in this code is purely a debugging mechanism for testing + * whether the htslib API can cope with 64-bit quantities. These are + * possible in SAM, but not *yet* in BAM or CRAM. + * + * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. + * + * At some point it is expected these ifdefs will become a version check + * instead. + */ +#ifdef LARGE_POS + if ((s = ltf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s; + if ((s = ltf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s; +#else + int32_t i32; + if ((s = itf8_decode_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s; + c2.ref_seq_start = i32; + if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s; + c2.ref_seq_span = i32; +#endif if ((s = itf8_decode_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s; if (CRAM_MAJOR_VERS(fd->version) == 1) { @@ -3070,8 +3100,13 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) cp += itf8_put((char*)cp, 0); } else { cp += itf8_put((char*)cp, c->ref_seq_id); +#ifdef LARGE_POS + cp += ltf8_put((char*)cp, c->ref_seq_start); + cp += ltf8_put((char*)cp, c->ref_seq_span); +#else cp += itf8_put((char*)cp, c->ref_seq_start); cp += itf8_put((char*)cp, c->ref_seq_span); +#endif } cp += itf8_put((char*)cp, c->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) { diff --git a/cram/cram_io.h b/cram/cram_io.h index 43ac7dcd5..5cb2b9b8a 100644 --- a/cram/cram_io.h +++ b/cram/cram_io.h @@ -377,7 +377,8 @@ static inline int safe_ltf8_get(const char *cp, const char *endp, * 
@return * Returns the number of bytes written */ -int itf8_put_blk(cram_block *blk, int val); +int itf8_put_blk(cram_block *blk, int32_t val); +int ltf8_put_blk(cram_block *blk, int64_t val); /*! Pulls a literal 32-bit value from a block. * diff --git a/cram/cram_stats.c b/cram/cram_stats.c index 87adde009..1b107b687 100644 --- a/cram/cram_stats.c +++ b/cram/cram_stats.c @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "cram/cram.h" #include "cram/os.h" @@ -47,7 +48,7 @@ cram_stats *cram_stats_create(void) { return calloc(1, sizeof(cram_stats)); } -int cram_stats_add(cram_stats *st, int32_t val) { +int cram_stats_add(cram_stats *st, int64_t val) { st->nsamp++; //assert(val >= 0); @@ -75,7 +76,7 @@ int cram_stats_add(cram_stats *st, int32_t val) { return 0; } -void cram_stats_del(cram_stats *st, int32_t val) { +void cram_stats_del(cram_stats *st, int64_t val) { st->nsamp--; //assert(val >= 0); @@ -90,11 +91,11 @@ void cram_stats_del(cram_stats *st, int32_t val) { if (--kh_val(st->h, k) == 0) kh_del(m_i2i, st->h, k); } else { - hts_log_warning("Failed to remove val %d from cram_stats", val); + hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val); st->nsamp++; } } else { - hts_log_warning("Failed to remove val %d from cram_stats", val); + hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val); st->nsamp++; } } diff --git a/cram/cram_stats.h b/cram/cram_stats.h index d9b37a7c3..6a87fb1e8 100644 --- a/cram/cram_stats.h +++ b/cram/cram_stats.h @@ -36,8 +36,8 @@ extern "C" { #endif cram_stats *cram_stats_create(void); -int cram_stats_add(cram_stats *st, int32_t val); -void cram_stats_del(cram_stats *st, int32_t val); +int cram_stats_add(cram_stats *st, int64_t val); +void cram_stats_del(cram_stats *st, int64_t val); void cram_stats_dump(cram_stats *st); void cram_stats_free(cram_stats *st); diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 
a1b7d8e7b..5c5fe4628 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -60,7 +60,7 @@ extern "C" { #endif // Generic hash-map integer -> integer -KHASH_MAP_INIT_INT(m_i2i, int) +KHASH_MAP_INIT_INT64(m_i2i, int) // Generic hash-set integer -> (existance) KHASH_SET_INIT_INT(s_i2i) @@ -281,8 +281,8 @@ struct cram_map; /* Compression header block */ typedef struct cram_block_compression_hdr { int32_t ref_seq_id; - int32_t ref_seq_start; - int32_t ref_seq_span; + int64_t ref_seq_start; + int64_t ref_seq_span; int32_t num_records; int32_t num_landmarks; int32_t *landmark; @@ -337,8 +337,8 @@ KHASH_MAP_INIT_INT(m_tagmap, cram_tag_map*) typedef struct cram_block_slice_hdr { enum cram_content_type content_type; int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */ - int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */ - int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */ + int64_t ref_seq_start; /* if content_type == MAPPED_SLICE */ + int64_t ref_seq_span; /* if content_type == MAPPED_SLICE */ int32_t num_records; int64_t record_counter; int32_t num_blocks; @@ -362,8 +362,8 @@ struct ref_entry; typedef struct cram_container { int32_t length; int32_t ref_seq_id; - int32_t ref_seq_start; - int32_t ref_seq_span; + int64_t ref_seq_start; + int64_t ref_seq_span; int64_t record_counter; int64_t num_bases; int32_t num_records; @@ -385,10 +385,10 @@ typedef struct cram_container { int max_c_rec, curr_c_rec; // current and max recs per container int slice_rec; // rec no. for start of this slice int curr_ref; // current ref ID. 
-2 for no previous - int last_pos; // last record position + int64_t last_pos; // last record position struct cram_slice **slices, *slice; int pos_sorted; // boolean, 1=>position sorted data - int max_apos; // maximum position, used if pos_sorted==0 + int64_t max_apos; // maximum position, used if pos_sorted==0 int last_slice; // number of reads in last slice (0 for 1st) int multi_seq; // true if packing multi seqs per cont/slice int unsorted; // true is AP_delta is 0. @@ -422,14 +422,14 @@ typedef struct cram_record { int32_t flags; // BF int32_t cram_flags; // CF int32_t len; // RL - int32_t apos; // AP + int64_t apos; // AP int32_t rg; // RG int32_t name; // RN; idx to s->names_blk int32_t name_len; int32_t mate_line; // index to another cram_record int32_t mate_ref_id; - int32_t mate_pos; // NP - int32_t tlen; // TS + int64_t mate_pos; // NP + int64_t tlen; // TS // Auxiliary data int32_t ntags; // TC @@ -446,7 +446,7 @@ typedef struct cram_record { int32_t qual; // idx to s->qual_blk int32_t cigar; // idx to s->cigar int32_t ncigar; - int32_t aend; // alignment end + int64_t aend; // alignment end int32_t mqual; // MQ int32_t feature; // idx to s->feature @@ -543,7 +543,7 @@ typedef struct cram_slice { cram_block **block_by_id; /* State used during encoding/decoding */ - int last_apos, max_apos; + int64_t last_apos, max_apos; /* Array of decoded cram records */ cram_record *crecs; From 00b72b6e7daa44065a185cafdec6f7735d2f7389 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 1 Jun 2018 17:03:38 +0100 Subject: [PATCH 03/23] Expose hts_parse_reg64() as part of external API --- hts.c | 4 ++-- htslib/hts.h | 14 +++++++++++--- region.c | 2 -- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/hts.c b/hts.c index ecd46ee48..780119207 100644 --- a/hts.c +++ b/hts.c @@ -3068,7 +3068,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end // Next release we should mark this as deprecated? 
// Use hts_parse_region above instead. -const char *hts_parse_reg_(const char *s, int64_t *beg, int64_t *end) +const char *hts_parse_reg64(const char *s, int64_t *beg, int64_t *end) { char *hyphen; const char *colon = strrchr(s, ':'); @@ -3091,7 +3091,7 @@ const char *hts_parse_reg_(const char *s, int64_t *beg, int64_t *end) const char *hts_parse_reg(const char *s, int *beg, int *end) { int64_t beg64 = 0, end64 = 0; - const char *colon = hts_parse_reg_(s, &beg64, &end64); + const char *colon = hts_parse_reg64(s, &beg64, &end64); if (beg64 > INT_MAX) { hts_log_error("Position %"PRId64" too large", beg64); return NULL; diff --git a/htslib/hts.h b/htslib/hts.h index 08c357815..989435d2a 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -847,6 +847,9 @@ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx); */ long long hts_parse_decimal(const char *str, char **strend, int flags); +typedef int (*hts_name2id_f)(void*, const char*); +typedef const char *(*hts_id2name_f)(void*, int); + /// Parse a "CHR:START-END"-style region string /** @param str String to be parsed @param beg Set on return to the 0-based start of the region @@ -854,10 +857,15 @@ long long hts_parse_decimal(const char *str, char **strend, int flags); @return Pointer to the colon or '\0' after the reference sequence name, or NULL if @a str could not be parsed. */ +const char *hts_parse_reg64(const char *str, int64_t *beg, int64_t *end); -typedef int (*hts_name2id_f)(void*, const char*); -typedef const char *(*hts_id2name_f)(void*, int); - +/// Parse a "CHR:START-END"-style region string +/** @param str String to be parsed + @param beg Set on return to the 0-based start of the region + @param end Set on return to the 1-based end of the region + @return Pointer to the colon or '\0' after the reference sequence name, + or NULL if @a str could not be parsed. 
+*/ const char *hts_parse_reg(const char *str, int *beg, int *end); /// Parse a "CHR:START-END"-style region string diff --git a/region.c b/region.c index d6680b8a0..d9679f79f 100644 --- a/region.c +++ b/region.c @@ -37,8 +37,6 @@ typedef struct reglist KHASH_MAP_INIT_INT(reg, reglist_t) typedef kh_reg_t reghash_t; -const char *hts_parse_reg_(const char *s, int64_t *beg, int64_t *end); - static int compare_uint64 (const void * a, const void * b) { if (*(uint64_t *)a < *(uint64_t *)b) return -1; From a19593a0f7a6fb59cf78524be9aefce6c57452bc Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 4 Jun 2018 10:24:00 +0100 Subject: [PATCH 04/23] More CRAM LARGE_POS fixes for testing 64-bit positions. --- cram/cram_decode.c | 4 ++-- cram/cram_encode.c | 8 +++++++- cram/cram_io.c | 5 +++++ cram/cram_samtools.c | 8 ++++---- cram/cram_samtools.h | 8 ++++---- 5 files changed, 22 insertions(+), 11 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 5aefe761c..6140b585c 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -2036,8 +2036,8 @@ static int cram_decode_slice_xref(cram_slice *s, int required_fields) { */ if (cr->tlen == INT_MIN) { int id1 = rec, id2 = rec; - int aleft = cr->apos, aright = cr->aend; - int tlen; + int64_t aleft = cr->apos, aright = cr->aend; + int64_t tlen; int ref = cr->ref_id; // number of segments starting at the same point. 
diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 72d64b140..cb573bd7c 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -552,8 +552,13 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { } cp += itf8_put(cp, s->hdr->ref_seq_id); +#ifdef LARGE_POS + cp += ltf8_put(cp, s->hdr->ref_seq_start); + cp += ltf8_put(cp, s->hdr->ref_seq_span); +#else cp += itf8_put(cp, s->hdr->ref_seq_start); cp += itf8_put(cp, s->hdr->ref_seq_span); +#endif cp += itf8_put(cp, s->hdr->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) cp += itf8_put(cp, s->hdr->record_counter); @@ -3102,7 +3107,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (new == 0) { cram_record *p = &s->crecs[kh_val(s->pair[sec], k)]; - int aleft, aright, sign; + int64_t aleft, aright; + int sign; aleft = MIN(cr->apos, p->apos); aright = MAX(cr->aend, p->aend); diff --git a/cram/cram_io.c b/cram/cram_io.c index 0c3c10c58..3c47f865f 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -3164,8 +3164,13 @@ int cram_write_container(cram_fd *fd, cram_container *c) { cp += itf8_put((char*)cp, 0); } else { cp += itf8_put((char*)cp, c->ref_seq_id); +#ifdef LARGE_POS + cp += ltf8_put((char*)cp, c->ref_seq_start); + cp += ltf8_put((char*)cp, c->ref_seq_span); +#else cp += itf8_put((char*)cp, c->ref_seq_start); cp += itf8_put((char*)cp, c->ref_seq_span); +#endif } cp += itf8_put((char*)cp, c->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) { diff --git a/cram/cram_samtools.c b/cram/cram_samtools.c index aab68df31..1f33eaeba 100644 --- a/cram/cram_samtools.c +++ b/cram/cram_samtools.c @@ -45,13 +45,13 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len, const char *qname, size_t qname_len, int flag, int rname, // Ref ID - int pos, - int end, // aligned start/end coords + int64_t pos, + int64_t end, // aligned start/end coords int mapq, uint32_t ncigar, const uint32_t *cigar, int mrnm, // Mate Ref ID - int mpos, - int isize, + int64_t mpos, + int64_t isize, int 
len, const char *seq, const char *qual) { diff --git a/cram/cram_samtools.h b/cram/cram_samtools.h index 4bbc39b04..4bed1465d 100644 --- a/cram/cram_samtools.h +++ b/cram/cram_samtools.h @@ -80,13 +80,13 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len, const char *qname, size_t qname_len, int flag, int rname, // Ref ID - int pos, - int end, // aligned start/end coords + int64_t pos, + int64_t end, // aligned start/end coords int mapq, uint32_t ncigar, const uint32_t *cigar, int mrnm, // Mate Ref ID - int mpos, - int isize, + int64_t mpos, + int64_t isize, int len, const char *seq, const char *qual); From 24436de366b44f4d69119f9a1f46f538d9368e65 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 4 Jun 2018 17:15:31 +0100 Subject: [PATCH 05/23] More cram codec improvements for 64-bit quantities. Added BETA and HUFFMAN support. (Beta can occasionally be used, although Huffman is mainly for completeness now.) Also fixed the decoder2encoder logic (used by cram_transcode_rg). --- cram/cram_codecs.c | 88 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 2 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 5c574ed03..276ad3c84 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -521,6 +521,24 @@ cram_codec *cram_external_encode_init(cram_stats *st, * --------------------------------------------------------------------------- * BETA */ +int cram_beta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n = *out_size; + + if (c->u.beta.nbits) { + if (cram_not_enough_bits(in, c->u.beta.nbits * n)) + return -1; + + for (i = 0; i < n; i++) + out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; + } else { + for (i = 0; i < n; i++) + out_i[i] = -c->u.beta.offset; + } + + return 0; +} + int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { int32_t *out_i = (int32_t
*)out; int i, n = *out_size; @@ -577,8 +595,10 @@ cram_codec *cram_beta_decode_init(char *data, int size, return NULL; c->codec = E_BETA; - if (option == E_INT || option == E_LONG) + if (option == E_INT) c->decode = cram_beta_decode_int; + else if (option == E_LONG) + c->decode = cram_beta_decode_long; else if (option == E_BYTE_ARRAY || option == E_BYTE) c->decode = cram_beta_decode_char; else { @@ -626,6 +646,18 @@ int cram_beta_encode_store(cram_codec *c, cram_block *b, return -1; } +int cram_beta_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *syms = (int64_t *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + c->u.e_beta.nbits); + + return r; +} + int cram_beta_encode_int(cram_slice *slice, cram_codec *c, char *in, int in_size) { int *syms = (int *)in; @@ -669,6 +701,8 @@ cram_codec *cram_beta_encode_init(cram_stats *st, c->free = cram_beta_encode_free; if (option == E_INT) c->encode = cram_beta_encode_int; + else if (option == E_LONG) + c->encode = cram_beta_encode_long; else c->encode = cram_beta_encode_char; c->store = cram_beta_encode_store; @@ -1286,6 +1320,43 @@ int cram_huffman_encode_int(cram_slice *slice, cram_codec *c, return r; } +int cram_huffman_encode_long0(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + return 0; +} + +int cram_huffman_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int i, code, len, r = 0; + int64_t *syms = (int64_t *)in; + + while (in_size--) { + int sym = *syms++; + + if (sym >= -1 && sym < MAX_HUFF) { + i = c->u.e_huffman.val2code[sym+1]; + assert(c->u.e_huffman.codes[i].symbol == sym); + code = c->u.e_huffman.codes[i].code; + len = c->u.e_huffman.codes[i].len; + } else { + /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? 
*/ + for (i = 0; i < c->u.e_huffman.nvals; i++) { + if (c->u.e_huffman.codes[i].symbol == sym) + break; + } + if (i == c->u.e_huffman.nvals) + return -1; + + code = c->u.e_huffman.codes[i].code; + len = c->u.e_huffman.codes[i].len; + } + + r |= store_bits_MSB(c->out, code, len); + } + + return r; +} + void cram_huffman_encode_free(cram_codec *c) { if (!c) return; @@ -1518,11 +1589,16 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, c->encode = cram_huffman_encode_char0; else c->encode = cram_huffman_encode_char; - } else { + } else if (option == E_INT) { if (c->u.e_huffman.codes[0].len == 0) c->encode = cram_huffman_encode_int0; else c->encode = cram_huffman_encode_int; + } else if (option == E_LONG) { + if (c->u.e_huffman.codes[0].len == 0) + c->encode = cram_huffman_encode_long0; + else + c->encode = cram_huffman_encode_long; } c->store = cram_huffman_encode_store; @@ -2082,6 +2158,8 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { c->store = cram_external_encode_store; if (c->decode == cram_external_decode_int) c->encode = cram_external_encode_int; + if (c->decode == cram_external_decode_long) + c->encode = cram_external_encode_long; else if (c->decode == cram_external_decode_char) c->encode = cram_external_encode_char; else @@ -2112,6 +2190,10 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { t->encode = cram_huffman_encode_int0; else if (c->decode == cram_huffman_decode_int) t->encode = cram_huffman_encode_int; + else if (c->decode == cram_huffman_decode_long0) + t->encode = cram_huffman_encode_long0; + else if (c->decode == cram_huffman_decode_long) + t->encode = cram_huffman_encode_long; else { free(t); return -1; @@ -2127,6 +2209,8 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { c->store = cram_beta_encode_store; if (c->decode == cram_beta_decode_int) c->encode = cram_beta_encode_int; + else if (c->decode == cram_beta_decode_long) + c->encode = cram_beta_encode_long; else if (c->decode == 
cram_beta_decode_char) c->encode = cram_beta_encode_char; else From eb2334e6e895982ba280dedc1a3884547089fd43 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 4 Jun 2018 17:40:25 +0100 Subject: [PATCH 06/23] Further 64-bit position API/ABI changes. Fixed the pileup iterators to internally use 64-bit states. The API for this returns via int *pos, so to keep API consistency we now have new bam_plp64_next, bam_plp64_auto and bam_mplp64_auto functions. Similarly, fai handling gains fai_fetch64 and faidx_fetch_seq64 (plus the matching fai_fetchqual64 and faidx_fetch_qual64). Minor tweak to sam_cap_mapq and sam_prob_realn API. The ref_len parameter is passed by value so doesn't need a new API (promotion is enough), but code hasn't been curated yet. The implementation of these two functions needs more work to be 64-bit clean. --- faidx.c | 47 ++++++++++++++----- htslib/faidx.h | 28 ++++++++++++ htslib/sam.h | 7 ++- realn.c | 10 +++-- sam.c | 120 +++++++++++++++++++++++++++++++++++++++---------- 5 files changed, 171 insertions(+), 41 deletions(-) diff --git a/faidx.c b/faidx.c index 3c6ec5158..00ac74571 100644 --- a/faidx.c +++ b/faidx.c @@ -693,7 +693,7 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) { static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, - uint64_t offset, int64_t beg, int64_t end, int *len) { + uint64_t offset, int64_t beg, int64_t end, int64_t *len) { char *s; size_t l; int c = 0; @@ -739,7 +739,7 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, } static int fai_get_val(const faidx_t *fai, const char *str, - int *len, faidx1_t *val, int64_t *fbeg, int64_t *fend) { + int64_t *len, faidx1_t *val, int64_t *fbeg, int64_t *fend) { khiter_t iter; khash_t(s) *h; int id; @@ -770,7 +770,7 @@ static int fai_get_val(const faidx_t *fai, const char *str, } -char *fai_fetch(const faidx_t *fai, const char *str, int *len) +char *fai_fetch64(const faidx_t *fai, const char *str, int64_t *len) { faidx1_t val; int64_t beg, end; @@ -783,8 +783,15 @@ char *fai_fetch(const
faidx_t *fai, const char *str, int *len) return fai_retrieve(fai, &val, val.seq_offset, beg, end, len); } +char *fai_fetch(const faidx_t *fai, const char *str, int *len) +{ + int64_t len64; + char *ret = fai_fetch64(fai, str, &len64); + *len = len64; // trunc + return ret; +} -char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { +char *fai_fetchqual64(const faidx_t *fai, const char *str, int64_t *len) { faidx1_t val; int64_t beg, end; @@ -796,6 +803,12 @@ char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { return fai_retrieve(fai, &val, val.qual_offset, beg, end, len); } +char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { + int64_t len64; + char *ret = fai_fetchqual64(fai, str, &len64); + *len = len64; // trunc + return ret; +} int faidx_fetch_nseq(const faidx_t *fai) { @@ -819,8 +832,7 @@ int faidx_seq_len(const faidx_t *fai, const char *seq) return kh_val(fai->hash, k).len; } - -static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char *c_name, int *p_beg_i, int *p_end_i, int *len) { +static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char *c_name, int64_t *p_beg_i, int64_t *p_end_i, int64_t *len) { khiter_t iter; // Adjust position @@ -850,8 +862,7 @@ static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char * return 0; } - -char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) +char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, int64_t p_beg_i, int64_t p_end_i, int64_t *len) { faidx1_t val; @@ -861,11 +872,18 @@ char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p } // Now retrieve the sequence - return fai_retrieve(fai, &val, val.seq_offset, p_beg_i, (long) p_end_i + 1, len); + return fai_retrieve(fai, &val, val.seq_offset, p_beg_i, p_end_i + 1, len); } +char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) +{ + int64_t 
len64; + char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64); + *len = len64; // trunc + return ret; +} -char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) +char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, int64_t p_beg_i, int64_t p_end_i, int64_t *len) { faidx1_t val; @@ -875,9 +893,16 @@ char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int } // Now retrieve the sequence - return fai_retrieve(fai, &val, val.qual_offset, p_beg_i, (long) p_end_i + 1, len); + return fai_retrieve(fai, &val, val.qual_offset, p_beg_i, p_end_i + 1, len); } +char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) +{ + int64_t len64; + char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64); + *len = len64; // trunc + return ret; +} int faidx_has_seq(const faidx_t *fai, const char *seq) { diff --git a/htslib/faidx.h b/htslib/faidx.h index 48ae040e9..cb5eb2dea 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -170,6 +170,7 @@ are reference names, quote using curly braces. Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. */ char *fai_fetch(const faidx_t *fai, const char *reg, int *len); +char *fai_fetch64(const faidx_t *fai, const char *reg, int64_t *len); /// Fetch the quality string for a region for FASTQ files /** @param fai Pointer to the faidx_t struct @@ -183,6 +184,7 @@ destroyed by end users by calling `free()` on it. Region names can be quoted with curly braces, as for fai_fetch(). */ char *fai_fetchqual(const faidx_t *fai, const char *reg, int *len); +char *fai_fetchqual64(const faidx_t *fai, const char *reg, int64_t *len); /// Fetch the number of sequences /** @param fai Pointer to the faidx_t struct @@ -203,6 +205,19 @@ by end users by calling `free()` on it. 
*/ char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len); +/// Fetch the sequence in a region +/** @param fai Pointer to the faidx_t struct + @param c_name Region name + @param p_beg_i Beginning position number (zero-based) + @param p_end_i End position number (zero-based) + @param len Length of the region; -2 if c_name not present, -1 general error + @return Pointer to the sequence; null on failure + +The returned sequence is allocated by `malloc()` family and should be destroyed +by end users by calling `free()` on it. +*/ +char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, int64_t p_beg_i, int64_t p_end_i, int64_t *len); + /// Fetch the quality string in a region for FASTQ files /** @param fai Pointer to the faidx_t struct @param c_name Region name @@ -216,6 +231,19 @@ by end users by calling `free()` on it. */ char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len); +/// Fetch the quality string in a region for FASTQ files +/** @param fai Pointer to the faidx_t struct + @param c_name Region name + @param p_beg_i Beginning position number (zero-based) + @param p_end_i End position number (zero-based) + @param len Length of the region; -2 if c_name not present, -1 general error + @return Pointer to the sequence; null on failure + +The returned sequence is allocated by `malloc()` family and should be destroyed +by end users by calling `free()` on it. 
+*/ +char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, int64_t p_beg_i, int64_t p_end_i, int64_t *len); + /// Query if sequence is present /** @param fai Pointer to the faidx_t struct @param seq Sequence name diff --git a/htslib/sam.h b/htslib/sam.h index a4086ca1a..eeb96075e 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1501,6 +1501,8 @@ typedef struct __bam_mplp_t *bam_mplp_t; int bam_plp_push(bam_plp_t iter, const bam1_t *b); const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); + const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, int64_t *_pos, int *_n_plp); + const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, int64_t *_pos, int *_n_plp); void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); void bam_plp_reset(bam_plp_t iter); @@ -1549,6 +1551,7 @@ typedef struct __bam_mplp_t *bam_mplp_t; void bam_mplp_destroy(bam_mplp_t iter); void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); + int bam_mplp64_auto(bam_mplp_t iter, int *_tid, int64_t *_pos, int *n_plp, const bam_pileup1_t **plp); void bam_mplp_reset(bam_mplp_t iter); void bam_mplp_constructor(bam_mplp_t iter, int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)); @@ -1562,7 +1565,7 @@ typedef struct __bam_mplp_t *bam_mplp_t; * BAQ calculation and realignment * ***********************************/ -int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres); +int sam_cap_mapq(bam1_t *b, const char *ref, int64_t ref_len, int thres); /// Calculate BAQ scores /** @param b BAM record @@ -1604,7 +1607,7 @@ Depending on what previous processing happened, this may or may not be the correct thing to do. It would be wise to avoid this situation if possible. 
*/ -int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag); +int sam_prob_realn(bam1_t *b, const char *ref, int64_t ref_len, int flag); #ifdef __cplusplus } diff --git a/realn.c b/realn.c index bc21f8083..9579e285c 100644 --- a/realn.c +++ b/realn.c @@ -35,12 +35,13 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/hts.h" #include "htslib/sam.h" -int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres) +int sam_cap_mapq(bam1_t *b, const char *ref, int64_t ref_len, int thres) { uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b); uint32_t *cigar = bam_get_cigar(b); bam1_core_t *c = &b->core; - int i, x, y, mm, q, len, clip_l, clip_q; + int i, y, mm, q, len, clip_l, clip_q; + int64_t x; double t; if (thres < 0) thres = 40; // set the default mm = q = len = clip_l = clip_q = 0; @@ -101,9 +102,10 @@ static int realn_check_tag(const uint8_t *tg, enum htsLogLevel severity, return 0; } -int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag) +int sam_prob_realn(bam1_t *b, const char *ref, int64_t ref_len, int flag) { - int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4, fix_bq = 0; + int k, bw, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4, fix_bq = 0; + int64_t i, x; uint32_t *cigar = bam_get_cigar(b); bam1_core_t *c = &b->core; probaln_par_t conf = { 0.001, 0.1, 10 }; diff --git a/sam.c b/sam.c index 7948d1724..337b90d80 100644 --- a/sam.c +++ b/sam.c @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include // Suppress deprecation message for cigar_tab, which we initialise #include "htslib/hts_defs.h" @@ -3659,7 +3660,8 @@ char *bam_flag2str(int flag) *******************/ typedef struct { - int k, x, y, end; + int k, y; + int64_t x, end; } cstate_t; static cstate_t g_cstate_null = { -1, 0, 0, 0 }; @@ -3717,7 +3719,7 @@ static inline void mp_free(mempool_t *mp, lbnode_t *p) s->x: the reference coordinate of the start of s->k s->y: the query coordiante of the start of s->k */ -static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s) +static inline int resolve_cigar2(bam_pileup1_t *p, int64_t pos, cstate_t *s) { #define _cop(c) ((c)&BAM_CIGAR_MASK) #define _cln(c) ((c)>>BAM_CIGAR_SHIFT) @@ -3886,7 +3888,8 @@ typedef khash_t(olap_hash) olap_hash_t; struct __bam_plp_t { mempool_t *mp; lbnode_t *head, *tail; - int32_t tid, pos, max_tid, max_pos; + int32_t tid, max_tid; + int64_t pos, max_pos; int is_eof, max_plp, error, maxcnt; uint64_t id; bam_pileup1_t *plp; @@ -4164,7 +4167,7 @@ static void overlap_remove(bam_plp_t iter, const bam1_t *b) // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns // pointer to the piled records if next position is ready or NULL if there is not enough records in the // buffer yet (the current position is still the maximum position across all buffered reads). 
-const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, int64_t *_pos, int *_n_plp) { if (iter->error) { *_n_plp = -1; return NULL; } *_n_plp = 0; @@ -4216,6 +4219,22 @@ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_ return NULL; } +const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +{ + int64_t pos64 = 0; + const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp); + if (pos64 < INT_MAX) { + *_pos = pos64; + } else { + hts_log_error("Position %"PRId64" too large", pos64); + *_pos = INT_MAX; + iter->error = 1; + *_n_plp = -1; + return NULL; + } + return p; +} + int bam_plp_push(bam_plp_t iter, const bam1_t *b) { if (iter->error) return -1; @@ -4265,11 +4284,11 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b) return 0; } -const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, int64_t *_pos, int *_n_plp) { const bam_pileup1_t *plp; if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } - if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; else { // no pileup line can be obtained; read alignments *_n_plp = 0; if (iter->is_eof) return 0; @@ -4279,7 +4298,7 @@ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_ *_n_plp = -1; return 0; } - if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; // otherwise no pileup line can be returned; read the next alignment. 
} if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; } @@ -4287,11 +4306,27 @@ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_ *_n_plp = -1; return 0; } - if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; return 0; } } +const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +{ + int64_t pos64 = 0; + const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp); + if (pos64 < INT_MAX) { + *_pos = pos64; + } else { + hts_log_error("Position %"PRId64" too large", pos64); + *_pos = INT_MAX; + iter->error = 1; + *_n_plp = -1; + return NULL; + } + return p; +} + void bam_plp_reset(bam_plp_t iter) { overlap_remove(iter, NULL); @@ -4316,7 +4351,8 @@ void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) struct __bam_mplp_t { int n; - uint64_t min, *pos; + int32_t min_tid, *tid; + uint64_t min_pos, *pos; bam_plp_t *iter; int *n_plp; const bam_pileup1_t **plp; @@ -4328,14 +4364,17 @@ bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) bam_mplp_t iter; iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t)); iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t)); + iter->tid = (int32_t*)calloc(n, sizeof(int32_t)); iter->n_plp = (int*)calloc(n, sizeof(int)); iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*)); iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t)); iter->n = n; - iter->min = (uint64_t)-1; + iter->min_pos = (uint64_t)-1; + iter->min_tid = (uint32_t)-1; for (i = 0; i < n; ++i) { iter->iter[i] = bam_plp_init(func, data[i]); - iter->pos[i] = iter->min; + iter->pos[i] = iter->min_pos; + iter->tid[i] = iter->min_tid; } return iter; } @@ -4359,28 +4398,45 @@ void bam_mplp_destroy(bam_mplp_t iter) { int i; for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]); - free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp); + free(iter->iter); free(iter->pos); 
free(iter->tid); + free(iter->n_plp); free(iter->plp); free(iter); } -int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) +int bam_mplp64_auto(bam_mplp_t iter, int *_tid, int64_t *_pos, int *n_plp, const bam_pileup1_t **plp) { int i, ret = 0; - uint64_t new_min = (uint64_t)-1; + uint64_t new_min_pos = (uint64_t)-1; + uint32_t new_min_tid = (uint32_t)-1; for (i = 0; i < iter->n; ++i) { - if (iter->pos[i] == iter->min) { - int tid, pos; - iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); + if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { + int tid; + int64_t pos; + iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); if ( iter->iter[i]->error ) return -1; - iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0; + if (iter->plp[i]) { + iter->tid[i] = tid; + iter->pos[i] = pos; + } else { + iter->tid[i] = 0; + iter->pos[i] = 0; + } + } + if (iter->plp[i]) { + if (iter->tid[i] < new_min_tid) { + new_min_tid = iter->tid[i]; + new_min_pos = iter->pos[i]; + } else if (iter->pos[i] < new_min_pos) { + new_min_pos = iter->pos[i]; + } } - if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i]; } - iter->min = new_min; - if (new_min == (uint64_t)-1) return 0; - *_tid = new_min>>32; *_pos = (uint32_t)new_min; + iter->min_pos = new_min_pos; + iter->min_tid = new_min_tid; + if (new_min_pos == (uint64_t)-1) return 0; + *_tid = new_min_tid; *_pos = new_min_pos; for (i = 0; i < iter->n; ++i) { - if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line" + if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; ++ret; } else n_plp[i] = 0, plp[i] = 0; @@ -4388,13 +4444,29 @@ int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_p return ret; } +int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t 
**plp) +{ + int64_t pos64 = 0; + int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp); + if (pos64 < INT_MAX) { + *_pos = pos64; + } else { + hts_log_error("Position %"PRId64" too large", pos64); + *_pos = INT_MAX; + return -1; + } + return ret; +} + void bam_mplp_reset(bam_mplp_t iter) { int i; - iter->min = (uint64_t)-1; + iter->min_pos = (uint64_t)-1; + iter->min_tid = (uint32_t)-1; for (i = 0; i < iter->n; ++i) { bam_plp_reset(iter->iter[i]); iter->pos[i] = (uint64_t)-1; + iter->tid[i] = (uint32_t)-1; iter->n_plp[i] = 0; iter->plp[i] = NULL; } From 835e133b941a175246b18a1a3261f7d91611a1dc Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 5 Jun 2018 15:07:54 +0100 Subject: [PATCH 07/23] Change from int64_t to hts_pos_t typedef. Also fixed missing dependency in bcf_sr_sort.o. CRAM is still using int64_t internally as this is referring to the *potential* on-disk format (with -DLARGE_POS) and none of the changes there are externally visible in the public API anyway. Include stdio.h in hts_defs.h so the mingw __MINGW_PRINTF_FORMAT gets defined in all the places where it's needed. 
--- faidx.c | 22 ++++++------- hts.c | 59 ++++++++++++++++------------------ htslib/faidx.h | 10 +++--- htslib/hts.h | 41 +++++++++++++++++------- htslib/hts_defs.h | 2 ++ htslib/sam.h | 22 ++++++------- htslib/tbx.h | 2 +- htslib/vcf.h | 6 ++-- realn.c | 8 ++--- sam.c | 78 +++++++++++++++++++++++---------------------- synced_bcf_reader.c | 2 +- tbx.c | 2 +- test/sam.c | 9 ++++-- test/test-bcf-sr.c | 2 +- vcf.c | 16 +++++----- vcfutils.c | 52 +++++++++++++++--------------- 16 files changed, 178 insertions(+), 155 deletions(-) diff --git a/faidx.c b/faidx.c index 00ac74571..801129c24 100644 --- a/faidx.c +++ b/faidx.c @@ -693,7 +693,7 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) { static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, - uint64_t offset, int64_t beg, int64_t end, int64_t *len) { + uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) { char *s; size_t l; int c = 0; @@ -739,7 +739,7 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, } static int fai_get_val(const faidx_t *fai, const char *str, - int64_t *len, faidx1_t *val, int64_t *fbeg, int64_t *fend) { + hts_pos_t *len, faidx1_t *val, hts_pos_t *fbeg, hts_pos_t *fend) { khiter_t iter; khash_t(s) *h; int id; @@ -770,7 +770,7 @@ static int fai_get_val(const faidx_t *fai, const char *str, } -char *fai_fetch64(const faidx_t *fai, const char *str, int64_t *len) +char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len) { faidx1_t val; int64_t beg, end; @@ -785,13 +785,13 @@ char *fai_fetch64(const faidx_t *fai, const char *str, int64_t *len) char *fai_fetch(const faidx_t *fai, const char *str, int *len) { - int64_t len64; + hts_pos_t len64; char *ret = fai_fetch64(fai, str, &len64); *len = len64; // trunc return ret; } -char *fai_fetchqual64(const faidx_t *fai, const char *str, int64_t *len) { +char *fai_fetchqual64(const faidx_t *fai, const char *str, hts_pos_t *len) { faidx1_t val; int64_t beg, end; @@ -804,7 +804,7 @@ 
char *fai_fetchqual64(const faidx_t *fai, const char *str, int64_t *len) { } char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { - int64_t len64; + hts_pos_t len64; char *ret = fai_fetchqual64(fai, str, &len64); *len = len64; // trunc return ret; @@ -832,7 +832,7 @@ int faidx_seq_len(const faidx_t *fai, const char *seq) return kh_val(fai->hash, k).len; } -static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char *c_name, int64_t *p_beg_i, int64_t *p_end_i, int64_t *len) { +static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char *c_name, hts_pos_t *p_beg_i, hts_pos_t *p_end_i, hts_pos_t *len) { khiter_t iter; // Adjust position @@ -862,7 +862,7 @@ static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char * return 0; } -char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, int64_t p_beg_i, int64_t p_end_i, int64_t *len) +char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) { faidx1_t val; @@ -877,13 +877,13 @@ char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, int64_t p_beg_i, char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) { - int64_t len64; + hts_pos_t len64; char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64); *len = len64; // trunc return ret; } -char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, int64_t p_beg_i, int64_t p_end_i, int64_t *len) +char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) { faidx1_t val; @@ -898,7 +898,7 @@ char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, int64_t p_beg_i char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) { - int64_t len64; + hts_pos_t len64; char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64); *len = len64; // trunc return ret; 
diff --git a/hts.c b/hts.c index 780119207..e403d4f41 100644 --- a/hts.c +++ b/hts.c @@ -1518,7 +1518,7 @@ KHASH_MAP_INIT_INT(bin, bins_t) typedef khash_t(bin) bidx_t; typedef struct { - int64_t n, m; + hts_pos_t n, m; uint64_t *offset; } lidx_t; @@ -1533,7 +1533,7 @@ struct __hts_idx_t { int tbi_n, last_tbi_tid; struct { uint32_t last_bin, save_bin; - int64_t last_coor; + hts_pos_t last_coor; int last_tid, save_tid, finished; uint64_t last_off, save_off; uint64_t off_beg, off_end; @@ -1581,7 +1581,7 @@ static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end) static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift) { int i; - int64_t beg, end; + hts_pos_t beg, end; beg = _beg >> min_shift; end = (_end - 1) >> min_shift; if (l->m < end + 1) { @@ -1735,7 +1735,7 @@ int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) return ret; } -int hts_idx_push(hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t offset, int is_mapped) +int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped) { int bin; int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3); @@ -1776,7 +1776,7 @@ int hts_idx_push(hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t off idx->z.last_tid = tid; idx->z.last_bin = 0xffffffffu; } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order - hts_log_error("Unsorted positions on sequence #%d: %"PRId64" followed by %"PRId64, tid+1, idx->z.last_coor+1, beg+1); + hts_log_error("Unsorted positions on sequence #%d: %"PRIhts_pos" followed by %"PRIhts_pos, tid+1, idx->z.last_coor+1, beg+1); return -1; } else if (end < beg) { @@ -1831,14 +1831,14 @@ int hts_idx_push(hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t off } if (idx->fmt == HTS_FMT_CSI) { - hts_log_error("Region %"PRId64"..%"PRId64" cannot be stored in a csi index " + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" cannot 
be stored in a csi index " "with min_shift = %d, n_lvls = %d. Try using " "min_shift = 14, n_lvls >= %d", beg, end, idx->min_shift, idx->n_lvls, n_lvls); } else { - hts_log_error("Region %"PRId64"..%"PRId64" cannot be stored in a %s index. " + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" cannot be stored in a %s index. " "Try using a csi index with min_shift = 14, " "n_lvls >= %d", beg, end, idx_format_name(idx->fmt), @@ -2272,13 +2272,14 @@ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) *** Iterator *** ****************/ +// Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<= end) return 0; if (end >= 1LL<>s); e = t + (end>>s); n = e - b + 1; if (itr->bins.n + n > itr->bins.m) { @@ -2294,7 +2295,8 @@ static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shi static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls) { int l, t, s; - int b, e, i, j; + int i, j; + hts_pos_t b, e; hts_pair64_max_t *off; bidx_t *bidx; khint_t k; @@ -2400,7 +2402,7 @@ uint64_t hts_itr_off(const hts_idx_t* idx, int tid) { return off0; } -hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec) +hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) { int i, n_off, l, bin; hts_pair64_max_t *off; @@ -2532,7 +2534,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) bidx_t *bidx; uint64_t min_off, max_off, t_off = (uint64_t)-1; int tid; - int64_t beg, end; + hts_pos_t beg, end; hts_reglist_t *curr_reg; if (!idx || !iter || !iter->multi) @@ -2652,7 +2654,7 @@ int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; int tid, i, j, l, n_off = 0; - int64_t beg, end; + hts_pos_t beg, end; hts_reglist_t *curr_reg; hts_pair32_t *curr_intv; 
hts_pair64_max_t *off = NULL; @@ -2706,10 +2708,10 @@ int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) off[n_off].max = (uint64_t)tid<<32 | end; n_off++; } else { - hts_log_warning("Could not set offset end for region %d:%"PRId64"-%"PRId64". Skipping", tid, beg, end); + hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end); } } else { - hts_log_warning("No index entry for region %d:%"PRId64"-%"PRId64"", tid, beg, end); + hts_log_warning("No index entry for region %d:%"PRIhts_pos"-%"PRIhts_pos"", tid, beg, end); } } } else { @@ -2862,11 +2864,6 @@ static void *hts_memrchr(const void *s, int c, size_t n) { return NULL; } -// Almost INT64_MAX, but when cast into a 32-bit int it's -// also INT_MAX instead of -1. This avoids bugs with old code -// using the new data types. -#define INT64_32_MAX ((((int64_t)INT_MAX)<<32)|INT_MAX) - /* * A variant of hts_parse_reg which is reference-id aware. It uses * the iterator name2id callbacks to validate the region tokenisation works. @@ -2968,7 +2965,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end // No colon is simplest case; just check and return. if (colon == NULL) { - *beg = 0; *end = INT64_32_MAX; + *beg = 0; *end = HTS_POS_MAX; kputsn(s, s_len-quoted, &ks); // convert to nul terminated string if (!ks.s) { *tid = -2; @@ -2983,7 +2980,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end // Has a colon, but check whole name first. if (!quoted) { - *beg = 0; *end = INT64_32_MAX; + *beg = 0; *end = HTS_POS_MAX; kputsn(s, s_len, &ks); // convert to nul terminated string if (!ks.s) { *tid = -2; @@ -3034,7 +3031,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end if (*beg < 0) { if (isdigit(*hyphen) || *hyphen == '\0' || *hyphen == ',') { // interpret chr:-100 as chr:1-100 - *end = *beg==-1 ? INT64_32_MAX : -(*beg+1); + *end = *beg==-1 ? 
HTS_POS_MAX : -(*beg+1); *beg = 0; return s_end; } else if (*hyphen == '-') { @@ -3046,7 +3043,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end } if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) { - *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : INT64_32_MAX; + *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX; } else if (*hyphen == '-') { *end = hts_parse_decimal(hyphen+1, &hyphen, flags); if (*hyphen != '\0' && *hyphen != ',') { @@ -3059,7 +3056,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end } if (*end == 0) - *end = INT64_32_MAX; // interpret chr:100- as chr:100- + *end = HTS_POS_MAX; // interpret chr:100- as chr:100- if (*beg >= *end) return NULL; @@ -3068,19 +3065,19 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end // Next release we should mark this as deprecated? // Use hts_parse_region above instead. -const char *hts_parse_reg64(const char *s, int64_t *beg, int64_t *end) +const char *hts_parse_reg64(const char *s, hts_pos_t *beg, hts_pos_t *end) { char *hyphen; const char *colon = strrchr(s, ':'); if (colon == NULL) { - *beg = 0; *end = INT64_32_MAX; + *beg = 0; *end = HTS_POS_MAX; return s + strlen(s); } *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1; if (*beg < 0) *beg = 0; - if (*hyphen == '\0') *end = INT64_32_MAX; + if (*hyphen == '\0') *end = HTS_POS_MAX; else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP); else return NULL; @@ -3097,7 +3094,7 @@ const char *hts_parse_reg(const char *s, int *beg, int *end) return NULL; } if (end64 > INT_MAX) { - if (end64 == INT64_32_MAX) { + if (end64 == HTS_POS_MAX) { end64 = INT_MAX; } else { hts_log_error("Position %"PRId64" too large", end64); @@ -3112,7 +3109,7 @@ const char *hts_parse_reg(const char *s, int *beg, int *end) hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void 
*hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) { int tid; - int64_t beg, end; + hts_pos_t beg, end; if (strcmp(reg, ".") == 0) return itr_query(idx, HTS_IDX_START, 0, 0, readrec); @@ -3185,7 +3182,7 @@ hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int cou int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) { int ret, tid; - int64_t beg, end; + hts_pos_t beg, end; if (iter == NULL || iter->finished) return -1; if (iter->read_rest) { if (iter->curr_off) { // seek to the start @@ -3230,7 +3227,7 @@ int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r) { void *fp; int ret, tid, i, cr, ci; - int64_t beg, end; + hts_pos_t beg, end; hts_reglist_t *found_reg; if (iter == NULL || iter->finished) return -1; diff --git a/htslib/faidx.h b/htslib/faidx.h index cb5eb2dea..cf0e4e0b3 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -30,7 +30,7 @@ #define HTSLIB_FAIDX_H #include -#include "hts_defs.h" +#include "hts.h" #ifdef __cplusplus extern "C" { @@ -170,7 +170,7 @@ are reference names, quote using curly braces. Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. */ char *fai_fetch(const faidx_t *fai, const char *reg, int *len); -char *fai_fetch64(const faidx_t *fai, const char *reg, int64_t *len); +char *fai_fetch64(const faidx_t *fai, const char *reg, hts_pos_t *len); /// Fetch the quality string for a region for FASTQ files /** @param fai Pointer to the faidx_t struct @@ -184,7 +184,7 @@ destroyed by end users by calling `free()` on it. Region names can be quoted with curly braces, as for fai_fetch(). 
*/ char *fai_fetchqual(const faidx_t *fai, const char *reg, int *len); -char *fai_fetchqual64(const faidx_t *fai, const char *reg, int64_t *len); +char *fai_fetchqual64(const faidx_t *fai, const char *reg, hts_pos_t *len); /// Fetch the number of sequences /** @param fai Pointer to the faidx_t struct @@ -216,7 +216,7 @@ char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p The returned sequence is allocated by `malloc()` family and should be destroyed by end users by calling `free()` on it. */ -char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, int64_t p_beg_i, int64_t p_end_i, int64_t *len); +char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len); /// Fetch the quality string in a region for FASTQ files /** @param fai Pointer to the faidx_t struct @@ -242,7 +242,7 @@ char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int The returned sequence is allocated by `malloc()` family and should be destroyed by end users by calling `free()` on it. */ -char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, int64_t p_beg_i, int64_t p_end_i, int64_t *len); +char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len); /// Query if sequence is present /** @param fai Pointer to the faidx_t struct diff --git a/htslib/hts.h b/htslib/hts.h index 989435d2a..c72a426cc 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -30,8 +30,8 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#include -#include "hts_defs.h" #include "hts_log.h" #ifdef __cplusplus @@ -577,11 +577,30 @@ When REST or NONE is used, idx is also ignored and may be NULL. #define HTS_FMT_TBI 2 #define HTS_FMT_CRAI 3 +// Almost INT64_MAX, but when cast into a 32-bit int it's +// also INT_MAX instead of -1. This avoids bugs with old code +// using the new hts_pos_t data type. 
+#define HTS_POS_MAX ((((int64_t)INT_MAX)<<32)|INT_MAX) +#define HTS_POS_MIN INT64_MIN +#define PRIhts_pos PRId64 +typedef int64_t hts_pos_t; + +// For comparison with previous release: +// +// #define HTS_POS_MAX INT_MAX +// #define HTS_POS_MIN INT_MIN +// #define PRIhts_pos PRId32 +// typedef int32_t hts_pos_t; + typedef struct { //uint32_t beg, end; - uint64_t beg, end; // sorry for the bad naming: FIXME! + hts_pos_t beg, end; // sorry for the bad naming: FIXME! } hts_pair32_t; +typedef struct { + hts_pos_t beg, end; +} hts_pair_pos_t; + typedef struct { uint64_t u, v; } hts_pair64_t; @@ -596,20 +615,20 @@ typedef struct { hts_pair32_t *intervals; int tid; uint32_t count; - uint64_t min_beg, max_end; + hts_pos_t min_beg, max_end; } hts_reglist_t; -typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int64_t *beg, int64_t *end); +typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, hts_pos_t *beg, hts_pos_t *end); typedef int hts_seek_func(void *fp, int64_t offset, int where); typedef int64_t hts_tell_func(void *fp); typedef struct { uint32_t read_rest:1, finished:1, is_cram:1, nocoor:1, multi:1, dummy:27; int tid, n_off, i, n_reg; - int64_t beg, end; + hts_pos_t beg, end; hts_reglist_t *reg_list; int curr_tid, curr_reg, curr_intv; - int64_t curr_beg, curr_end; + hts_pos_t curr_beg, curr_end; uint64_t curr_off, nocoor_off; hts_pair64_max_t *off; hts_readrec_func *readrec; @@ -661,7 +680,7 @@ void hts_idx_destroy(hts_idx_t *idx); The @p is_mapped parameter is used to update the n_mapped / n_unmapped counts stored in the meta-data bin. 
*/ -int hts_idx_push(hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t offset, int is_mapped); +int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped); /// Finish building an index /** @param idx Index @@ -857,7 +876,7 @@ typedef const char *(*hts_id2name_f)(void*, int); @return Pointer to the colon or '\0' after the reference sequence name, or NULL if @a str could not be parsed. */ -const char *hts_parse_reg64(const char *str, int64_t *beg, int64_t *end); +const char *hts_parse_reg64(const char *str, hts_pos_t *beg, hts_pos_t *end); /// Parse a "CHR:START-END"-style region string /** @param str String to be parsed @@ -951,14 +970,14 @@ const char *hts_parse_region(const char *str, int *tid, int64_t *beg, int64_t *e @param readrec Callback to read a record from the input file @return An iterator on success; NULL on failure */ -hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec); +hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec); /// Free an iterator /** @param iter Iterator to free */ void hts_itr_destroy(hts_itr_t *iter); -typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec); +typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec); /// Create a single-region iterator from a text region specification /** @param idx Index @@ -1159,7 +1178,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu void hts_md5_destroy(hts_md5_context *ctx); -static inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) +static inline int hts_reg2bin(hts_pos_t beg, hts_pos_t end, int min_shift, int n_lvls) { int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7; for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 
1<<((l<<1)+l)) diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h index 3bf4a4630..ec02b081a 100644 --- a/htslib/hts_defs.h +++ b/htslib/hts_defs.h @@ -25,6 +25,8 @@ DEALINGS IN THE SOFTWARE. */ #ifndef HTSLIB_HTS_DEFS_H #define HTSLIB_HTS_DEFS_H +#include // For __MINGW_PRINTF_FORMAT macro + #ifdef __clang__ #ifdef __has_attribute #define HTS_COMPILER_HAS(attribute) __has_attribute(attribute) diff --git a/htslib/sam.h b/htslib/sam.h index eeb96075e..5b73d6644 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -201,7 +201,7 @@ extern const int8_t bam_cigar_table[256]; */ typedef struct { int32_t tid; - int64_t pos; + hts_pos_t pos; uint16_t bin; // NB: invalid on 64-bit pos uint8_t qual; uint8_t l_extranul; @@ -210,8 +210,8 @@ typedef struct { uint32_t n_cigar; int32_t l_qseq; int32_t mtid; - int64_t mpos; - int64_t isize; + hts_pos_t mpos; + hts_pos_t isize; } bam1_core_t; /*! @typedef @@ -962,7 +962,7 @@ int bam_cigar2qlen(int n_cigar, const uint32_t *cigar); operations in @p cigar (these are the operations that "consume" reference bases). All other operations (including invalid ones) are ignored. */ -int64_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar); +hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar); /*! @abstract Calculate the rightmost base position of an alignment on the @@ -975,7 +975,7 @@ int64_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar); For an unmapped read (either according to its flags or if it has no cigar string), we return b->core.pos + 1 by convention. */ -int64_t bam_endpos(const bam1_t *b); +hts_pos_t bam_endpos(const bam1_t *b); int bam_str2flag(const char *str); /** returns negative value on error */ char *bam_flag2str(int flag); /** The string must be freed by the user */ @@ -1100,7 +1100,7 @@ When using one of these values, @p beg and @p end are ignored. When using HTS_IDX_REST or HTS_IDX_NONE, NULL can be passed in to @p idx. 
*/ -hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int64_t beg, int64_t end); +hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end); /// Create a SAM/BAM/CRAM iterator /** @param idx Index @@ -1501,8 +1501,8 @@ typedef struct __bam_mplp_t *bam_mplp_t; int bam_plp_push(bam_plp_t iter, const bam1_t *b); const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); - const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, int64_t *_pos, int *_n_plp); - const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, int64_t *_pos, int *_n_plp); + const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp); + const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp); void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); void bam_plp_reset(bam_plp_t iter); @@ -1551,7 +1551,7 @@ typedef struct __bam_mplp_t *bam_mplp_t; void bam_mplp_destroy(bam_mplp_t iter); void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); - int bam_mplp64_auto(bam_mplp_t iter, int *_tid, int64_t *_pos, int *n_plp, const bam_pileup1_t **plp); + int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp); void bam_mplp_reset(bam_mplp_t iter); void bam_mplp_constructor(bam_mplp_t iter, int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)); @@ -1565,7 +1565,7 @@ typedef struct __bam_mplp_t *bam_mplp_t; * BAQ calculation and realignment * ***********************************/ -int sam_cap_mapq(bam1_t *b, const char *ref, int64_t ref_len, int thres); +int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres); /// Calculate BAQ scores /** @param b BAM record @@ -1607,7 +1607,7 @@ Depending on what previous processing 
happened, this may or may not be the correct thing to do. It would be wise to avoid this situation if possible. */ -int sam_prob_realn(bam1_t *b, const char *ref, int64_t ref_len, int flag); +int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag); #ifdef __cplusplus } diff --git a/htslib/tbx.h b/htslib/tbx.h index 52f103b11..1180d575b 100644 --- a/htslib/tbx.h +++ b/htslib/tbx.h @@ -64,7 +64,7 @@ extern const tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sa /* Internal helper function used by tbx_itr_next() */ BGZF *hts_get_bgzfp(htsFile *fp); - int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int64_t *beg, int64_t *end); + int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_pos_t *end); tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf); /* diff --git a/htslib/vcf.h b/htslib/vcf.h index 7116a1229..742b341d8 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -209,8 +209,8 @@ typedef struct { */ typedef struct { int32_t rid; // CHROM - int64_t pos; // POS - int64_t rlen; // length of REF + hts_pos_t pos; // POS + hts_pos_t rlen; // length of REF float qual; // QUAL uint32_t n_info:16, n_allele:16; uint32_t n_fmt:8, n_sample:24; @@ -427,7 +427,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) HTS_RESULT_USED; /** Helper function for the bcf_itr_next() macro; internal use, ignore it */ - int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, int64_t *beg, int64_t *end); + int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, hts_pos_t *beg, hts_pos_t *end); diff --git a/realn.c b/realn.c index 9579e285c..78da5df16 100644 --- a/realn.c +++ b/realn.c @@ -35,13 +35,13 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/hts.h" #include "htslib/sam.h" -int sam_cap_mapq(bam1_t *b, const char *ref, int64_t ref_len, int thres) +int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres) { uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b); uint32_t *cigar = bam_get_cigar(b); bam1_core_t *c = &b->core; int i, y, mm, q, len, clip_l, clip_q; - int64_t x; + hts_pos_t x; double t; if (thres < 0) thres = 40; // set the default mm = q = len = clip_l = clip_q = 0; @@ -102,10 +102,10 @@ static int realn_check_tag(const uint8_t *tg, enum htsLogLevel severity, return 0; } -int sam_prob_realn(bam1_t *b, const char *ref, int64_t ref_len, int flag) +int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) { int k, bw, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4, fix_bq = 0; - int64_t i, x; + hts_pos_t i, x; uint32_t *cigar = bam_get_cigar(b); bam1_core_t *c = &b->core; probaln_par_t conf = { 0.001, 0.1, 10 }; diff --git a/sam.c b/sam.c index 337b90d80..04effb959 100644 --- a/sam.c +++ b/sam.c @@ -456,17 +456,17 @@ int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) return l; } -int64_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar) +hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar) { int k; - int64_t l; + hts_pos_t l; for (k = l = 0; k < n_cigar; ++k) if (bam_cigar_type(bam_cigar_op(cigar[k]))&2) l += bam_cigar_oplen(cigar[k]); return l; } -int64_t bam_endpos(const bam1_t *b) +hts_pos_t bam_endpos(const bam1_t *b) { if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0) return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); @@ -836,7 +836,7 @@ int sam_idx_save(htsFile *fp) { return 0; } -static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int64_t *beg, int64_t *end) +static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) { htsFile *fp = (htsFile *)fpv; bam1_t *b = bv; @@ -851,7 +851,7 @@ static int 
sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int64_t *be } // This is used only with read_rest=1 iterators, so need not set tid/beg/end. -static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, int64_t *beg, int64_t *end) +static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) { htsFile *fp = (htsFile *)fpv; bam1_t *b = bv; @@ -860,7 +860,7 @@ static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, int64_ return ret; } -static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int64_t *beg, int64_t *end) +static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) { htsFile *fp = fpv; bam1_t *b = bv; @@ -983,7 +983,7 @@ hts_idx_t *sam_index_load(htsFile *fp, const char *fn) return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE); } -static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int64_t beg, int64_t end, hts_readrec_func *readrec) +static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t)); @@ -1040,7 +1040,7 @@ static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int64_t beg, int return iter; } -hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int64_t beg, int64_t end) +hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; if (idx == NULL) @@ -3661,14 +3661,14 @@ char *bam_flag2str(int flag) typedef struct { int k, y; - int64_t x, end; + hts_pos_t x, end; } cstate_t; static cstate_t g_cstate_null = { -1, 0, 0, 0 }; typedef struct __linkbuf_t { bam1_t b; - int64_t beg, end; + hts_pos_t beg, end; cstate_t s; struct __linkbuf_t *next; bam_pileup_cd cd; @@ -3719,7 +3719,7 @@ static inline void mp_free(mempool_t 
*mp, lbnode_t *p) s->x: the reference coordinate of the start of s->k s->y: the query coordiante of the start of s->k */ -static inline int resolve_cigar2(bam_pileup1_t *p, int64_t pos, cstate_t *s) +static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) { #define _cop(c) ((c)&BAM_CIGAR_MASK) #define _cln(c) ((c)>>BAM_CIGAR_SHIFT) @@ -3889,7 +3889,7 @@ struct __bam_plp_t { mempool_t *mp; lbnode_t *head, *tail; int32_t tid, max_tid; - int64_t pos, max_pos; + hts_pos_t pos, max_pos; int is_eof, max_plp, error, maxcnt; uint64_t id; bam_pileup1_t *plp; @@ -3967,7 +3967,7 @@ void bam_plp_destructor(bam_plp_t plp, * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered, * or -2 on error. */ -static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int64_t *iref) +static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, hts_pos_t *iref) { int pos = *iref; if ( pos < 0 ) return -1; @@ -4002,7 +4002,7 @@ static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *iseq = -1; return -1; } -static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int64_t *iref) +static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, hts_pos_t *iref) { while ( *cigar < cigar_max ) { @@ -4036,16 +4036,16 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b) uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b); - int64_t iref = b->core.pos; - int64_t a_iref = iref - a->core.pos; - int64_t b_iref = iref - b->core.pos; + hts_pos_t iref = b->core.pos; + hts_pos_t a_iref = iref - a->core.pos; + hts_pos_t b_iref = iref - b->core.pos; int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); if ( a_ret<0 ) return a_ret<-1 ? 
-1:0; // no overlap or error int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); if ( b_ret<0 ) return b_ret<-1 ? -1:0; // no overlap or error #if DBG - fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %"PRId64"-%"PRId64"\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, + fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %"PRIhts_pos"-%"PRIhts_pos"\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b))); #endif @@ -4167,7 +4167,7 @@ static void overlap_remove(bam_plp_t iter, const bam1_t *b) // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns // pointer to the piled records if next position is ready or NULL if there is not enough records in the // buffer yet (the current position is still the maximum position across all buffered reads). -const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, int64_t *_pos, int *_n_plp) +const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) { if (iter->error) { *_n_plp = -1; return NULL; } *_n_plp = 0; @@ -4221,7 +4221,7 @@ const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, int64_t *_pos, in const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) { - int64_t pos64 = 0; + hts_pos_t pos64 = 0; const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp); if (pos64 < INT_MAX) { *_pos = pos64; @@ -4284,7 +4284,7 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b) return 0; } -const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, int64_t *_pos, int *_n_plp) +const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) { const bam_pileup1_t *plp; if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } @@ -4313,7 +4313,7 @@ const bam_pileup1_t 
*bam_plp64_auto(bam_plp_t iter, int *_tid, int64_t *_pos, in const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) { - int64_t pos64 = 0; + hts_pos_t pos64 = 0; const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp); if (pos64 < INT_MAX) { *_pos = pos64; @@ -4352,7 +4352,7 @@ void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) struct __bam_mplp_t { int n; int32_t min_tid, *tid; - uint64_t min_pos, *pos; + hts_pos_t min_pos, *pos; bam_plp_t *iter; int *n_plp; const bam_pileup1_t **plp; @@ -4363,13 +4363,13 @@ bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) int i; bam_mplp_t iter; iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t)); - iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t)); + iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t)); iter->tid = (int32_t*)calloc(n, sizeof(int32_t)); iter->n_plp = (int*)calloc(n, sizeof(int)); iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*)); iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t)); iter->n = n; - iter->min_pos = (uint64_t)-1; + iter->min_pos = HTS_POS_MAX; iter->min_tid = (uint32_t)-1; for (i = 0; i < n; ++i) { iter->iter[i] = bam_plp_init(func, data[i]); @@ -4403,15 +4403,15 @@ void bam_mplp_destroy(bam_mplp_t iter) free(iter); } -int bam_mplp64_auto(bam_mplp_t iter, int *_tid, int64_t *_pos, int *n_plp, const bam_pileup1_t **plp) +int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp) { int i, ret = 0; - uint64_t new_min_pos = (uint64_t)-1; + hts_pos_t new_min_pos = HTS_POS_MAX; uint32_t new_min_tid = (uint32_t)-1; for (i = 0; i < iter->n; ++i) { if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { int tid; - int64_t pos; + hts_pos_t pos; iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); if ( iter->iter[i]->error ) return -1; if (iter->plp[i]) { @@ -4433,7 +4433,7 @@ int bam_mplp64_auto(bam_mplp_t iter, int *_tid, int64_t *_pos, int 
*n_plp, const } iter->min_pos = new_min_pos; iter->min_tid = new_min_tid; - if (new_min_pos == (uint64_t)-1) return 0; + if (new_min_pos == HTS_POS_MAX) return 0; *_tid = new_min_tid; *_pos = new_min_pos; for (i = 0; i < iter->n; ++i) { if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { @@ -4446,14 +4446,16 @@ int bam_mplp64_auto(bam_mplp_t iter, int *_tid, int64_t *_pos, int *n_plp, const int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) { - int64_t pos64 = 0; + hts_pos_t pos64 = 0; int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp); - if (pos64 < INT_MAX) { - *_pos = pos64; - } else { - hts_log_error("Position %"PRId64" too large", pos64); - *_pos = INT_MAX; - return -1; + if (ret >= 0) { + if (pos64 < INT_MAX) { + *_pos = pos64; + } else { + hts_log_error("Position %"PRId64" too large", pos64); + *_pos = INT_MAX; + return -1; + } } return ret; } @@ -4461,11 +4463,11 @@ int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_p void bam_mplp_reset(bam_mplp_t iter) { int i; - iter->min_pos = (uint64_t)-1; + iter->min_pos = HTS_POS_MAX; iter->min_tid = (uint32_t)-1; for (i = 0; i < iter->n; ++i) { bam_plp_reset(iter->iter[i]); - iter->pos[i] = (uint64_t)-1; + iter->pos[i] = HTS_POS_MAX; iter->tid[i] = (uint32_t)-1; iter->n_plp[i] = 0; iter->plp[i] = NULL; diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 425fae1ce..709917b01 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -384,7 +384,7 @@ void debug_buffer(FILE *fp, bcf_sr_t *reader) for (j=0; j<=reader->nbuffer; j++) { bcf1_t *line = reader->buffer[j]; - fprintf(fp,"\t%p\t%s%s\t%s:%"PRId64"\t%s ", (void*)line,reader->fname,j==0?"*":" ",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:""); + fprintf(fp,"\t%p\t%s%s\t%s:%"PRIhts_pos"\t%s ", (void*)line,reader->fname,j==0?"*":" 
",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:""); int k; for (k=1; kn_allele; k++) fprintf(fp," %s", line->d.allele[k]); fprintf(fp,"\n"); diff --git a/tbx.c b/tbx.c index d5e3c3be5..c7e047894 100644 --- a/tbx.c +++ b/tbx.c @@ -172,7 +172,7 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_ * -1 on EOF * <= -2 on error */ -int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int64_t *beg, int64_t *end) +int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_pos_t *end) { tbx_t *tbx = (tbx_t *) tbxv; kstring_t *s = (kstring_t *) sv; diff --git a/test/sam.c b/test/sam.c index bf429bf98..3c15459aa 100644 --- a/test/sam.c +++ b/test/sam.c @@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include // Suppress message for faidx_fetch_nseq(), which we're intentionally testing #include "htslib/hts_defs.h" @@ -1148,11 +1149,13 @@ static void samrecord_layout(void) size_t bam1_t_size, bam1_t_size2; - bam1_t_size = (56 + sizeof(int) + 4 + sizeof (char *) + sizeof(uint64_t) - + sizeof(uint32_t)); + assert(sizeof(hts_pos_t) == 8 || sizeof(hts_pos_t) == 4); + int core_size = sizeof(hts_pos_t) == 8 ? 
56 : 36; + bam1_t_size = (core_size + sizeof(int) + sizeof(char *) + sizeof(uint64_t) + + 2 * sizeof(uint32_t)); bam1_t_size2 = bam1_t_size + 4; // Account for padding on some platforms - if (sizeof (bam1_core_t) != 56) + if (sizeof (bam1_core_t) != core_size) fail("sizeof bam1_core_t is %zu, expected 56", sizeof (bam1_core_t)); if (sizeof (bam1_t) != bam1_t_size && sizeof (bam1_t) != bam1_t_size2) diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c index 23ee1d3e0..ee0aadedc 100644 --- a/test/test-bcf-sr.c +++ b/test/test-bcf-sr.c @@ -104,7 +104,7 @@ int main(int argc, char *argv[]) { if ( !bcf_sr_has_line(sr,i) ) continue; bcf1_t *rec = bcf_sr_get_line(sr, i); - printf("%s:%"PRId64, bcf_seqname(bcf_sr_get_header(sr,i),rec),rec->pos+1); + printf("%s:%"PRIhts_pos, bcf_seqname(bcf_sr_get_header(sr,i),rec),rec->pos+1); break; } diff --git a/vcf.c b/vcf.c index c53e3a2db..e2c94eabd 100644 --- a/vcf.c +++ b/vcf.c @@ -1443,7 +1443,7 @@ int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) return bcf_subset_format(h,v); } -int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int64_t *beg, int64_t *end) +int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end) { bcf1_t *v = (bcf1_t *) vv; int ret; @@ -1691,7 +1691,7 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) } if ( bcf_hdr_nsamples(h)!=v->n_sample ) { - hts_log_error("Broken VCF record, the number of columns at %s:%"PRId64" does not match the number of samples (%d vs %d)", + hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)", bcf_seqname(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h)); return -1; } @@ -2145,7 +2145,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p char *end = s->s + s->l; if ( q>=end ) { - hts_log_error("FORMAT column with no sample columns starting at %s:%"PRId64"", s->s, v->pos+1); + hts_log_error("FORMAT column with no sample columns 
starting at %s:%"PRIhts_pos"", s->s, v->pos+1); v->errcode |= BCF_ERR_NCOLS; return -1; } @@ -2161,7 +2161,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { if (j >= MAX_N_FMT) { v->errcode |= BCF_ERR_LIMITS; - hts_log_error("FORMAT column at %s:%"PRId64" lists more identifiers than htslib can handle", + hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle", bcf_seqname(h,v), v->pos+1); return -1; } @@ -2233,7 +2233,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p j++; if ( j>=v->n_fmt ) { - hts_log_error("Incorrect number of FORMAT fields at %s:%"PRId64"", + hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"", h->id[BCF_DT_CTG][v->rid].key, v->pos+1); v->errcode |= BCF_ERR_NCOLS; return -1; @@ -2340,7 +2340,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } else { char buffer[8]; - hts_log_error("Invalid character '%s' in '%s' FORMAT field at %s:%"PRId64"", + hts_log_error("Invalid character '%s' in '%s' FORMAT field at %s:%"PRIhts_pos"", dump_char(buffer, *t), h->id[BCF_DT_ID][z->key].key, bcf_seqname(h,v), v->pos+1); v->errcode |= BCF_ERR_CHAR; return -1; @@ -2399,14 +2399,14 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p if ( v->n_sample!=bcf_hdr_nsamples(h) ) { - hts_log_error("Number of columns at %s:%"PRId64" does not match the number of samples (%d vs %d)", + hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)", bcf_seqname(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h)); v->errcode |= BCF_ERR_NCOLS; return -1; } if ( v->indiv.l > 0xffffffff ) { - hts_log_error("The FORMAT at %s:%"PRId64" is too long", bcf_seqname(h,v), v->pos+1); + hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname(h,v), v->pos+1); 
v->errcode |= BCF_ERR_LIMITS; // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed diff --git a/vcfutils.c b/vcfutils.c index 3e96c286a..b2a477b4a 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -65,12 +65,12 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) case BCF_BT_INT8: BRANCH_INT(int8_t); break; case BCF_BT_INT16: BRANCH_INT(int16_t); break; case BCF_BT_INT32: BRANCH_INT(int32_t); break; - default: hts_log_error("Unexpected type %d at %s:%"PRId64, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; + default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT if ( anid[BCF_DT_CTG][line->rid].key, line->pos+1); + hts_log_error("Incorrect AN/AC counts at %s:%"PRIhts_pos, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); } ac[0] = an - nac; @@ -99,7 +99,7 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ if ( p[ial]>>1 > line->n_allele ) \ { \ - hts_log_error("Incorrect allele (\"%d\") in %s at %s:%"PRId64, (p[ial]>>1)-1, header->samples[i], header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ + hts_log_error("Incorrect allele (\"%d\") in %s at %s:%"PRIhts_pos, (p[ial]>>1)-1, header->samples[i], header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ exit(1); \ } \ ac[(p[ial]>>1)-1]++; \ @@ -110,7 +110,7 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; - default: hts_log_error("Unexpected type %d at %s:%"PRId64, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; + default: 
hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT return 1; @@ -189,7 +189,7 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ if ( (p[ial]>>1)-1 >= line->n_allele ) { \ - hts_log_error("Allele index is out of bounds at %s:%"PRId64, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ + hts_log_error("Allele index is out of bounds at %s:%"PRIhts_pos, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ ret = -1; \ goto clean; \ } \ @@ -201,7 +201,7 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break; - default: hts_log_error("Unexpected GT %d at %s:%"PRId64, + default: hts_log_error("Unexpected GT %d at %s:%"PRIhts_pos, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos + 1); goto clean; } @@ -266,7 +266,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int nR_new = line->n_allele-nrm; if ( nR_new<=0 ) // should not be able to remove reference allele { - hts_log_error("Cannot remove reference allele at %s:%"PRId64" [%d]", + hts_log_error("Cannot remove reference allele at %s:%"PRIhts_pos" [%d]", bcf_seqname(header,line), line->pos+1, nR_new); goto err; } @@ -297,7 +297,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb mdat_bytes = mdat * size; if ( nret<0 ) { - hts_log_error("Could not access INFO/%s at %s:%"PRId64" [%d]", + hts_log_error("Could not access INFO/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -335,7 +335,7 @@ int bcf_remove_allele_set(const 
bcf_hdr_t *header, bcf1_t *line, const struct kb if ( j==1 && s == '.' ) continue; // missing if ( j!=nexp ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=%c=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=%c=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, j); goto err; } @@ -366,7 +366,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( n==1 && s == '.' ) continue; // missing if ( n!=nG_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=G=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=G=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, n); goto err; } @@ -375,7 +375,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type); if ( nret<0 ) { - hts_log_error("Could not update INFO/%s at %s:%"PRId64" [%d]", + hts_log_error("Could not update INFO/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -407,7 +407,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nA_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=A=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=A=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nA_ori, nret); goto err; } @@ -419,7 +419,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( 
nret!=nR_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nR_ori, nret); goto err; } @@ -451,7 +451,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nG_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRId64"; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, nret); goto err; } @@ -485,7 +485,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type); if ( nret<0 ) { - hts_log_error("Could not update INFO/%s at %s:%"PRId64" [%d]", + hts_log_error("Could not update INFO/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -511,7 +511,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int al = bcf_gt_allele(ptr[j]); if ( !( al=0 ) ) { - hts_log_error("Problem updating genotypes at %s:%"PRId64" [ al=0 :: al=%d,nR_ori=%d,map[al]=%d ]", + hts_log_error("Problem updating genotypes at %s:%"PRIhts_pos" [ al=0 :: al=%d,nR_ori=%d,map[al]=%d ]", bcf_seqname(header,line), line->pos+1, al, nR_ori, map[al]); goto err; } @@ -522,7 +522,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/GT at %s:%"PRId64" [%d]", + hts_log_error("Could not update 
FORMAT/GT at %s:%"PRIhts_pos" [%d]", bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -549,7 +549,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb mdat_bytes = mdat * size; if ( nret<0 ) { - hts_log_error("Could not access FORMAT/%s at %s:%"PRId64" [%d]", + hts_log_error("Could not access FORMAT/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -590,7 +590,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( k_src==1 && s == '.' ) continue; // missing if ( k_src!=nexp ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=%c=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=%c=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, k_src); goto err; } @@ -615,7 +615,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( nexp==1 && s == '.' 
) continue; // missing if ( nexp!=nG_ori && nexp!=nR_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=G=%d(diploid) or %d(haploid), but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=G=%d(diploid) or %d(haploid), but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nR_ori, nexp); goto err; } @@ -660,7 +660,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb } if ( k_src!=nR_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=G=%d(haploid), but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=G=%d(haploid), but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, k_src); goto err; } @@ -672,7 +672,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/%s at %s:%"PRId64" [%d]", + hts_log_error("Could not update FORMAT/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -708,7 +708,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nA_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=A=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=A=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nA_ori, nori); goto err; } @@ -720,7 +720,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nR_ori ) { - 
hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, nori); goto err; } @@ -756,7 +756,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nG_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRId64"; expected Number=G=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=G=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nori); goto err; } @@ -809,7 +809,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/%s at %s:%"PRId64" [%d]", + hts_log_error("Could not update FORMAT/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; } From 329d2b9484fced50e6275e734af7826e168698a6 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 31 Jul 2019 15:11:37 +0100 Subject: [PATCH 08/23] Make headers 64-bit compliant Make sam_hdr_tid2len() return hts_pos_t. Change length stored in sam_hrec_sq_t to hts_pos_t. Make sam header parser use strtoll() instead of atoi(). Unfortunately changing the size of the header target_len array is difficult as some external software attempts to resize it as a multiple of sizeof(uint32_t). Work around this by storing large values as UINT32_MAX and repurpose the sdict pointer (unused since 7a853e8) as a way of passing the real size through. Code that supports long references will need to use sam_hdr_tid2len() to get the length. 
Adds tests for reading and writing SAM files. --- cram/cram_io.c | 2 +- header.c | 48 ++++++++++++--- header.h | 2 +- htslib/sam.h | 4 +- sam.c | 64 ++++++++++++++++--- test/sam.c | 163 +++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 262 insertions(+), 21 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 3c47f865f..1b5c820dc 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1853,7 +1853,7 @@ static void sanitise_SQ_lines(cram_fd *fd) { // Should we also check MD5sums here to ensure the correct // reference was given? - hts_log_warning("Header @SQ length mismatch for ref %s, %d vs %d", + hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d", r->name, fd->header->hrecs->ref[i].len, (int)r->length); // Fixing the parsed @SQ header will make MD:Z: strings work diff --git a/header.c b/header.c index cf5ca14ac..75ffb001c 100644 --- a/header.c +++ b/header.c @@ -38,6 +38,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Hash table for removing multiple lines from the header KHASH_SET_INIT_STR(rm) +// Used for long refs in SAM files +KHASH_DECLARE(s2i, kh_cstr_t, int64_t) + typedef khash_t(rm) rmhash_t; static int sam_hdr_link_pg(sam_hdr_t *bh); @@ -140,7 +143,8 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, int nref = hrecs->nref; const char *name = NULL; const char *altnames = NULL; - int len = -1, r; + hts_pos_t len = -1; + int r; khint_t k; while (tag) { @@ -149,7 +153,7 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, name = tag->str+3; } else if (tag->str[0] == 'L' && tag->str[1] == 'N') { assert(tag->len >= 3); - len = atoi(tag->str+3); + len = strtoll(tag->str+3, NULL, 10); } else if (tag->str[0] == 'A' && tag->str[1] == 'N') { assert(tag->len >= 3); altnames = tag->str+3; @@ -180,7 +184,8 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, // Check lengths match; correct if not. 
if (len != hrecs->ref[nref].len) { char tmp[32]; - snprintf(tmp, sizeof(tmp), "%u", hrecs->ref[nref].len); + snprintf(tmp, sizeof(tmp), "%" PRIhts_pos, + hrecs->ref[nref].len); if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0) return -1; } @@ -921,7 +926,11 @@ int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs, if (!bh->target_name[i]) return -1; } - bh->target_len[i] = hrecs->ref[i].len; + if (hrecs->ref[i].len < UINT32_MAX) { + bh->target_len[i] = hrecs->ref[i].len; + } else { + bh->target_len[i] = UINT32_MAX; + } } // Free up any names that have been removed @@ -991,7 +1000,17 @@ static int sam_hrecs_refs_from_targets_array(sam_hrecs_t *hrecs, int r; hrecs->ref[tid].name = string_dup(hrecs->str_pool, bh->target_name[tid]); if (!hrecs->ref[tid].name) goto fail; - hrecs->ref[tid].len = bh->target_len[tid]; + if (bh->target_len[tid] < UINT32_MAX || !bh->sdict) { + hrecs->ref[tid].len = bh->target_len[tid]; + } else { + khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict; + k = kh_get(s2i, long_refs, hrecs->ref[tid].name); + if (k < kh_end(long_refs)) { + hrecs->ref[tid].len = kh_val(long_refs, k); + } else { + hrecs->ref[tid].len = UINT32_MAX; + } + } hrecs->ref[tid].ty = NULL; k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name, &r); if (r < 0) goto fail; @@ -1038,7 +1057,7 @@ static int add_stub_ref_sq_lines(sam_hrecs_t *hrecs) { for (tid = 0; tid < hrecs->nref; tid++) { if (hrecs->ref[tid].ty == NULL) { - snprintf(len, sizeof(len), "%d", hrecs->ref[tid].len); + snprintf(len, sizeof(len), "%"PRIhts_pos, hrecs->ref[tid].len); if (sam_hrecs_add(hrecs, "SQ", "SN", hrecs->ref[tid].name, "LN", len, NULL) != 0) @@ -1938,7 +1957,7 @@ const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid) { return NULL; } -uint32_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) { +hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) { sam_hrecs_t *hrecs; if (!h) @@ -1947,8 +1966,19 @@ uint32_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) { if ((hrecs 
= h->hrecs) != NULL && tid < hrecs->nref) { return hrecs->ref[tid].len; } else { - if (tid < h->n_targets) - return h->target_len[tid]; + if (tid < h->n_targets) { + if (h->target_len[tid] < UINT32_MAX || !h->sdict) { + return h->target_len[tid]; + } else { + khash_t(s2i) *long_refs = (khash_t(s2i) *) h->sdict; + khint_t k = kh_get(s2i, long_refs, h->target_name[tid]); + if (k < kh_end(long_refs)) { + return kh_val(long_refs, k); + } else { + return UINT32_MAX; + } + } + } } return 0; diff --git a/header.h b/header.h index 18c8ee89c..810a3dda1 100644 --- a/header.h +++ b/header.h @@ -122,7 +122,7 @@ typedef struct sam_hrec_type_s { /*! Parsed \@SQ lines */ typedef struct { const char *name; - uint32_t len; + hts_pos_t len; sam_hrec_type_t *ty; } sam_hrec_sq_t; diff --git a/htslib/sam.h b/htslib/sam.h index 5b73d6644..b842b71be 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -74,7 +74,7 @@ typedef struct sam_hdr_t { const int8_t *cigar_tab HTS_DEPRECATED("Use bam_cigar_table[] instead"); char **target_name; char *text; - void *sdict HTS_DEPRECATED("Unused since 1.10"); + void *sdict; sam_hrecs_t *hrecs; uint32_t ref_count; } sam_hdr_t; @@ -704,7 +704,7 @@ const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid); * Fetch the reference sequence length from the target length array, * using the numerical target id. */ -uint32_t sam_hdr_tid2len(const sam_hdr_t *h, int tid); +hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid); /// Alias of sam_hdr_name2tid(), for backwards compatibility. /*! 
diff --git a/sam.c b/sam.c index 04effb959..4ecfe3cb9 100644 --- a/sam.c +++ b/sam.c @@ -124,6 +124,8 @@ void sam_hdr_destroy(sam_hdr_t *bh) free(bh->text); if (bh->hrecs) sam_hrecs_free(bh->hrecs); + if (bh->sdict) + kh_destroy(s2i, (khash_t(s2i) *) bh->sdict); free(bh); } @@ -696,9 +698,11 @@ static hts_idx_t *sam_index(htsFile *fp, int min_shift) h = sam_hdr_read(fp); if (h == NULL) return NULL; if (min_shift > 0) { - int64_t max_len = 0, s; - for (i = 0; i < h->n_targets; ++i) - if (max_len < h->target_len[i]) max_len = h->target_len[i]; + hts_pos_t max_len = 0, s; + for (i = 0; i < h->n_targets; ++i) { + hts_pos_t len = sam_hdr_tid2len(h, i); + if (max_len < len) max_len = len; + } max_len += 256; for (n_lvls = 0, s = 1< s; ++n_lvls, s <<= 3); fmt = HTS_FMT_CSI; @@ -1211,6 +1215,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { const char *q, *r; char* sn = NULL; khash_t(s2i) *d = kh_init(s2i); + khash_t(s2i) *long_refs = NULL; if (!h || !d) goto error; @@ -1222,7 +1227,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) { has_SQ = 1; - int ln = -1; + hts_pos_t ln = -1; for (q = fp->line.s + 4;; ++q) { if (strncmp(q, "SN:", 3) == 0) { q += 3; @@ -1240,7 +1245,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { q = r; } else { if (strncmp(q, "LN:", 3) == 0) - ln = strtol(q + 3, (char**)&q, 10); + ln = strtoll(q + 3, (char**)&q, 10); } while (*q != '\t' && *q != '\n' && *q != '\0') @@ -1259,7 +1264,24 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { hts_log_warning("Duplicated sequence '%s'", sn); free(sn); } else { - kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln; + if (ln >= UINT32_MAX) { + // Stash away ref length that + // doesn't fit in target_len array + int k2; + if (!long_refs) { + long_refs = kh_init(s2i); + if (!long_refs) + goto error; + } + k2 = kh_put(s2i, long_refs, sn, &absent); + if (absent < 0) + goto error; + kh_val(long_refs, k2) = ln; + kh_val(d, k) = ((int64_t) 
(kh_size(d) - 1) << 32 + | UINT32_MAX); + } else { + kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln; + } } } else { hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn); @@ -1301,6 +1323,8 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) { char* tab = strchr(line.s, '\t'); + hts_pos_t ln; + if (tab == NULL) continue; @@ -1312,18 +1336,38 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { if (absent < 0) break; + ln = strtoll(tab, NULL, 10); + if (!absent) { hts_log_warning("Duplicated sequence '%s'", sn); free(sn); } else { - kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | atol(tab); + if (ln >= UINT32_MAX) { + // Stash away ref length that + // doesn't fit in target_len array + khint_t k2; + int absent = -1; + if (!long_refs) { + long_refs = kh_init(s2i); + if (!long_refs) + goto error; + } + k2 = kh_put(s2i, long_refs, sn, &absent); + if (absent < 0) + goto error; + kh_val(long_refs, k2) = ln; + kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32 + | UINT32_MAX); + } else { + kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln; + } has_SQ = 1; } e |= kputs("@SQ\tSN:", &str) < 0; e |= kputsn(line.s, tab - line.s, &str) < 0; e |= kputs("\tLN:", &str) < 0; - e |= kputl(atol(tab), &str) < 0; + e |= kputll(ln, &str) < 0; e |= kputc('\n', &str) < 0; if (e) break; @@ -1360,6 +1404,9 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { } } + // Repurpose sdict to hold any references longer than UINT32_MAX + h->sdict = long_refs; + kh_destroy(s2i, d); if (str.l == 0) @@ -1375,6 +1422,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { sam_hdr_destroy(h); ks_free(&str); kh_destroy(s2i, d); + kh_destroy(s2i, long_refs); if (sn) free(sn); return NULL; } diff --git a/test/sam.c b/test/sam.c index 3c15459aa..0af45e1a9 100644 --- a/test/sam.c +++ b/test/sam.c @@ -32,6 +32,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include // Suppress message for faidx_fetch_nseq(), which we're intentionally testing #include "htslib/hts_defs.h" @@ -1170,6 +1171,166 @@ static void samrecord_layout(void) "test/sam_alignment.tmp.sam_", "w", NULL); } +static void check_big_ref(int parse_header) +{ + static const char sam_text[] = "data:," + "@HD\tVN:1.4\n" + "@SQ\tSN:large#1\tLN:5000000000\n" + "@SQ\tSN:small#1\tLN:100\n" + "@SQ\tSN:large#2\tLN:9223372034707292158\n" + "@SQ\tSN:small#2\tLN:1\n" + "r1\t0\tlarge#1\t4999999000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n" + "r2\t0\tsmall#1\t1\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n" + "r3\t0\tlarge#2\t9223372034707292000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n" + "p1\t99\tlarge#2\t1\t50\t8M\t=\t9223372034707292150\t9223372034707292158\tACGTACGT\tabcdefgh\n" + "p1\t147\tlarge#2\t9223372034707292150\t50\t8M\t=\t1\t-9223372034707292158\tACGTACGT\tabcdefgh\n" + "r4\t0\tsmall#2\t2\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n"; + const hts_pos_t expected_lengths[] = { + 5000000000LL, 100LL, 9223372034707292158LL, 1LL + }; + const int expected_tids[] = { + 0, 1, 2, 2, 2, 3 + }; + const int expected_mtid[] = { + -1, -1, -1, 2, 2, -1 + }; + const hts_pos_t expected_positions[] = { + 4999999000LL - 1, 1LL - 1, 9223372034707292000LL - 1, 1LL - 1, + 9223372034707292150LL - 1, 2LL - 1 + }; + const hts_pos_t expected_mpos[] = { + -1, -1, -1, 9223372034707292150LL - 1, 1LL - 1, -1 + }; + samFile *in = NULL, *out = NULL; + sam_hdr_t *header = NULL; + bam1_t *aln = bam_init1(); + const int num_refs = sizeof(expected_lengths) / sizeof(expected_lengths[0]); + const int num_align = sizeof(expected_tids) / sizeof(expected_tids[0]); + const char *outfname = "test/sam_big_ref.tmp.sam_"; + int i, r; + char buffer[sizeof(sam_text) + 1024]; + FILE *inf = NULL; + size_t bytes; + + if (!aln) { + fail("Out of memory"); + goto cleanup; + } + + in = sam_open(sam_text, "r"); + if (!in) { + fail("Opening SAM file"); + goto cleanup; + } + out = sam_open(outfname, 
"w"); + if (!out) { + fail("Opening output SAM file \"%s\"", outfname); + goto cleanup; + } + header = sam_hdr_read(in); + if (!header) { + fail("Reading SAM header"); + goto cleanup; + } + if (parse_header) { + // This will force the reader to be parsed + if (sam_hdr_count_lines(header, "SQ") != num_refs) { + fail("Wrong number of SQ lines in header"); + goto cleanup; + } + } + for (i = 0; i < num_refs; i++) { + hts_pos_t ln = sam_hdr_tid2len(header, i); + if (ln != expected_lengths[i]) { + fail("Wrong length for ref %d : " + "expected %"PRIhts_pos" got %"PRIhts_pos"\n", + i, expected_lengths[i], ln); + goto cleanup; + } + } + if (sam_hdr_write(out, header) < 0) { + fail("Failed to write SAM header"); + goto cleanup; + } + i = 0; + while ((r = sam_read1(in, header, aln)) >= 0) { + if (i >= num_align) { + fail("Too many alignment records.\n"); + goto cleanup; + } + if (aln->core.tid != expected_tids[i]) { + fail("Wrong tid for record %d : expected %d got %d\n", + i, expected_tids[i], aln->core.tid); + goto cleanup; + } + if (aln->core.mtid != expected_mtid[i]) { + fail("Wrong mate tid for record %d : expected %d got %d\n", + i, expected_mtid[i], aln->core.mtid); + goto cleanup; + } + if (aln->core.pos != expected_positions[i]) { + fail("Wrong position for record %d : " + "expected %"PRIhts_pos" got %"PRIhts_pos"\n", + i, expected_positions[i], aln->core.pos); + } + if (aln->core.mpos != expected_mpos[i]) { + fail("Wrong mate position for record %d : " + "expected %"PRIhts_pos" got %"PRIhts_pos"\n", + i, expected_mpos[i], aln->core.mpos); + } + if (sam_write1(out, header, aln) < 0) { + fail("Failed to write alignment record %d\n", i); + goto cleanup; + } + i++; + } + if (r < -1) { + fail("Error reading SAM alignment\n"); + goto cleanup; + } + if (i < num_align) { + fail("Not enough alignment records\n"); + goto cleanup; + } + r = sam_close(in); in = NULL; + if (r < 0) { + fail("sam_close(in)"); + goto cleanup; + } + r = sam_close(out); out = NULL; + if (r < 0) { + 
fail("sam_close(out)"); + goto cleanup; + } + + inf = fopen(outfname, "r"); + if (!inf) { + fail("Opening \"%s\"", outfname); + goto cleanup; + } + bytes = fread(buffer, 1, sizeof(buffer), inf); + if (bytes != sizeof(sam_text) - 7 + || memcmp(buffer, sam_text + 6, bytes - 7) != 0) { + fail("Output file does not match original version"); + fprintf(stderr, + "---------- Expected:\n%.*s\n" + "++++++++++ Got:\n%.*s\n" + "====================\n", + (int) sizeof(sam_text) - 7, sam_text + 6, + (int) bytes, buffer); + goto cleanup; + } + + cleanup: + bam_destroy1(aln); + sam_hdr_destroy(header); + if (in) sam_close(in); + if (out) sam_close(out); + if (inf) fclose(inf); + unlink(outfname); + return; +} + static void faidx1(const char *filename) { int n, n_exp = 0, n_fq_exp = 0; @@ -1578,6 +1739,8 @@ int main(int argc, char **argv) test_text_file("test/fastqs.fq", 500); check_enum1(); check_cigar_tab(); + check_big_ref(0); + check_big_ref(1); test_mempolicy(); for (i = 1; i < argc; i++) faidx1(argv[i]); From 3108bee1d1b782994bdf354f95533e9e132d259b Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 31 Jul 2019 16:28:35 +0100 Subject: [PATCH 09/23] Eliminate struct holes in bam1_core_t and bam1_t Swap tid and pos in bam1_core_t. Removes four bytes of padding between tid and pos, and four bytes between mtid and mpos. Reverse order of l_data, data and id in bam1_t. Removes four bytes of padding after l_data on 64-bit platforms. Adjust documentation to match new ordering. --- htslib/sam.h | 16 +++++++++------- test/sam.c | 5 +++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index b842b71be..39bd80cdf 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -187,21 +187,22 @@ extern const int8_t bam_cigar_table[256]; /*! @typedef @abstract Structure for core alignment information. 
- @field tid chromosome ID, defined by sam_hdr_t @field pos 0-based leftmost coordinate + @field tid chromosome ID, defined by sam_hdr_t @field bin bin calculated by bam_reg2bin() @field qual mapping quality - @field l_qname length of the query name - @field flag bitwise flag @field l_extranul length of extra NULs between qname & cigar (for alignment) + @field flag bitwise flag + @field l_qname length of the query name @field n_cigar number of CIGAR operations @field l_qseq length of the query sequence (read) @field mtid chromosome ID of next read in template, defined by sam_hdr_t @field mpos 0-based leftmost coordinate of next read in template + @field isize observed template length ("insert size") */ typedef struct { - int32_t tid; hts_pos_t pos; + int32_t tid; uint16_t bin; // NB: invalid on 64-bit pos uint8_t qual; uint8_t l_extranul; @@ -217,9 +218,10 @@ typedef struct { /*! @typedef @abstract Structure for one alignment. @field core core information about the alignment + @field id + @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux @field l_data current length of bam1_t::data @field m_data maximum length of bam1_t::data - @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux @field mempolicy memory handling policy, see bam_set_mempolicy() @discussion Notes: @@ -239,9 +241,9 @@ typedef struct { */ typedef struct { bam1_core_t core; - int l_data; - uint8_t *data; uint64_t id; + uint8_t *data; + int l_data; uint32_t m_data; uint32_t mempolicy:2, :30 /* Reserved */; } bam1_t; diff --git a/test/sam.c b/test/sam.c index 0af45e1a9..bbd759fb9 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1151,13 +1151,14 @@ static void samrecord_layout(void) size_t bam1_t_size, bam1_t_size2; assert(sizeof(hts_pos_t) == 8 || sizeof(hts_pos_t) == 4); - int core_size = sizeof(hts_pos_t) == 8 ? 56 : 36; + int core_size = sizeof(hts_pos_t) == 8 ? 
48 : 36; bam1_t_size = (core_size + sizeof(int) + sizeof(char *) + sizeof(uint64_t) + 2 * sizeof(uint32_t)); bam1_t_size2 = bam1_t_size + 4; // Account for padding on some platforms if (sizeof (bam1_core_t) != core_size) - fail("sizeof bam1_core_t is %zu, expected 56", sizeof (bam1_core_t)); + fail("sizeof bam1_core_t is %zu, expected %d", + sizeof (bam1_core_t), core_size); if (sizeof (bam1_t) != bam1_t_size && sizeof (bam1_t) != bam1_t_size2) fail("sizeof bam1_t is %zu, expected either %zu or %zu", From 3d49c61fcb2edfa2a3397d7aed5e0cf0df238570 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 23 Aug 2019 17:19:41 +0100 Subject: [PATCH 10/23] Rearrange bcf1_t struct to eliminate hole. --- htslib/vcf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 742b341d8..9d8464551 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -208,9 +208,9 @@ typedef struct { line must be formatted in vcf_format. */ typedef struct { - int32_t rid; // CHROM hts_pos_t pos; // POS hts_pos_t rlen; // length of REF + int32_t rid; // CHROM float qual; // QUAL uint32_t n_info:16, n_allele:16; uint32_t n_fmt:8, n_sample:24; From c557f7272148a24f70132334e4f7fda1d7ea7e7f Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 9 Sep 2019 16:51:13 +0100 Subject: [PATCH 11/23] Make regions and indexing work with long references Update reglist functions to work with 64 bit positions. Make bgzf_idx_push take 64 bit positions and increase size of hts_idx_cache_entry::beg and hts_idx_cache_entry::end. Remove restriction on stored positions to INT_MAX in hts_reglist_create() and hts_iter_querys(). hts_reglist_create() can be simplified a bit as it's internally using hts_pair_pos_t to store intervals, which is the same as hts_reglist_t::intervals where they are eventually stored. Old type hts_pair32_t is made a typedef for hts_pair_pos_t as the two structs had become exactly the same. This allows hts_reglist_t::intervals to be changed to type hts_pair_pos_t. 
--- bgzf.c | 5 ++-- hts.c | 2 -- hts_internal.h | 2 +- htslib/hts.h | 9 +++---- region.c | 72 ++++++++++++++++++++++++++------------------------ 5 files changed, 45 insertions(+), 45 deletions(-) diff --git a/bgzf.c b/bgzf.c index 7cdf1d68c..c2caa0443 100644 --- a/bgzf.c +++ b/bgzf.c @@ -110,7 +110,8 @@ enum mtaux_cmd { // When multi-threaded bgzf_tell won't work, so we delay the hts_idx_push // until we've written the last block. typedef struct { - int tid, beg, end, is_mapped; // args for hts_idx_push + hts_pos_t beg, end; + int tid, is_mapped; // args for hts_idx_push uint64_t offset, block_number; } hts_idx_cache_entry; @@ -183,7 +184,7 @@ struct __bgzidx_t * Returns 0 on success, * -1 on failure */ -int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, int beg, int end, uint64_t offset, int is_mapped) { +int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped) { hts_idx_cache_entry *e; mtaux_t *mt = fp->mt; diff --git a/hts.c b/hts.c index e403d4f41..ee43c5125 100644 --- a/hts.c +++ b/hts.c @@ -3119,8 +3119,6 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP)) return NULL; - if (end > INT_MAX) end = INT_MAX; // Remove when fully 64-bit compliant - return itr_query(idx, tid, beg, end, readrec); } diff --git a/hts_internal.h b/hts_internal.h index c4f6b611d..36a7fba69 100644 --- a/hts_internal.h +++ b/hts_internal.h @@ -91,7 +91,7 @@ void close_plugin(void *plugin); * Returns 0 on success, * -1 on failure */ -int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, int beg, int end, uint64_t offset, int is_mapped); +int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped); #ifdef __cplusplus } diff --git a/htslib/hts.h b/htslib/hts.h index c72a426cc..6c430eeb0 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -592,15 +592,12 @@ typedef 
int64_t hts_pos_t; // #define PRIhts_pos PRId32 // typedef int32_t hts_pos_t; -typedef struct { - //uint32_t beg, end; - hts_pos_t beg, end; // sorry for the bad naming: FIXME! -} hts_pair32_t; - typedef struct { hts_pos_t beg, end; } hts_pair_pos_t; +typedef hts_pair_pos_t hts_pair32_t; // For backwards compatibility + typedef struct { uint64_t u, v; } hts_pair64_t; @@ -612,7 +609,7 @@ typedef struct { typedef struct { const char *reg; - hts_pair32_t *intervals; + hts_pair_pos_t *intervals; int tid; uint32_t count; hts_pos_t min_beg, max_end; diff --git a/region.c b/region.c index d9679f79f..4b5dd4cfc 100644 --- a/region.c +++ b/region.c @@ -30,17 +30,21 @@ DEALINGS IN THE SOFTWARE. */ typedef struct reglist { uint32_t n, m; - uint64_t *a; + hts_pair_pos_t *a; int tid; } reglist_t; KHASH_MAP_INIT_INT(reg, reglist_t) typedef kh_reg_t reghash_t; -static int compare_uint64 (const void * a, const void * b) +static int compare_hts_pair_pos_t (const void *av, const void *bv) { - if (*(uint64_t *)a < *(uint64_t *)b) return -1; - if (*(uint64_t *)a > *(uint64_t *)b) return 1; + hts_pair_pos_t *a = (hts_pair_pos_t *) av; + hts_pair_pos_t *b = (hts_pair_pos_t *) bv; + if (a->beg < b->beg) return -1; + if (a->beg > b->beg) return 1; + if (a->end < b->end) return -1; + if (a->end > b->end) return 1; return 0; } @@ -54,7 +58,6 @@ static void reg_print(reghash_t *h) { khint_t k; uint32_t i; khint32_t key; - uint32_t beg, end; if (!h) { fprintf(stderr, "Hash table is empty!\n"); @@ -66,9 +69,8 @@ static void reg_print(reghash_t *h) { fprintf(stderr, "Region: key %u tid %d\n", key, p->tid); if ((p = &kh_val(h,k)) != NULL && p->n > 0) { for (i=0; in; i++) { - beg = (uint32_t)(p->a[i]>>32); - end = (uint32_t)(p->a[i]); - fprintf(stderr, "\tinterval[%d]: %d-%d\n", i, beg, end); + fprintf(stderr, "\tinterval[%d]: %"PRIhts_pos"-%"PRIhts_pos"\n", i, + p->a[i].beg, p->a[i].end); } } else { fprintf(stderr, "Region key %u has no intervals!\n", key); @@ -94,23 +96,30 @@ static int 
reg_compact(reghash_t *h) { if (!kh_exist(h,i) || !(p = &kh_val(h,i)) || !(p->n)) continue; - qsort(p->a, p->n, sizeof(uint64_t), compare_uint64); + qsort(p->a, p->n, sizeof(p->a[0]), compare_hts_pair_pos_t); for (new_n = 0, j = 1; j < p->n; j++) { - if ((uint32_t)p->a[new_n] < (uint32_t)(p->a[j]>>32)) { - p->a[++new_n] = p->a[j]; + if (p->a[new_n].end < p->a[j].beg) { + p->a[++new_n].beg = p->a[j].beg; + p->a[new_n].end = p->a[j].end; } else { - if ((uint32_t)p->a[new_n] < (uint32_t)p->a[j]) - p->a[new_n] = (p->a[new_n] & 0xFFFFFFFF00000000) | (uint32_t)(p->a[j]); + if (p->a[new_n].end < p->a[j].end) + p->a[new_n].end = p->a[j].end; } } - p->n = ++new_n; + ++new_n; + if (p->n > new_n) { + // Shrink array to required size. + hts_pair_pos_t *new_a = realloc(p->a, new_n * sizeof(p->a[0])); + if (new_a) p->a = new_a; + } + p->n = new_n; count++; } return count; } -static int reg_insert(reghash_t *h, int tid, unsigned int beg, unsigned int end) { +static int reg_insert(reghash_t *h, int tid, hts_pos_t beg, hts_pos_t end) { khint_t k; reglist_t *p; @@ -135,12 +144,13 @@ static int reg_insert(reghash_t *h, int tid, unsigned int beg, unsigned int end) if (p->n == p->m) { uint32_t new_m = p->m ? 
p->m<<1 : 4; if (new_m == 0) return -1; - uint64_t *new_a = realloc(p->a, new_m * sizeof(uint64_t)); + hts_pair_pos_t *new_a = realloc(p->a, new_m * sizeof(p->a[0])); if (new_a == NULL) return -1; p->m = new_m; p->a = new_a; } - p->a[p->n++] = (uint64_t)beg<<32 | end; + p->a[p->n].beg = beg; + p->a[p->n++].end = end; return 0; } @@ -174,9 +184,8 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr khint_t k; int i, l_count = 0, tid; - uint32_t j; const char *q; - int64_t beg, end; + hts_pos_t beg, end; /* First, transform the char array into a hash table */ h = kh_init(reg); @@ -207,9 +216,6 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr } } - if (beg > INT_MAX) beg = INT_MAX; // Remove when fully 64-bit compliant - if (end > INT_MAX) end = INT_MAX; // Remove when fully 64-bit compliant - if (reg_insert(h, tid, beg, end) != 0) { hts_log_error("Error when inserting region='%s' in the bed hash table at address=%p", argv[i], (void *) h); goto fail; @@ -230,21 +236,19 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr continue; h_reglist[l_count].tid = p->tid; - h_reglist[l_count].intervals = calloc(p->n, sizeof(h_reglist[l_count].intervals[0])); - if(!(h_reglist[l_count].intervals)) { - hts_log_error("Could not allocate memory for intervals"); - goto fail; - } + h_reglist[l_count].intervals = p->a; h_reglist[l_count].count = p->n; - h_reglist[l_count].max_end = 0; + p->a = NULL; // As we stole it. - for (j = 0; j < p->n; j++) { - h_reglist[l_count].intervals[j].beg = (uint32_t)(p->a[j]>>32); - h_reglist[l_count].intervals[j].end = (uint32_t)(p->a[j] & 0xffffffffU); - - if (h_reglist[l_count].intervals[j].end > h_reglist[l_count].max_end) - h_reglist[l_count].max_end = h_reglist[l_count].intervals[j].end; + // After reg_compact(), list is ordered and non-overlapping, so... 
+ if (p->n > 0) { + h_reglist[l_count].min_beg = h_reglist[l_count].intervals[0].beg; + h_reglist[l_count].max_end = h_reglist[l_count].intervals[p->n - 1].end; + } else { + h_reglist[l_count].min_beg = 0; + h_reglist[l_count].max_end = 0; } + l_count++; } reg_destroy(h); From 674714e2c433fab8d4e4782b3bcbd79a71086968 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 10 Sep 2019 12:27:19 +0100 Subject: [PATCH 12/23] Add large position tests Round-trip SAM -> SAM.gz -> SAM Indexing both on-the-fly and for an existing file. Index look-ups and iterators. --- .gitignore | 1 + Makefile | 2 +- test/longrefs/longref.sam | 96 ++++++++++++++++++++++++ test/longrefs/longref_itr.expected.sam | 26 +++++++ test/longrefs/longref_multi.expected.sam | 46 ++++++++++++ test/test.pl | 23 ++++++ 6 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 test/longrefs/longref.sam create mode 100644 test/longrefs/longref_itr.expected.sam create mode 100644 test/longrefs/longref_multi.expected.sam diff --git a/.gitignore b/.gitignore index ac72d4bf4..4ac78c986 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,7 @@ lib*.so.* /test/fieldarith /test/hfile /test/hts_endian +/test/longrefs/*.tmp.* /test/pileup /test/sam /test/tabix/*.tmp.* diff --git a/Makefile b/Makefile index d8c09e3fb..3d9528304 100644 --- a/Makefile +++ b/Makefile @@ -532,7 +532,7 @@ htslib-uninstalled.pc: htslib.pc.tmp testclean: - -rm -f test/*.tmp test/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* + -rm -f test/*.tmp test/*.tmp.* test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* mostlyclean: testclean -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM version.h diff --git a/test/longrefs/longref.sam b/test/longrefs/longref.sam new file mode 100644 index 000000000..a2611f675 --- /dev/null +++ b/test/longrefs/longref.sam @@ -0,0 +1,96 @@ +@SQ SN:CHROMOSOME_I LN:10001009800 +SRR065390.14978392 16 CHROMOSOME_I 10000000002 1 27M1D73M * 0 0 
CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 +SRR065390.921023 16 CHROMOSOME_I 10000000003 12 100M * 0 0 CTAAGCCTAAATCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################################???88:;98768700000<>:BBA?BBAB?BBBBBBBB>B>BB::;?:00000 AS:i:-6 XS:i:-13 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10G0C10G77 NM:i:3 +SRR065390.1871511 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA 0:BB@>B<=B@???@=8@B>BB@CA@DACDCBBCCCA@CCCCACCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3743423 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##################?6@:7<=@3=@ABAAB>BDBBABADABDDDBDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.4251890 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########@BB=BCBBC?B>B;>B@@ADBBB@DBBBBDCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.5238868 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA @,=@@D8D;?BBB>;?BBB==BB@D;>D>BBB>BBDDBA@@BCCB@=BACBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.8289592 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A?@C9@@BC=AABDD@A@DC@CB=@BA?6@CCAAC@+CCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.14497557 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ######@:@@.>=><;;B>AB>>BB?B=>B=BD>BDADDD>CCDDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.15617929 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA D?;;D>?C>CBAAACD@BB?B>BBDB>@BBDDBDC@CBDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.16049575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #######################@??BB8BBB@@:AB@BDBCCDCBDCCCCACCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.17838261 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################@>=?B@DCBDB>@D>DBADCDDD>CC@DCCCCBCCACCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22711273 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################B<@=<:6/0307==72@@=?788==;AAA:@CCAACCC?CCAACCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22922978 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##########################B=B>A@BBBC??=@=A@AC<><<8>C6CCCCC8CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23087186 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ############@:73???@6;D?B>:>BBA?B<>B@B>@B>@>BCDCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23506653 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############A/=A5::87@:=>6@AA>@CDBA@ABCB?BC>CD>DDBDC@CCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23791575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCCCACCCCAACCCTTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############################B4;:=B@>A@BCB@@ABCCBB@BCC@CCDCCDCCDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-12 XS:i:-12 XN:i:0 XM:i:6 XO:i:0 XG:i:0 YT:Z:UU MD:Z:7T0A1G2T2G3A79 NM:i:6 +SRR065390.25911768 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############@8B@B?9=:A?=@DDB>;B6?DDBCABABB@DDCCBDBDCCDACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26055380 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################DAA><0=>=B;?BACDBDABCBBC@CACACACACCACCCCCCCCCCCCCCCCCCCCCCBCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26121674 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################?:AA::@DAAA>B??@A4@=BBBBDDBDBDCCBDDBCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.30352568 16 CHROMOSOME_I 10000000003 7 100M * 0 0 CTAGGGCTAACCCTCAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################################################A>>5A?CCC@CCCCCCCCCC?CC:C@A@==@A@A@ AS:i:-10 XS:i:-19 XN:i:0 XM:i:5 XO:i:0 XG:i:0 YT:Z:UU MD:Z:3A1C4G3A37G47 NM:i:5 +SRR065390.31341126 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################?AD?D@BCAABBBD@=DBCDBAACCDCAABCDCCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.33653624 16 CHROMOSOME_I 10000000003 17 100M * 0 0 CTAATCCTAGGCCTAAGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################??8?000-+0000,@ABBBB@B:B@B>BB????>>>@@?::?6?>>;>>@ACCCCBCCBACCCC AS:i:-6 XS:i:-19 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:4G4A9T80 NM:i:3 +SRR065390.28043279 16 CHROMOSOME_I 10000000004 0 9M1I90M * 0 0 TCTTCCGATCTCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #####A>=7A6DD=@AA?>AAABC@CAABDBCBBABDADBADCABBBDCDCDCACDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCC AS:i:-26 XS:i:-26 XN:i:0 XM:i:6 XO:i:1 XG:i:1 YT:Z:UU MD:Z:1A0A0G2T1A0G89 NM:i:7 +SRR065390.29270443 16 CHROMOSOME_I 10000000006 1 100M * 0 0 AGCCTAAGCCGAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ###################################@:88@@>B>C>CCCCA@CCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10T2G86 NM:i:2 +SRR065390.1364843 16 CHROMOSOME_I 
10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ####################@=A=8@:>@;@@=>>B8?C6CCCCCCCCCCACCCCBBCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.10190875 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##################@@@@@@;>BBB?>A6BAB?BB=BAB@?:A.<===@7:4::>8D@BABBACCCCAB@CCCDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.13556211 0 CHROMOSOME_I 10000000011 0 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGATTGGAAGAGCGGCTAAGCAGGAACGCCGGGCTCGATCTCAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCBCDCCB>BBBBB########################################### AS:i:-50 XS:i:-50 XN:i:0 XM:i:25 XO:i:0 XG:i:0 YT:Z:UU MD:Z:57C0C1A0A0G0C0C0T0A0A1C6C0T0A1G1C0T0A0A1C2A0A0G0C2A3 NM:i:25 +SRR065390.20524775 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ############################?9<8B=?@C8A<@?@C8CBDCCC=CCCCC??@CCDCCCCCCCCCCCCCCCCCCCCDCCCCCCCDCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.20580336 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ############################?:>@?@=>@=0<:CB>@B=DCADB@CCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22573273 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##################################A9;?@CBBDBA>BB;ABDB>AAA;=>=0943@########### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 
+SRR065390.20870114 0 CHROMOSOME_I 10000000012 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCABCCCC=BBBCA@B>B?D;B=>9?############################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3863623 16 CHROMOSOME_I 10000000012 1 100M * 0 0 CGCCTACGCCTACGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##############################?@BB>B@BCABBB?DC@DADC@DCDCACDCBCCCCCCCCCCC@CCCCCCCCCCCCCCC1CCCCCCCCCCC AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0A5A5A87 NM:i:3 +SRR065390.1659845 0 CHROMOSOME_I 10000000013 0 100M * 0 0 GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAACCTAAGCCTAAGCCCAACCCTAAGACCGAGACCGAGACC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCAB@CCC######################################### AS:i:-22 XS:i:-22 XN:i:0 XM:i:11 XO:i:0 XG:i:0 YT:Z:UU MD:Z:60G14T2G6C1T0A2C1T0A2C1T0 NM:i:11 +SRR065390.1567418 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CACAGCCTACGTCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #########################################?:8A@<@>>BBB8>BBB@BBBB>@:??::87688:?:::?@<@@97866@?>@@;;>:< AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T0A6A1C88 NM:i:4 +SRR065390.4996386 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CCAAGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###################################@@@@A=BB@C>>DCCACCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-22 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T5T92 NM:i:2 +SRR065390.14822977 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CGAAGCCAGAGCCTAGGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################B:B?:==2>6@B@@C>?>A@CB5@??@28C@CCCBC@CC?CC?A@CC:CBCCCCCCCCCCCCCC AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU 
MD:Z:1T5T0A6A84 NM:i:4 +SRR065390.15148736 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CTGAGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########################CCBC<=C;9??<;==C@BCCCCC=CCCCACACACCBBCCCCCCCCCCCCCCCCCBCCCCCCCCCCCBCA?CCC AS:i:-4 XS:i:-21 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2A4T92 NM:i:2 +SRR065390.18089757 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CTGAGCCTGAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################A212.0:?.>8?BB?B<@@C?CCBCB;DCCCACDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2A5A91 NM:i:2 +SRR065390.25601994 16 CHROMOSOME_I 10000000015 17 100M * 0 0 ATAAGCCTAATCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #####################???DD?BD?BDBB>ACBDBDDBDDDBDBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-21 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0C9G89 NM:i:2 +SRR065390.29400981 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CGAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A<:?C>>BCABABC?AD>BDADDDBDBBDBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-18 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T2G95 NM:i:2 +SRR065390.29022479 0 CHROMOSOME_I 10000000167 0 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTAAGCCTAATCCTATGCATAAACCTAAACAGAATCAAAAGAAAAATCCAATCT CCCCCCCCCACCCCCBCCCC?CCCCCCCD;?D?D################################ AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:94C0T0A3 NM:i:3 +SRR065390.23298396 16 CHROMOSOME_I 10000000167 1 100M * 0 0 AAGCCTCGGCCTACGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC #####################A@><>B==BC@CCBB?BA'@>>;>>DADDDBDBADB?B6@7=;;7DBD?B<8=AA:4-9<@@1:@A################################ AS:i:-2 XS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 YT:Z:UU 
MD:Z:98C1 NM:i:1 +SRR065390.23263331 0 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCBCCCDCCDCDDDBBDA=B@BB@B>B>AB?@?BB>;;ACC>CAA@;9<5@############## AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.1428659 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC #######?DB@;>BBB::>:D=>D?BDDBBBBCCAC@DCCBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.9270489 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##########?4=>@BAA>BB>AA@====3BBBBB;B?@C==CCC?@CCC?CCC?ACCCCCBCCCBCCCCBCCCCCCCCCCCCCC=BCCCCCACCCDCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.9538669 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##########@=?6??@B;BA@@@?.@?@@;D>A;DB@DBBBD>@DDDBADCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.15525407 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ####################@37:0BC@@C@ACCAB?@CCACCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.18387934 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##########################@@A@4BDDBB@ACABB@8BCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 
+SRR065390.27778447 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ###############@@B=;>89<>/8?<8@>=ABDCCDCC@CCACB@@C@9ACCCC;CCCC@CCAAB@@CCCCCBCCCCCCCBCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.4767844 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCCAAGCCTAAGCCTAACCCCA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCCCCCDCC=CCBA=BCCACCBCC<@@@A@>A?D<5/772AA####################### AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:80T14G2T1 NM:i:3 +SRR065390.6036148 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCCAAGCCTCAGACCA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCBCCC=C########################################## AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:86T6A2C1T1 NM:i:4 +SRR065390.7523697 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTATACCTATGCATA 8773399<;8BBB>BAA<A################################### AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:84C3A4A5A0 NM:i:4 +SRR065390.21777229 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCCAAGCCCAAGACCAAGCCAAGACCCC CCCCCCCCCCCCCCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCCCBDABAA@48@############################################# AS:i:-18 XS:i:-18 XN:i:0 XM:i:9 XO:i:0 XG:i:0 YT:Z:UU MD:Z:74T5T3C1T5T1A0G2T0A0 NM:i:9 +SRR065390.22082412 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTTGGCCGCAGCCTCAGCCTGAACAGA CCCCACACCCCCCCC??:??@CCCC@9A>9?AA@AC>@CA@B-73>8=53@=:=A?><=>49778?################ AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:88A5A5 NM:i:2 +SRR065390.32243033 0 CHROMOSOME_I 
10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTACGACTA CCCCCCCCCCCACC@CCACCCCCCCCCCCCCCCCC@CADCCBBD@BB>=?A@9C@?C>A88?>8A?:@CCCCCCCCC:?>;:CCC?BCCCCACCCCCCCCCC AS:i:-39 XS:i:-39 XN:i:0 XM:i:18 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0C0T0A0A2C0T2G1C0T0A0A0G0C1T0A1G1C64C10 NM:i:18 +SRR065390.28296401 16 CHROMOSOME_I 10000000171 1 100M * 0 0 CTAAGCCTAAGCCTAAGGCTAAGCCTAAACCCACGCCTAGGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##################################################BAADDDBBDDCCDCCCCCACDCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-12 XS:i:-12 XN:i:0 XM:i:6 XO:i:0 XG:i:0 YT:Z:UU MD:Z:17C10G2T1A5A3T56 NM:i:6 +SRR065390.1242089 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC A=@@?=?=8A3BB>>B@B>BAB@B@B77//8<;>5:@@@B6ABA@BA<@BB5):5;*83736?;;;@@=;6B>??##################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3872193 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCBCCCC@DCACD=ABCB@BCDDA@BA=BBB@C??@;:0A>?>B>?)?#################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.14566073 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCBCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCCCCAB=?CCCA6?AACABCCAC=1B@A@;B<@A@@;>?@>8BB?B#################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.18391952 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCDCCCAADCCB?CBABD=A>?BB5:??:B;>?@AA?>3?;@(8>=>>/(5500;+@@6 AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 
+SRR065390.18719419 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCAC@@C@@B@DBBDBB################################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23668023 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCC@@ABDB@@BBB>DBABB@D@BDBAABAB>B>AA@??9:8>>A:255@###### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23826980 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCDCDBCDDBDDDABBBBDDBBBBBBB>D?#################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.28024258 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCBACDDBC>DDBDB>BBBBB;?@BBB3@???=0<=>@@:@################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.30039772 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCDACDBBDDDDDBBBDBBD>BBAADAABAAC??B??######################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.5345749 16 CHROMOSOME_I 10000000173 1 100M * 0 0 GACCCAGACCCCGCGCCTAAGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##########################################@BA=>AAA@;AAAA@AA9AAAA@BAA@:=@@@4A=?A@AAAAA:B@@BBBBB@>>>>> AS:i:-18 XS:i:-18 XN:i:0 XM:i:9 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0A1G2T0A1G2T0A0A9T76 NM:i:9 
+SRR065390.16932911 16 CHROMOSOME_I 10000000173 1 100M * 0 0 AACCCTAAACCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ###############B?BAA;;9>0A1BAAA@=CA*@CCCCACCCC@@?CAAB>AC=C?CCCCBCCBBCBCCCABCCBCA@CCCCCCBCCCCC?BCCCCC AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2G5G5G85 NM:i:3 +SRR065390.17106354 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCACC?CCADCCAC@BB@CBB@C?@A@@A>=B?BAABBABB6A>BBB:BBA=?DD??;D/<71; AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22716808 0 CHROMOSOME_I 10000000174 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCGAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCABBBBB?################################### AS:i:-2 XS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 YT:Z:UU MD:Z:94T5 NM:i:1 +SRR065390.12986460 0 CHROMOSOME_I 10000000176 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCATAATCGTAAGACTAAGAGCAAGCCTCAGCATA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCA?CCA############################### AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:92T2G4 NM:i:2 +SRR065390.14729559 16 CHROMOSOME_I 10000000176 1 100M * 0 0 CCTACGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTA #########################?(4<=B@;BBBBCB?>BCCA?DCCACCCCCC@C;BBB??B<;9=C@BCAACBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2T0A0A6G88 NM:i:4 +SRR065390.26023345 0 CHROMOSOME_I 10000000177 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTCAGCCGAA CCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCCCCCCCCCA?CDADABDBDDBDDBAB>>BBBB@;>@BBB?A>CBBB<>>B@@4@?>>?0ABD@@###### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.6149508 0 
CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCDDCCBD=CCDB@@DABAB=ABB??>>@BB=BCBAB>>D;A?><>AA>?A==+@A AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.6618950 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCDCCCCCDCCBCAACBBCBB@DADABBDAB?CBB@B;?BB=B>>>?:? AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.7246333 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCACCCDCCCCCCCCCCCDCCBCD@CBBDCADADADBDABBDBDABDBCBBA>BAB>>AC9A################## AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.8266146 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTCAGCCGAGGCCTACGC CDCCCCCCCCCCCCCCCCCBCCCCCCDCCCCCCACDCCCCCDACBDCABCB@A=ABBB@BBD@DB?B################################# AS:i:-10 XS:i:-10 XN:i:0 XM:i:5 XO:i:0 XG:i:0 YT:Z:UU MD:Z:80G3A4T1A5A2 NM:i:5 +SRR065390.8986893 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCC@CCCCCCCCCCA@CCCCD=CCCDAABBDB>BDDBDB;BB@@B=@BDB:.A>>BB:@################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 diff --git a/test/longrefs/longref_itr.expected.sam b/test/longrefs/longref_itr.expected.sam new file mode 100644 index 000000000..6aca06706 --- /dev/null +++ b/test/longrefs/longref_itr.expected.sam @@ -0,0 +1,26 @@ +@SQ SN:CHROMOSOME_I LN:10001009800 +SRR065390.14978392 16 CHROMOSOME_I 10000000002 1 27M1D73M * 0 0 
CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 +SRR065390.921023 16 CHROMOSOME_I 10000000003 12 100M * 0 0 CTAAGCCTAAATCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################################???88:;98768700000<>:BBA?BBAB?BBBBBBBB>B>BB::;?:00000 AS:i:-6 XS:i:-13 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10G0C10G77 NM:i:3 +SRR065390.1871511 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA 0:BB@>B<=B@???@=8@B>BB@CA@DACDCBBCCCA@CCCCACCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3743423 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##################?6@:7<=@3=@ABAAB>BDBBABADABDDDBDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.4251890 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########@BB=BCBBC?B>B;>B@@ADBBB@DBBBBDCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.5238868 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA @,=@@D8D;?BBB>;?BBB==BB@D;>D>BBB>BBDDBA@@BCCB@=BACBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.8289592 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A?@C9@@BC=AABDD@A@DC@CB=@BA?6@CCAAC@+CCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.14497557 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ######@:@@.>=><;;B>AB>>BB?B=>B=BD>BDADDD>CCDDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.15617929 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA D?;;D>?C>CBAAACD@BB?B>BBDB>@BBDDBDC@CBDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.16049575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #######################@??BB8BBB@@:AB@BDBCCDCBDCCCCACCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.17838261 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################@>=?B@DCBDB>@D>DBADCDDD>CC@DCCCCBCCACCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22711273 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################B<@=<:6/0307==72@@=?788==;AAA:@CCAACCC?CCAACCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22922978 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##########################B=B>A@BBBC??=@=A@AC<><<8>C6CCCCC8CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23087186 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ############@:73???@6;D?B>:>BBA?B<>B@B>@B>@>BCDCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23506653 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############A/=A5::87@:=>6@AA>@CDBA@ABCB?BC>CD>DDBDC@CCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23791575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCCCACCCCAACCCTTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############################B4;:=B@>A@BCB@@ABCCBB@BCC@CCDCCDCCDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-12 XS:i:-12 XN:i:0 XM:i:6 XO:i:0 XG:i:0 YT:Z:UU MD:Z:7T0A1G2T2G3A79 NM:i:6 +SRR065390.25911768 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############@8B@B?9=:A?=@DDB>;B6?DDBCABABB@DDCCBDBDCCDACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26055380 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################DAA><0=>=B;?BACDBDABCBBC@CACACACACCACCCCCCCCCCCCCCCCCCCCCCBCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26121674 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################?:AA::@DAAA>B??@A4@=BBBBDDBDBDCCBDDBCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.30352568 16 CHROMOSOME_I 10000000003 7 100M * 0 0 CTAGGGCTAACCCTCAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################################################A>>5A?CCC@CCCCCCCCCC?CC:C@A@==@A@A@ AS:i:-10 XS:i:-19 XN:i:0 XM:i:5 XO:i:0 XG:i:0 YT:Z:UU MD:Z:3A1C4G3A37G47 NM:i:5 +SRR065390.31341126 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################?AD?D@BCAABBBD@=DBCDBAACCDCAABCDCCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.33653624 16 CHROMOSOME_I 10000000003 17 100M * 0 0 CTAATCCTAGGCCTAAGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################??8?000-+0000,@ABBBB@B:B@B>BB????>>>@@?::?6?>>;>>@ACCCCBCCBACCCC AS:i:-6 XS:i:-19 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:4G4A9T80 NM:i:3 diff --git a/test/longrefs/longref_multi.expected.sam b/test/longrefs/longref_multi.expected.sam new file mode 100644 index 000000000..997ead54c --- /dev/null +++ b/test/longrefs/longref_multi.expected.sam @@ -0,0 +1,46 @@ +@SQ SN:CHROMOSOME_I LN:10001009800 +SRR065390.14978392 16 CHROMOSOME_I 10000000002 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 +SRR065390.921023 16 CHROMOSOME_I 10000000003 12 100M * 0 0 
CTAAGCCTAAATCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################################???88:;98768700000<>:BBA?BBAB?BBBBBBBB>B>BB::;?:00000 AS:i:-6 XS:i:-13 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10G0C10G77 NM:i:3 +SRR065390.1871511 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA 0:BB@>B<=B@???@=8@B>BB@CA@DACDCBBCCCA@CCCCACCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3743423 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##################?6@:7<=@3=@ABAAB>BDBBABADABDDDBDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.4251890 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########@BB=BCBBC?B>B;>B@@ADBBB@DBBBBDCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.5238868 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA @,=@@D8D;?BBB>;?BBB==BB@D;>D>BBB>BBDDBA@@BCCB@=BACBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.8289592 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A?@C9@@BC=AABDD@A@DC@CB=@BA?6@CCAAC@+CCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.14497557 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ######@:@@.>=><;;B>AB>>BB?B=>B=BD>BDADDD>CCDDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.15617929 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA D?;;D>?C>CBAAACD@BB?B>BBDB>@BBDDBDC@CBDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.16049575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #######################@??BB8BBB@@:AB@BDBCCDCBDCCCCACCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.17838261 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################@>=?B@DCBDB>@D>DBADCDDD>CC@DCCCCBCCACCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22711273 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################B<@=<:6/0307==72@@=?788==;AAA:@CCAACCC?CCAACCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22922978 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##########################B=B>A@BBBC??=@=A@AC<><<8>C6CCCCC8CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23087186 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ############@:73???@6;D?B>:>BBA?B<>B@B>@B>@>BCDCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23506653 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############A/=A5::87@:=>6@AA>@CDBA@ABCB?BC>CD>DDBDC@CCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23791575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCCCACCCCAACCCTTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############################B4;:=B@>A@BCB@@ABCCBB@BCC@CCDCCDCCDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-12 XS:i:-12 XN:i:0 XM:i:6 XO:i:0 XG:i:0 YT:Z:UU MD:Z:7T0A1G2T2G3A79 NM:i:6 +SRR065390.25911768 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############@8B@B?9=:A?=@DDB>;B6?DDBCABABB@DDCCBDBDCCDACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26055380 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################DAA><0=>=B;?BACDBDABCBBC@CACACACACCACCCCCCCCCCCCCCCCCCCCCCBCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26121674 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################?:AA::@DAAA>B??@A4@=BBBBDDBDBDCCBDDBCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.30352568 16 CHROMOSOME_I 10000000003 7 100M * 0 0 
CTAGGGCTAACCCTCAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################################################A>>5A?CCC@CCCCCCCCCC?CC:C@A@==@A@A@ AS:i:-10 XS:i:-19 XN:i:0 XM:i:5 XO:i:0 XG:i:0 YT:Z:UU MD:Z:3A1C4G3A37G47 NM:i:5 +SRR065390.31341126 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################?AD?D@BCAABBBD@=DBCDBAACCDCAABCDCCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.33653624 16 CHROMOSOME_I 10000000003 17 100M * 0 0 CTAATCCTAGGCCTAAGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################??8?000-+0000,@ABBBB@B:B@B>BB????>>>@@?::?6?>>;>>@ACCCCBCCBACCCC AS:i:-6 XS:i:-19 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:4G4A9T80 NM:i:3 +SRR065390.28043279 16 CHROMOSOME_I 10000000004 0 9M1I90M * 0 0 TCTTCCGATCTCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #####A>=7A6DD=@AA?>AAABC@CAABDBCBBABDADBADCABBBDCDCDCACDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCC AS:i:-26 XS:i:-26 XN:i:0 XM:i:6 XO:i:1 XG:i:1 YT:Z:UU MD:Z:1A0A0G2T1A0G89 NM:i:7 +SRR065390.29270443 16 CHROMOSOME_I 10000000006 1 100M * 0 0 AGCCTAAGCCGAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ###################################@:88@@>B>C>CCCCA@CCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10T2G86 NM:i:2 +SRR065390.1364843 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ####################@=A=8@:>@;@@=>>B8?C6CCCCCCCCCCACCCCBBCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.10190875 16 CHROMOSOME_I 
10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##################@@@@@@;>BBB?>A6BAB?BB=BAB@?:A.<===@7:4::>8D@BABBACCCCAB@CCCDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.13556211 0 CHROMOSOME_I 10000000011 0 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGATTGGAAGAGCGGCTAAGCAGGAACGCCGGGCTCGATCTCAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCBCDCCB>BBBBB########################################### AS:i:-50 XS:i:-50 XN:i:0 XM:i:25 XO:i:0 XG:i:0 YT:Z:UU MD:Z:57C0C1A0A0G0C0C0T0A0A1C6C0T0A1G1C0T0A0A1C2A0A0G0C2A3 NM:i:25 +SRR065390.20524775 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ############################?9<8B=?@C8A<@?@C8CBDCCC=CCCCC??@CCDCCCCCCCCCCCCCCCCCCCCDCCCCCCCDCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.20580336 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ############################?:>@?@=>@=0<:CB>@B=DCADB@CCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22573273 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##################################A9;?@CBBDBA>BB;ABDB>AAA;=>=0943@########### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.20870114 0 CHROMOSOME_I 10000000012 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCABCCCC=BBBCA@B>B?D;B=>9?############################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3863623 16 
CHROMOSOME_I 10000000012 1 100M * 0 0 CGCCTACGCCTACGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##############################?@BB>B@BCABBB?DC@DADC@DCDCACDCBCCCCCCCCCCC@CCCCCCCCCCCCCCC1CCCCCCCCCCC AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0A5A5A87 NM:i:3 +SRR065390.1659845 0 CHROMOSOME_I 10000000013 0 100M * 0 0 GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAACCTAAGCCTAAGCCCAACCCTAAGACCGAGACCGAGACC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCAB@CCC######################################### AS:i:-22 XS:i:-22 XN:i:0 XM:i:11 XO:i:0 XG:i:0 YT:Z:UU MD:Z:60G14T2G6C1T0A2C1T0A2C1T0 NM:i:11 +SRR065390.1567418 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CACAGCCTACGTCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #########################################?:8A@<@>>BBB8>BBB@BBBB>@:??::87688:?:::?@<@@97866@?>@@;;>:< AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T0A6A1C88 NM:i:4 +SRR065390.4996386 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CCAAGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###################################@@@@A=BB@C>>DCCACCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-22 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T5T92 NM:i:2 +SRR065390.14822977 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CGAAGCCAGAGCCTAGGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################B:B?:==2>6@B@@C>?>A@CB5@??@28C@CCCBC@CC?CC?A@CC:CBCCCCCCCCCCCCCC AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T5T0A6A84 NM:i:4 +SRR065390.15148736 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CTGAGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########################CCBC<=C;9??<;==C@BCCCCC=CCCCACACACCBBCCCCCCCCCCCCCCCCCBCCCCCCCCCCCBCA?CCC AS:i:-4 XS:i:-21 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU 
MD:Z:2A4T92 NM:i:2 +SRR065390.18089757 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CTGAGCCTGAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################A212.0:?.>8?BB?B<@@C?CCBCB;DCCCACDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2A5A91 NM:i:2 +SRR065390.25601994 16 CHROMOSOME_I 10000000015 17 100M * 0 0 ATAAGCCTAATCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #####################???DD?BD?BDBB>ACBDBDDBDDDBDBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-21 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0C9G89 NM:i:2 +SRR065390.29400981 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CGAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A<:?C>>BCABABC?AD>BDADDDBDBBDBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-18 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T2G95 NM:i:2 diff --git a/test/test.pl b/test/test.pl index 8b84ca3c5..a71717e31 100755 --- a/test/test.pl +++ b/test/test.pl @@ -631,6 +631,29 @@ sub test_view } else { failed($opts, "no_hdr_sq tests", "$test_view_failures subtests failed"); } + + # File with large (> 2Gbases) positions + # Only works for SAM at the moment, but we can still round-trip it. + print "test_view testing large (> 2Gbases) positions:\n"; + $test_view_failures = 0; + testv $opts, "./test_view $tv_args -z -p longrefs/longref.tmp.sam.gz -x longrefs/longref.tmp.sam.gz.csi.otf -m 14 longrefs/longref.sam"; + testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.sam.gz"; + testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_"; + + # Build index and compare with on-the-fly one made earlier. 
+ test_compare $opts, "$$opts{path}/test_index -c longrefs/longref.tmp.sam.gz", "longrefs/longref.tmp.sam.gz.csi.otf", "longrefs/longref.tmp.sam.gz.csi", gz=>1; + + # Large position iterator tests + testv $opts, "./test_view $tv_args -p longrefs/longref_itr.tmp.sam longrefs/longref.tmp.sam.gz CHROMOSOME_I:10000000000-10000000003"; + testv $opts, "./compare_sam.pl longrefs/longref_itr.expected.sam longrefs/longref_itr.tmp.sam"; + testv $opts, "./test_view $tv_args -M -p longrefs/longref_multi.tmp.sam longrefs/longref.tmp.sam.gz CHROMOSOME_I:10000000000-10000000003 CHROMOSOME_I:10000000100-10000000110"; + testv $opts, "./compare_sam.pl longrefs/longref_multi.expected.sam longrefs/longref_multi.tmp.sam"; + + if ($test_view_failures == 0) { + passed($opts, "large position tests"); + } else { + failed($opts, "large position tests", "$test_view_failures subtests failed"); + } } # Tests CRAM's ability to correctly preserve MD and NM, irrespective of whether From c804f81b30da02504cff8e84bfc4bda5049ece08 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 10 Sep 2019 14:10:58 +0100 Subject: [PATCH 13/23] Parse / format bcf1_t::pos as 64 bit --- vcf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcf.c b/vcf.c index e2c94eabd..fcc8bed31 100644 --- a/vcf.c +++ b/vcf.c @@ -2466,7 +2466,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } v->rid = kh_val(d, k).id; } else if (i == 1) { // POS - v->pos = atoi(p) - 1; + v->pos = strtoll(p, NULL, 10) - 1; } else if (i == 2) { // ID if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p); else bcf_enc_size(str, 0, BCF_BT_CHAR); @@ -2766,7 +2766,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) int i; bcf_unpack((bcf1_t*)v, BCF_UN_ALL); kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM - kputc('\t', s); kputw(v->pos + 1, s); // POS + kputc('\t', s); kputll(v->pos + 1, s); // POS kputc('\t', s); kputs(v->d.id ? 
v->d.id : ".", s); // ID kputc('\t', s); // REF if (v->n_allele > 0) kputs(v->d.allele[0], s); From 2e9bfe381b8864b65125d6f870802806a3ffe2ff Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 10 Sep 2019 17:40:43 +0100 Subject: [PATCH 14/23] Fix n_lvls calculation in vcf_idx_init. Move duplicated code for calculating n_lvls into its own function. Allow n_lvls to increase in vcf_idx_init for very long references. --- vcf.c | 65 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/vcf.c b/vcf.c index fcc8bed31..83bcc200a 100644 --- a/vcf.c +++ b/vcf.c @@ -2916,26 +2916,41 @@ int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id) *** BCF indexing *** ********************/ +// Calculate number of index levels given min_shift and the header contig +// list. Also returns number of contigs in *nids_out. +static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift, + int starting_n_lvls, int *nids_out) +{ + int n_lvls, i, nids = 0; + int64_t max_len = 0, s; + + for (i = 0; i < h->n[BCF_DT_CTG]; ++i) + { + if ( !h->id[BCF_DT_CTG][i].val ) continue; + if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) + max_len = h->id[BCF_DT_CTG][i].val->info[0]; + nids++; + } + if ( !max_len ) max_len = (1LL<<31) - 1; // In case contig line is broken. 
+ max_len += 256; + s = 1LL << (min_shift + starting_n_lvls * 3); + for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3); + + if (nids_out) *nids_out = nids; + return n_lvls; +} + hts_idx_t *bcf_index(htsFile *fp, int min_shift) { - int n_lvls, i; + int n_lvls; bcf1_t *b = NULL; hts_idx_t *idx = NULL; bcf_hdr_t *h; - int64_t max_len = 0, s; int r; h = bcf_hdr_read(fp); if ( !h ) return NULL; int nids = 0; - for (i = 0; i < h->n[BCF_DT_CTG]; ++i) - { - if ( !h->id[BCF_DT_CTG][i].val ) continue; - if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) max_len = h->id[BCF_DT_CTG][i].val->info[0]; - nids++; - } - if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken. - max_len += 256; - for (n_lvls = 0, s = 1< s; ++n_lvls, s <<= 3); + n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids); idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); if (!idx) goto fail; b = bcf_init1(); @@ -3025,23 +3040,17 @@ int bcf_index_build(const char *fn, int min_shift) // Initialise fp->idx for the current format type. // This must be called after the header has been written but no other data. static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) { - int n_lvls, i, fmt; - int64_t max_len = 0; - - for (i = 0; i < h->n[BCF_DT_CTG]; i++) { - if (!h->id[BCF_DT_CTG][i].val) continue; - if (max_len < h->id[BCF_DT_CTG][i].val->info[0]) - max_len = h->id[BCF_DT_CTG][i].val->info[0]; - } - if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken. 
- max_len += 256; + int n_lvls, fmt; if (min_shift == 0) { min_shift = 14; n_lvls = 5; fmt = HTS_FMT_TBI; } else { - n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3; + // Set initial n_lvls to match tbx_index() + int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3; + // Increase if necessary + n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL); fmt = HTS_FMT_CSI; } @@ -3071,8 +3080,6 @@ static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fn // This must be called after the header has been written but no other data. int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) { int n_lvls, nids = 0; - int64_t max_len = 0, s; - int i; if (fp->format.format == vcf) return vcf_idx_init(fp, h, min_shift, fnidx); @@ -3080,15 +3087,7 @@ int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) { if (!min_shift) min_shift = 14; - for (i = 0; i < h->n[BCF_DT_CTG]; i++) { - if (!h->id[BCF_DT_CTG][i].val) continue; - if (max_len < h->id[BCF_DT_CTG][i].val->info[0]) - max_len = h->id[BCF_DT_CTG][i].val->info[0]; - nids++; - } - if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken. - max_len += 256; - for (n_lvls = 0, s = 1< s; ++n_lvls, s <<= 3); + n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids); fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); if (!fp->idx) return -1; From 10709d6bf9aa3c4fd90fd98f4e7616726ca5059f Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 11 Sep 2019 10:31:25 +0100 Subject: [PATCH 15/23] Store > 32 bit reference lengths in bcf_idinfo_t contig data Increases size of bcf_idinfo_t::info. This could be used in the future to increase the maximum value for Number= supported in header lines (although the current value is already rather generous). 
--- htslib/vcf.h | 6 +++--- vcf.c | 13 +++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 9d8464551..ea415df0c 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -94,7 +94,7 @@ typedef struct { } bcf_hrec_t; typedef struct { - uint32_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 in info[0..2] + uint64_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 in info[0..2] // for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG bcf_hrec_t *hrec[3]; int id; @@ -876,8 +876,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). */ #define bcf_hdr_id2length(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>8 & 0xf) #define bcf_hdr_id2number(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>12) - #define bcf_hdr_id2type(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf) - #define bcf_hdr_id2coltype(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) + #define bcf_hdr_id2type(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf) + #define bcf_hdr_id2coltype(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id<0 || bcf_hdr_id2coltype(hdr,type,int_id)==0xf) ? 
0 : 1) #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)]) /// Convert BCF FORMAT data to string form diff --git a/vcf.c b/vcf.c index 83bcc200a..6420b114d 100644 --- a/vcf.c +++ b/vcf.c @@ -499,18 +499,23 @@ static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_i static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) { // contig - int i,j, ret, replacing = 0; + int i, ret, replacing = 0; khint_t k; char *str; if ( !strcmp(hrec->key, "contig") ) { + hts_pos_t len = 0; hrec->type = BCF_HL_CTG; // Get the contig ID ($str) and length ($j) i = bcf_hrec_find_key(hrec,"length"); - if ( i<0 ) j = 0; - else if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0; + if ( i<0 ) len = 0; + else { + char *end = hrec->vals[i]; + len = strtoll(hrec->vals[i], &end, 10); + if (end == hrec->vals[i] || len < 0) return 0; + } i = bcf_hrec_find_key(hrec,"ID"); if ( i<0 ) return 0; @@ -548,7 +553,7 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) kh_val(d, k) = bcf_idinfo_def; kh_val(d, k).id = idx; - kh_val(d, k).info[0] = j; + kh_val(d, k).info[0] = len; kh_val(d, k).hrec[0] = hrec; if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) { if (!replacing) { From 1606913ef69aff91a4bff0c29e8738b83b917d50 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 12 Sep 2019 17:58:12 +0100 Subject: [PATCH 16/23] Allow storage of 64 bit INFO values for vcf files Needed to support structural variants with END > 2 Gbases. Add BCF_BT_INT64 with the obvious value left clear in the BCF spec. Add BCF_HT_LONG so that it's possible to use int64_t arrays with bcf_get_info_values() and bcf_update_info(). Currently bcf_update_info() only allows a single 64-bit value to be stored. Change bcf_info_t so it can handle a single int64_t value. bcf_info_t::len is also moved to avoid creating a hole. 
Update vcf_parse() so it can store 64-bit INFO values (again only one is allowed) and use this for END. Add 64 bit value support in bcf_unpack_info_core1() and vcf_format(). It's now possible to round-trip a VCF with large positions, including for structural variants. It's also possible to index them on-the-fly. --- htslib/vcf.h | 17 +++++++-- vcf.c | 103 ++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 98 insertions(+), 22 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index ea415df0c..0c77b4afe 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -61,6 +61,7 @@ extern "C" { #define BCF_HT_INT 1 #define BCF_HT_REAL 2 #define BCF_HT_STR 3 +#define BCF_HT_LONG (BCF_HT_INT | 0x100) // BCF_HT_INT, but for int64_t values; VCF only! #define BCF_VL_FIXED 0 // variable length #define BCF_VL_VAR 1 @@ -130,6 +131,7 @@ extern uint8_t bcf_type_shift[]; #define BCF_BT_INT8 1 #define BCF_BT_INT16 2 #define BCF_BT_INT32 3 +#define BCF_BT_INT64 4 // Unofficial, for internal use only. #define BCF_BT_FLOAT 5 #define BCF_BT_CHAR 7 @@ -155,9 +157,9 @@ typedef struct { typedef struct { int key; // key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key - int type, len; // type: one of BCF_BT_* types; len: vector length, 1 for scalars + int type; // type: one of BCF_BT_* types union { - int32_t i; // integer value + int64_t i; // integer value float f; // float value } v1; // only set if $len==1; for easier access uint8_t *vptr; // pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes @@ -165,6 +167,7 @@ typedef struct { uint32_t vptr_off:31, // vptr offset, i.e., the size of the INFO key plus size+type bytes vptr_free:1; // indicates that vptr-vptr_off must be freed; set only when modified and the new // data block is bigger than the original + int len; // vector length, 1 for scalars } bcf_info_t; @@ -680,6 +683,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). 
* Returns 0 on success or negative value on error. */ #define bcf_update_info_int32(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_INT) + #define bcf_update_info_int64(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_LONG) #define bcf_update_info_float(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_REAL) #define bcf_update_info_flag(hdr,line,key,string,n) bcf_update_info((hdr),(line),(key),(string),(n),BCF_HT_FLAG) #define bcf_update_info_string(hdr,line,key,string) bcf_update_info((hdr),(line),(key),(string),1,BCF_HT_STR) @@ -1067,10 +1071,12 @@ which works for both BCF and VCF. #define bcf_int8_vector_end (-127) /* INT8_MIN + 1 */ #define bcf_int16_vector_end (-32767) /* INT16_MIN + 1 */ #define bcf_int32_vector_end (-2147483647) /* INT32_MIN + 1 */ +#define bcf_int64_vector_end (-9223372036854775807LL) /* INT64_MIN + 1 */ #define bcf_str_vector_end 0 #define bcf_int8_missing (-128) /* INT8_MIN */ #define bcf_int16_missing (-32767-1) /* INT16_MIN */ #define bcf_int32_missing (-2147483647-1) /* INT32_MIN */ +#define bcf_int64_missing (-9223372036854775807LL - 1LL) /* INT64_MIN */ #define bcf_str_missing 0x07 // Limits on BCF values stored in given types. Max values are the same @@ -1200,7 +1206,7 @@ Cautious callers can detect invalid type codes by checking that *q has actually been updated. */ -static inline int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) +static inline int64_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) { if (type == BCF_BT_INT8) { *q = (uint8_t*)p + 1; @@ -1211,6 +1217,9 @@ static inline int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) } else if (type == BCF_BT_INT32) { *q = (uint8_t*)p + 4; return le_to_i32(p); + } else if (type == BCF_BT_INT64) { + *q = (uint8_t*)p + 8; /* an INT64 payload occupies 8 bytes, so step past all of it */ + return le_to_i64(p); } else { // Invalid type. return 0; } @@ -1232,7 +1241,7 @@ the integer value. 
Cautious callers can detect invalid type codes by checking that *q has actually been updated. */ -static inline int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) +static inline int64_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) { return bcf_dec_int1(p + 1, *p&0xf, q); } diff --git a/vcf.c b/vcf.c index 6420b114d..c2228312f 100644 --- a/vcf.c +++ b/vcf.c @@ -2027,6 +2027,24 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) return 0; // FIXME: check for errs in this function } +static int bcf_enc_long1(kstring_t *s, int64_t x) { + uint32_t e = 0; + if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32) + return bcf_enc_int1(s, x); + if (x == bcf_int64_vector_end) { + e |= bcf_enc_size(s, 1, BCF_BT_INT8); + e |= kputc(bcf_int8_vector_end, s) < 0; + } else if (x == bcf_int64_missing) { + e |= bcf_enc_size(s, 1, BCF_BT_INT8); + e |= kputc(bcf_int8_missing, s) < 0; + } else { + e |= bcf_enc_size(s, 1, BCF_BT_INT64); + e |= ks_expand(s, 8); + if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; } + } + return e == 0 ? 0 : -1; +} + static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) { uint8_t *p; size_t i; @@ -2606,29 +2624,39 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) val_a = z; } if ((y>>4&0xf) == BCF_HT_INT) { - for (i = 0, t = val; i < n_val; ++i, ++t) + // Allow first value only to be 64 bit + // (for large END value) + int64_t v64 = strtoll(val, &te, 10); + if ( te==val ) { // conversion failed + val_a[0] = bcf_int32_missing; + v64 = bcf_int64_missing; + } else { + val_a[0] = v64 >= BCF_MIN_BT_INT32 && v64 <= BCF_MAX_BT_INT32 ? 
v64 : bcf_int32_missing; + } + for (t = te; *t && *t != ','; t++); + if (*t == ',') ++t; + for (i = 1; i < n_val; ++i, ++t) { val_a[i] = strtol(t, &te, 10); if ( te==t ) // conversion failed - { val_a[i] = bcf_int32_missing; - while ( *te && *te!=',' ) te++; - } - t = te; + for (t = te; *t && *t != ','; t++); + } + if (n_val == 1) { + bcf_enc_long1(str, v64); + } else { + bcf_enc_vint(str, n_val, val_a, -1); } - bcf_enc_vint(str, n_val, val_a, -1); - if (strcmp(key, "END") == 0) v->rlen = val_a[0] - v->pos; + if (strcmp(key, "END") == 0) + v->rlen = v64 - v->pos; } else if ((y>>4&0xf) == BCF_HT_REAL) { float *val_f = (float *)val_a; for (i = 0, t = val; i < n_val; ++i, ++t) { val_f[i] = strtod(t, &te); if ( te==t ) // conversion failed - { bcf_float_set_missing(val_f[i]); - while ( *te && *te!=',' ) te++; - } - t = te; + for (t = te; *t && *t != ','; t++); } bcf_enc_vfloat(str, n_val, val_f); } @@ -2691,6 +2719,7 @@ static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info) else if (info->type == BCF_BT_INT32) info->v1.i = le_to_i32(ptr); else if (info->type == BCF_BT_FLOAT) info->v1.f = le_to_float(ptr); else if (info->type == BCF_BT_INT16) info->v1.i = le_to_i16(ptr); + else if (info->type == BCF_BT_INT64) info->v1.i = le_to_i64(ptr); } ptr += info->len << bcf_type_shift[info->type]; info->vptr_len = ptr - info->vptr; @@ -2816,6 +2845,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) case BCF_BT_INT8: if ( z->v1.i==bcf_int8_missing ) kputc('.', s); else kputw(z->v1.i, s); break; case BCF_BT_INT16: if ( z->v1.i==bcf_int16_missing ) kputc('.', s); else kputw(z->v1.i, s); break; case BCF_BT_INT32: if ( z->v1.i==bcf_int32_missing ) kputc('.', s); else kputw(z->v1.i, s); break; + case BCF_BT_INT64: if ( z->v1.i==bcf_int64_missing ) kputc('.', s); else kputll(z->v1.i, s); break; case BCF_BT_FLOAT: if ( bcf_float_is_missing(z->v1.f) ) kputc('.', s); else kputd(z->v1.f, s); break; case BCF_BT_CHAR: kputc(z->v1.i, s); break; default: 
hts_log_error("Unexpected type %d", z->type); exit(1); break; @@ -3756,6 +3786,14 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v else bcf_enc_vchar(&str, strlen((char*)values), (char*)values); } + else if ( type==BCF_HT_LONG ) + { + if (n != 1) { + hts_log_error("Only storing a single BCF_HT_LONG value is supported"); + abort(); + } + bcf_enc_long1(&str, *(int64_t *) values); + } else { hts_log_error("The type %d not implemented yet", type); @@ -3797,7 +3835,11 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v } line->unpacked |= BCF_UN_INFO; - if ( n==1 && !strcmp("END",key) ) line->rlen = ((int32_t*)values)[0] - line->pos; + if ( n==1 && !strcmp("END",key) ) { + assert(type == BCF_HT_INT || type == BCF_HT_LONG); + int64_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values; + line->rlen = end - line->pos; + } return 0; } @@ -4140,7 +4182,7 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi { int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header - if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=type ) return -2; // expected different type + if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); @@ -4164,7 +4206,15 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi } // Make sure the buffer is big enough - int size1 = type==BCF_HT_INT ? 
sizeof(int32_t) : sizeof(float); + int size1; + switch (type) { + case BCF_HT_INT: size1 = sizeof(int32_t); break; + case BCF_HT_LONG: size1 = sizeof(int64_t); break; + case BCF_HT_REAL: size1 = sizeof(float); break; + default: + hts_log_error("Unexpected output type %d", type); + return -2; + } if ( *ndst < info->len ) { *ndst = info->len; @@ -4185,11 +4235,28 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi ret = j; \ } while (0) switch (info->type) { - case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; - case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; - case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; + case BCF_BT_INT8: + if (type == BCF_HT_LONG) { + BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); + } else { + BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); + } + break; + case BCF_BT_INT16: + if (type == BCF_HT_LONG) { + BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); + } else { + BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); + } + break; + case BCF_BT_INT32: + if (type == BCF_HT_LONG) { + BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break; + } else { + BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; + } case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); 
break; - default: hts_log_error("Unexpected type %d", info->type); exit(1); + default: hts_log_error("Unexpected type %d", info->type); return -2; } #undef BRANCH return ret; // set by BRANCH From 9ed0641516eae0975fe9fcdc5a15f38188fc22e3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 12 Sep 2019 18:07:01 +0100 Subject: [PATCH 17/23] Attempt to make tabix work for VCF and SAM with long references The normal way of estimating n_lvls breaks down at about 4 Gbases for the default CSI min_shift. This adds a very simple parser to grab any reference lengths from the headers and find the longest. The value is then used to adjust n_lvls if necessary. --- tbx.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/tbx.c b/tbx.c index c7e047894..f96785ffe 100644 --- a/tbx.c +++ b/tbx.c @@ -93,7 +93,7 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) intv->ss = line + b; intv->se = line + i; } else if (id == conf->bc) { // here ->beg is 0-based. 
- intv->beg = intv->end = strtol(line + b, &s, 0); + intv->beg = intv->end = strtoll(line + b, &s, 0); if ( s==line+b ) return -1; // expected int if (!(conf->preset&TBX_UCSC)) --intv->beg; else ++intv->end; @@ -103,7 +103,7 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) if ((conf->preset&0xffff) == TBX_GENERIC) { if (id == conf->ec) { - intv->end = strtol(line + b, &s, 0); + intv->end = strtoll(line + b, &s, 0); if ( s==line+b ) return -1; // expected int } } else if ((conf->preset&0xffff) == TBX_SAM) { @@ -131,7 +131,7 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) s = strstr(line + b, ";END="); if (s) s += 5; } - if (s) intv->end = strtol(s, &s, 0); + if (s) intv->end = strtoll(s, &s, 0); line[i] = c; } } @@ -220,6 +220,44 @@ static int tbx_set_meta(tbx_t *tbx) return 0; } +// Minimal effort parser to extract reference length out of VCF header line +// This is only used to adjust the number of levels if necessary, +// so not a major problem if it doesn't always work. +static void adjust_max_ref_len_vcf(const char *str, int64_t *max_ref_len) +{ + const char *ptr; + int64_t len; + if (strncmp(str, "##contig", 8) != 0) return; + ptr = strstr(str + 8, "length"); + if (!ptr) return; + for (ptr += 6; *ptr == ' ' || *ptr == '='; ptr++) {} + len = strtoll(ptr, NULL, 10); + if (*max_ref_len < len) *max_ref_len = len; +} + +// Same for sam files +static void adjust_max_ref_len_sam(const char *str, int64_t *max_ref_len) +{ + const char *ptr; + int64_t len; + if (strncmp(str, "@SQ", 3) != 0) return; + ptr = strstr(str + 3, "\tLN:"); + if (!ptr) return; + ptr += 4; + len = strtoll(ptr, NULL, 10); + if (*max_ref_len < len) *max_ref_len = len; +} + +// Adjusts number of levels if not big enough. This can happen for +// files with very large contigs. 
+static int adjust_n_lvls(int min_shift, int n_lvls, int64_t max_len) +{ + int64_t s = 1LL << (min_shift + n_lvls * 3); + max_len += 256; + for (; max_len > s; ++n_lvls, s <<= 3) {} + return n_lvls; +} + tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) { tbx_t *tbx; @@ -228,6 +266,7 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) int64_t lineno = 0; uint64_t last_off = 0; tbx_intv_t intv; + int64_t max_ref_len = 0; str.s = 0; str.l = str.m = 0; tbx = (tbx_t*)calloc(1, sizeof(tbx_t)); @@ -237,11 +276,23 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI; while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { ++lineno; + if (str.s[0] == tbx->conf.meta_char && fmt == HTS_FMT_CSI) { + switch (tbx->conf.preset) { + case TBX_SAM: + adjust_max_ref_len_sam(str.s, &max_ref_len); break; + case TBX_VCF: + adjust_max_ref_len_vcf(str.s, &max_ref_len); break; + default: + break; + } + } if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) { last_off = bgzf_tell(fp); continue; } if (first == 0) { + if (fmt == HTS_FMT_CSI) + n_lvls = adjust_n_lvls(min_shift, n_lvls, max_ref_len); tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); if (!tbx->idx) goto fail; first = 1; From 0f0d0915ee95f8e32efc25bd90c9d98076c496f3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 12 Sep 2019 22:26:11 +0100 Subject: [PATCH 18/23] Make synced_bcf_reader work with 64 bit positions. MAX_CSI_COOR is now about 12 Tbp, which is the limit for a CSI index with min_shift 14. 
--- bcf_sr_sort.c | 6 +++--- bcf_sr_sort.h | 5 +++-- htslib/synced_bcf_reader.h | 9 +++++---- synced_bcf_reader.c | 39 ++++++++++++++++++++++---------------- 4 files changed, 34 insertions(+), 25 deletions(-) diff --git a/bcf_sr_sort.c b/bcf_sr_sort.c index bc31f438c..e9b20a62b 100644 --- a/bcf_sr_sort.c +++ b/bcf_sr_sort.c @@ -288,7 +288,7 @@ void debug_vbuf(sr_sort_t *srt) for (i=0; i<srt->sr->nreaders; i++) { vcf_buf_t *buf = &srt->vcf_buf[i]; - fprintf(stderr,"\t%d", buf->rec[j] ? buf->rec[j]->pos+1 : 0); + fprintf(stderr,"\t%"PRIhts_pos, buf->rec[j] ? buf->rec[j]->pos+1 : 0); } fprintf(stderr,"\n"); } @@ -330,7 +330,7 @@ int bcf_sr_sort_add_active(sr_sort_t *srt, int idx) srt->active[srt->nactive - 1] = idx; return 0; // FIXME: check for errs in this function } -static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos) +static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos) { if ( !srt->grp_str2int ) { @@ -556,7 +556,7 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, return 0; // FIXME: check for errs in this function } -int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos) +int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos) { int i,j; assert( srt->nactive>0 ); diff --git a/bcf_sr_sort.h b/bcf_sr_sort.h index 0a31e13c8..c8bd787a1 100644 --- a/bcf_sr_sort.h +++ b/bcf_sr_sort.h @@ -90,7 +90,8 @@ typedef struct int moff, noff, *off, mcharp; char **charp; const char *chr; - int pos, nsr, msr; + hts_pos_t pos; + int nsr, msr; int pair; int nactive, mactive, *active; // list of readers with lines at the current pos } @@ -98,7 +99,7 @@ sr_sort_t; sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt); void bcf_sr_sort_reset(sr_sort_t *srt); -int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int pos); +int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
hts_pos_t pos); int bcf_sr_sort_set_active(sr_sort_t *srt, int i); int bcf_sr_sort_add_active(sr_sort_t *srt, int i); void bcf_sr_sort_destroy(sr_sort_t *srt); diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index c047b29fa..b9585f1af 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -125,8 +125,9 @@ typedef struct _bcf_sr_regions_t char **seq_names; // sequence names int nseqs; // number of sequences (chromosomes) in the file int iseq; // current position: chr name, index to snames - int start, end; // current position: start, end of the region (0-based) - int prev_seq, prev_start; + hts_pos_t start, end; // current position: start, end of the region (0-based) + int prev_seq; + hts_pos_t prev_start; } bcf_sr_regions_t; @@ -241,7 +242,7 @@ int bcf_sr_next_line(bcf_srs_t *readers); * @seq: sequence name; NULL to seek to start * @pos: 0-based coordinate */ -int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos); +int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos); /** * bcf_sr_set_samples() - sets active samples @@ -336,7 +337,7 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg); * regions and more regions exist; -2 if not in the regions and there are no more * regions left. */ -int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end); +int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end); /* * bcf_sr_regions_flush() - calls repeatedly regs->missed_reg_handler() until diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 709917b01..315a4cf65 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -39,11 +39,15 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/thread_pool.h" #include "bcf_sr_sort.h" -#define MAX_CSI_COOR 0x7fffffff // maximum indexable coordinate of .csi +// Maximum indexable coordinate of .csi, for default min_shift of 14. +// This comes out to about 17 Tbp. 
Limiting factor is the bin number, +// which is a uint32_t in CSI. The highest number of levels compatible +// with this is 10 (needs 31 bits). +#define MAX_CSI_COOR ((1LL << (14 + 30)) - 1) typedef struct { - uint32_t start, end; + hts_pos_t start, end; } region1_t; @@ -61,7 +65,7 @@ typedef struct } aux_t; -static int _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end); +static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end); static bcf_sr_regions_t *_regions_init_string(const char *str); static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); @@ -420,11 +424,11 @@ static inline int has_filter(bcf_sr_t *reader, bcf1_t *line) return 0; } -static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end) +static int _reader_seek(bcf_sr_t *reader, const char *seq, hts_pos_t start, hts_pos_t end) { if ( end>=MAX_CSI_COOR ) { - hts_log_error("The coordinate is out of csi index limit: %d", end+1); + hts_log_error("The coordinate is out of csi index limit: %"PRIhts_pos, end+1); exit(1); } if ( reader->itr ) @@ -446,7 +450,7 @@ static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end) reader->itr = bcf_itr_queryi(reader->bcf_idx,tid,start,end+1); } if (!reader->itr) { - hts_log_error("Could not seek: %s:%d-%d", seq, start + 1, end + 1); + hts_log_error("Could not seek: %s:%"PRIhts_pos"-%"PRIhts_pos, seq, start + 1, end + 1); assert(0); } return 0; @@ -581,7 +585,8 @@ static void _reader_shift_buffer(bcf_sr_t *reader) static int next_line(bcf_srs_t *files) { - int i, min_pos = INT_MAX; + int i; + hts_pos_t min_pos = HTS_POS_MAX; const char *chr = NULL; // Loop until next suitable line is found or all readers have finished @@ -606,7 +611,7 @@ static int next_line(bcf_srs_t *files) else if ( min_pos==files->readers[i].buffer[1]->pos ) bcf_sr_sort_add_active(&BCF_SR_AUX(files)->sort, i); } - if ( min_pos==INT_MAX ) + if ( min_pos==HTS_POS_MAX ) { if ( 
!files->regions ) break; continue; @@ -622,7 +627,7 @@ static int next_line(bcf_srs_t *files) for (i=0; i<files->nreaders; i++) if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos ) _reader_shift_buffer(&files->readers[i]); - min_pos = INT_MAX; + min_pos = HTS_POS_MAX; chr = NULL; continue; } @@ -672,7 +677,7 @@ static void bcf_sr_seek_start(bcf_srs_t *readers) } -int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos) +int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos) { if ( !readers->regions ) return 0; bcf_sr_sort_reset(&BCF_SR_AUX(readers)->sort); @@ -767,7 +772,7 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) // Add a new region into a list sorted by start,end. On input the coordinates // are 1-based, stored 0-based, inclusive. -static int _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end) +static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end) { if ( start==-1 && end==-1 ) { @@ -828,7 +833,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; - int from, to; + hts_pos_t from, to; while ( 1 ) { while ( *ep && *ep!=',' && *ep!=':' ) ep++; @@ -880,7 +885,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) // ichr,ifrom,ito are 0-based; // returns -1 on error, 0 if the line is a comment line, 1 on success -static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **chr,char **chr_end,int *from,int *to) +static int _regions_parse_line(char *line, int ichr, int ifrom, int ito, char **chr, char **chr_end, hts_pos_t *from, hts_pos_t *to) { if (ifrom < 0 || ito < 0) return -1; *chr_end = NULL; @@ -970,7 +975,8 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr while ( hts_getline(reg->file, KS_SEP_LINE, &reg->line) > 0 ) { char *chr, *chr_end; - int from, to, ret; + hts_pos_t from, to; + int ret; ret =
_regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to); if ( ret < 0 ) { @@ -1077,7 +1083,8 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg) // reading from tabix char *chr, *chr_end; - int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to; + int ichr = 0, ifrom = 1, ito = 2, is_bed = 0; + hts_pos_t from, to; if ( reg->tbx ) { ichr = reg->tbx->conf.sc-1; @@ -1196,7 +1203,7 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re return !(type & VCF_INDEL) ? 1 : 0; } -int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end) +int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end) { int iseq; if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence From 9c943da76e8844220678a6fadb5e59486b7d5240 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 13 Sep 2019 09:47:50 +0100 Subject: [PATCH 19/23] Make regidx work with 64 bit positions --- htslib/regidx.h | 5 +++-- htslib_vars.mk | 2 +- regidx.c | 7 ++++--- tabix.c | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/htslib/regidx.h b/htslib/regidx.h index f2e0e00da..7ac2d3a7f 100644 --- a/htslib/regidx.h +++ b/htslib/regidx.h @@ -55,6 +55,7 @@ #include #include +#include "hts.h" #ifdef __cplusplus extern "C" { @@ -63,7 +64,7 @@ extern "C" { typedef struct _regidx_t regidx_t; typedef struct { - uint32_t start, end; + hts_pos_t start, end; } reg_t; typedef struct @@ -125,7 +126,7 @@ void regidx_destroy(regidx_t *idx); * Returns 0 if there is no overlap or 1 if overlap is found. The overlapping * regions can be iterated as shown in the example above. */ -int regidx_overlap(regidx_t *idx, const char *chr, uint32_t start, uint32_t end, regitr_t *itr); +int regidx_overlap(regidx_t *idx, const char *chr, hts_pos_t start, hts_pos_t end, regitr_t *itr); /* * regidx_insert() - add a new region. 
diff --git a/htslib_vars.mk b/htslib_vars.mk index 9b2ae8a10..0db722b0e 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -43,7 +43,7 @@ htslib_knetfile_h = $(HTSPREFIX)htslib/knetfile.h htslib_kseq_h = $(HTSPREFIX)htslib/kseq.h htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h htslib_kstring_h = $(HTSPREFIX)htslib/kstring.h -htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h +htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h $(htslib_hts_h) htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h) htslib_synced_bcf_reader_h = $(HTSPREFIX)htslib/synced_bcf_reader.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_tbx_h) htslib_tbx_h = $(HTSPREFIX)htslib/tbx.h $(htslib_hts_h) diff --git a/regidx.c b/regidx.c index c1d177d6e..10b5cccf8 100644 --- a/regidx.c +++ b/regidx.c @@ -57,7 +57,8 @@ struct _regidx_t // temporary data for index initialization kstring_t str; - int rid_prev, start_prev, end_prev; + int rid_prev; + hts_pos_t start_prev, end_prev; int payload_size; void *payload; }; @@ -155,7 +156,7 @@ int regidx_insert(regidx_t *idx, char *line) { if ( idx->start_prev > reg.start || (idx->start_prev==reg.start && idx->end_prev>reg.end) ) { - hts_log_error("The regions are not sorted: %s:%d-%d is before %s:%d-%d", + hts_log_error("The regions are not sorted: %s:%"PRIhts_pos"-%"PRIhts_pos" is before %s:%"PRIhts_pos"-%"PRIhts_pos, idx->str.s,idx->start_prev+1,idx->end_prev+1,idx->str.s,reg.start+1,reg.end+1); return -1; } @@ -243,7 +244,7 @@ void regidx_destroy(regidx_t *idx) free(idx); } -int regidx_overlap(regidx_t *idx, const char *chr, uint32_t from, uint32_t to, regitr_t *itr) +int regidx_overlap(regidx_t *idx, const char *chr, hts_pos_t from, hts_pos_t to, regitr_t *itr) { if ( itr ) itr->i = itr->n = 0; diff --git a/tabix.c b/tabix.c index 8888f1ee6..32e20caed 100644 --- a/tabix.c +++ b/tabix.c @@ -120,11 +120,11 @@ static char **parse_regions(char *regions_fname, char **argv, int argc, int *nre for (iseq=0; iseq Date: Wed, 11 Sep 2019 10:29:36 +0100 Subject: [PATCH 20/23] Add VCF 
long reference tests. Round trip test, including structural variation with END INFO tag. Indexing an existing file. Index look-up using tabix. Allow test_compare() to avoid differences due to newlines on Windows. --- test/longrefs/index.expected1.vcf | 6 + test/longrefs/index.expected2.vcf | 1 + test/longrefs/index.vcf | 216 ++++++++++++++++++++++++++++++ test/test.pl | 18 +++ 4 files changed, 241 insertions(+) create mode 100644 test/longrefs/index.expected1.vcf create mode 100644 test/longrefs/index.expected2.vcf create mode 100644 test/longrefs/index.vcf diff --git a/test/longrefs/index.expected1.vcf b/test/longrefs/index.expected1.vcf new file mode 100644 index 000000000..e0e7f91ad --- /dev/null +++ b/test/longrefs/index.expected1.vcf @@ -0,0 +1,6 @@ +1 10010000100 . C <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000101 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000102 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000103 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000104 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000105 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,29 diff --git a/test/longrefs/index.expected2.vcf b/test/longrefs/index.expected2.vcf new file mode 100644 index 000000000..4898e2563 --- /dev/null +++ b/test/longrefs/index.expected2.vcf @@ -0,0 +1 @@ +1 10010000110 . G 0 . 
SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45 diff --git a/test/longrefs/index.vcf b/test/longrefs/index.vcf new file mode 100644 index 000000000..54c8e03d3 --- /dev/null +++ b/test/longrefs/index.vcf @@ -0,0 +1,216 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file:10_gig_at_front.fa +##contig= +##ALT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##ALT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ERS220911 +1 10009999919 . G <*> 0 . DP=1;I16=1,0,0,0,26,676,0,0,60,3600,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,26 +1 10009999920 . T <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 10009999921 . A <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 10009999922 . A <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 10009999923 . T <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 10009999924 . C <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 10009999925 . C <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999926 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999927 . A <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999928 . G <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 10009999929 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999930 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999931 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999932 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999933 . T <*> 0 . 
DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999934 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999935 . A <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999936 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999937 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999938 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999939 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999940 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999941 . C <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +1 10009999942 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999943 . A <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999944 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999945 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999946 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999947 . C <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 10009999948 . A <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 10009999949 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999950 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999951 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999952 . A <*> 0 . 
DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999953 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999954 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999955 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999956 . C <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +1 10009999957 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999958 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999959 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999960 . T <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 10009999961 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999962 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999963 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999964 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999965 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999966 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999967 . A <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +1 10009999968 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999969 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999970 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999971 . G <*> 0 . 
DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999972 . T <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999973 . T <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999974 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999975 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999976 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999977 . G <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999978 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999979 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999980 . C <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 10009999981 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999982 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999983 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999984 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999985 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999986 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999987 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999988 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999989 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999990 . G <*> 0 . 
DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999991 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999992 . C <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999993 . A <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999994 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999995 . G <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 10009999996 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999997 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999998 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999999 . A <*> 0 . DP=1;I16=1,0,0,0,31,961,0,0,60,3600,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,31 +1 10010000000 . A <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 10010000001 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10010000002 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10010000003 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10010000004 . C <*> 0 . DP=1;I16=1,0,0,0,29,841,0,0,60,3600,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000005 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10010000006 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10010000007 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10010000008 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10010000009 . T <*> 0 . 
DP=1;I16=1,0,0,0,43,1849,0,0,60,3600,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,43 +1 10010000010 . C <*> 0 . DP=2;I16=1,1,0,0,59,2105,0,0,89,4441,0,0,8,64,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,59 +1 10010000011 . T <*> 0 . DP=2;I16=1,1,0,0,76,2888,0,0,89,4441,0,0,8,50,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +1 10010000012 . A <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,89,4441,0,0,8,40,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +1 10010000013 . C <*> 0 . DP=2;I16=1,1,0,0,66,2250,0,0,89,4441,0,0,8,34,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,66 +1 10010000014 . A <*> 0 . DP=2;I16=1,1,0,0,67,2285,0,0,89,4441,0,0,8,32,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +1 10010000015 . A <*> 0 . DP=2;I16=1,1,0,0,69,2385,0,0,89,4441,0,0,8,34,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,65 +1 10010000016 . T <*> 0 . DP=2;I16=1,1,0,0,75,2817,0,0,89,4441,0,0,8,40,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,68 +1 10010000017 . A <*> 0 . DP=2;I16=1,1,0,0,67,2285,0,0,89,4441,0,0,8,50,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,58 +1 10010000018 . A <*> 0 . DP=2;I16=1,1,0,0,64,2120,0,0,89,4441,0,0,8,64,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,55 +1 10010000019 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000020 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000021 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000022 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000023 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000024 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000025 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000026 . T <*> 0 . DP=1;I16=0,1,0,0,29,841,0,0,29,841,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000027 . A <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000028 . T <*> 0 . 
DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000029 . T <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000030 . A <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000031 . G <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000032 . C <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000033 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000034 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000035 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000036 . G <*> 0 . DP=1;I16=0,1,0,0,42,1764,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000037 . C <*> 0 . DP=1;I16=0,1,0,0,34,1156,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000038 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000039 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000040 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000041 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000042 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000043 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000044 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000045 . T <*> 0 . DP=1;I16=0,1,0,0,42,1764,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000046 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000047 . T <*> 0 . 
DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000048 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000049 . T <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000050 . G <*> 0 . DP=1;I16=0,1,0,0,31,961,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000051 . C <*> 0 . DP=1;I16=0,1,0,0,16,256,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,16 +1 10010000052 . T <*> 0 . DP=1;I16=0,1,0,0,31,961,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000053 . T <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000054 . G <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000055 . T <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000056 . A <*> 0 . DP=1;I16=0,1,0,0,22,484,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,22 +1 10010000057 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000058 . T <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000059 . C <*> 0 . DP=1;I16=0,1,0,0,34,1156,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000060 . C <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000061 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000062 . A <*> 0 . DP=1;I16=0,1,0,0,34,1156,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000063 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000064 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000065 . T <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000066 . A <*> 0 . 
DP=1;I16=0,1,0,0,32,1024,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000067 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000068 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000069 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000070 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000071 . G <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000072 . C <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000073 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000074 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000075 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000076 . C <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000077 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000078 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000079 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000080 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000081 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000082 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000083 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000084 . G <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000085 . G <*> 0 . 
DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000086 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000087 . G <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000088 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000089 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000090 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000091 . C <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000092 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000093 . T <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000094 . C <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000095 . C <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000096 . A <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000097 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000098 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000099 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000100 . C <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000101 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000102 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000103 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000104 . G <*> 0 . 
DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000105 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000106 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000107 . G <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000108 . C <*> 0 . DP=1;I16=0,1,0,0,32,1024,0,0,29,841,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000109 . A <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,29,841,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45 diff --git a/test/test.pl b/test/test.pl index a71717e31..ca0d766c1 100755 --- a/test/test.pl +++ b/test/test.pl @@ -263,6 +263,11 @@ sub test_compare } } + if (exists($args{fix_newlines})) { + $exp =~ s/\015\012/\n/g; + $out =~ s/\015\012/\n/g; + } + if ( $exp ne $out ) { failed($opts,$test,"The outputs differ:\n\t\t$exp_fn\n\t\t$out_fn"); @@ -649,6 +654,19 @@ sub test_view testv $opts, "./test_view $tv_args -M -p longrefs/longref_multi.tmp.sam longrefs/longref.tmp.sam.gz CHROMOSOME_I:10000000000-10000000003 CHROMOSOME_I:10000000100-10000000110"; testv $opts, "./compare_sam.pl longrefs/longref_multi.expected.sam longrefs/longref_multi.tmp.sam"; + # VCF round trip + unlink("longrefs/index.tmp.vcf.gz.csi"); # To stop vcf_hdr_read from reading a stale index + testv $opts, "./test_view $tv_args -z -p longrefs/index.tmp.vcf.gz -x longrefs/index.tmp.vcf.gz.csi.otf -m 14 longrefs/index.vcf"; + testv $opts, "./test_view $tv_args -p longrefs/index.tmp.vcf_ longrefs/index.tmp.vcf.gz"; + testv $opts, "cmp longrefs/index.vcf longrefs/index.tmp.vcf_"; + + # Build index and compare with on-the-fly one made earlier. 
+ test_compare $opts, "$$opts{path}/test_index -c longrefs/index.tmp.vcf.gz", "longrefs/index.tmp.vcf.gz.csi.otf", "longrefs/index.tmp.vcf.gz.csi", gz=>1; + + # test_view can't do indexed look-ups on vcf, but we can use tabix + test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000100-10010000105 > longrefs/index.tmp.tabix1.vcf", "longrefs/index.expected1.vcf", "longrefs/index.tmp.tabix1.vcf", fix_newlines => 1; + test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000120-10010000130 > longrefs/index.tmp.tabix2.vcf", "longrefs/index.expected2.vcf", "longrefs/index.tmp.tabix2.vcf", fix_newlines => 1; + if ($test_view_failures == 0) { passed($opts, "large position tests"); } else { From 995069decefeea8b8fcd931c30abc40b6ead25fb Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 19 Sep 2019 11:09:46 +0100 Subject: [PATCH 21/23] Use hts_pos_t in tweak_overlap_quality() and related functions While not strictly necessary (the positions in question are relative to that of read 'b') it makes data types consistent and reduces the possibility of accidental overflow. Also adds a check that the position in the sequence is valid before trying to use it for array look-ups. --- sam.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sam.c b/sam.c index 4ecfe3cb9..33081d5d3 100644 --- a/sam.c +++ b/sam.c @@ -4015,9 +4015,9 @@ void bam_plp_destructor(bam_plp_t plp, * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered, * or -2 on error. 
*/ -static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, hts_pos_t *iref) +static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref) { - int pos = *iref; + hts_pos_t pos = *iref; if ( pos < 0 ) return -1; *icig = 0; *iseq = 0; @@ -4050,7 +4050,7 @@ static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *iseq = -1; return -1; } -static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, hts_pos_t *iref) +static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref) { while ( *cigar < cigar_max ) { @@ -4079,8 +4079,8 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b) { uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar; uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar; - int a_icig = 0, a_iseq = 0; - int b_icig = 0, b_iseq = 0; + hts_pos_t a_icig = 0, a_iseq = 0; + hts_pos_t b_icig = 0, b_iseq = 0; uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b); @@ -4114,6 +4114,9 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b) iref++; if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels + if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq) + return -1; // Fell off end of sequence, bad CIGAR? 
+ if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) { #if DBG From f84bba110204b0655c554af0f531aaf930371306 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 19 Sep 2019 12:04:20 +0100 Subject: [PATCH 22/23] Add bcf_get_info_int64; make bcf_update_info_int64 a static inline As bcf_get_info_int64() and bcf_update_info_int64() are new interfaces, they can be made static inlines instead of macros so that the compiler can check the data type of the values / dst parameter. The old macros probably have to stay as they are as we don't know how they are being used in third-party code. The documentation around the bcf_get_info_ and bcf_update_info_ macros is made a bit more doxygen-like. --- htslib/vcf.h | 100 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 76 insertions(+), 24 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 0c77b4afe..ccdf701ed 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -669,26 +669,48 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id); int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id); - /* + /** * bcf_update_info_*() - functions for updating INFO fields - * @hdr: the BCF header - * @line: VCF line to be edited - * @key: the INFO tag to be updated - * @values: pointer to the array of values. Pass NULL to remove the tag. - * @n: number of values in the array. When set to 0, the INFO tag is removed + * @param hdr: the BCF header + * @param line: VCF line to be edited + * @param key: the INFO tag to be updated + * @param values: pointer to the array of values. Pass NULL to remove the tag. + * @param n: number of values in the array. When set to 0, the INFO tag is removed + * @return 0 on success or negative value on error. * - * The @string in bcf_update_info_flag() is optional, @n indicates whether - * the flag is set or removed. 
+ * The @p string in bcf_update_info_flag() is optional, + * @p n indicates whether the flag is set or removed. * - * Returns 0 on success or negative value on error. */ #define bcf_update_info_int32(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_INT) - #define bcf_update_info_int64(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_LONG) #define bcf_update_info_float(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_REAL) #define bcf_update_info_flag(hdr,line,key,string,n) bcf_update_info((hdr),(line),(key),(string),(n),BCF_HT_FLAG) #define bcf_update_info_string(hdr,line,key,string) bcf_update_info((hdr),(line),(key),(string),1,BCF_HT_STR) int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type); + /// Set or update 64-bit integer INFO values + /** + * @param hdr: the BCF header + * @param line: VCF line to be edited + * @param key: the INFO tag to be updated + * @param values: pointer to the array of values. Pass NULL to remove the tag. + * @param n: number of values in the array. When set to 0, the INFO tag is removed + * @return 0 on success or negative value on error. + * + * This function takes an int64_t values array as input. The data + * actually stored will be shrunk to the minimum size that can + * accept all of the values. + * + * INFO values outside of the range BCF_MIN_BT_INT32 to BCF_MAX_BT_INT32 + * can only be written to VCF files. + */ + static inline int bcf_update_info_int64(const bcf_hdr_t *hdr, bcf1_t *line, + const char *key, + const int64_t *values, int n) + { + return bcf_update_info(hdr, line, key, values, n, BCF_HT_LONG); + } + /* * bcf_update_format_*() - functions for updating FORMAT fields * @values: pointer to the array of values, the same number of elements @@ -756,22 +778,25 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). 
/** * bcf_get_info_*() - get INFO values, integers or floats - * @hdr: BCF header - * @line: BCF record - * @tag: INFO tag to retrieve - * @dst: *dst is pointer to a memory location, can point to NULL - * @ndst: pointer to the size of allocated memory + * @param hdr: BCF header + * @param line: BCF record + * @param tag: INFO tag to retrieve + * @param dst: *dst is pointer to a memory location, can point to NULL + * @param ndst: pointer to the size of allocated memory + * @return >=0 on success + * -1 .. no such INFO tag defined in the header + * -2 .. clash between types defined in the header and encountered in the VCF record + * -3 .. tag is not present in the VCF record + * -4 .. the operation could not be completed (e.g. out of memory) * - * Returns negative value on error or the number of written values - * (including missing values) on success. bcf_get_info_string() returns - * on success the number of characters written excluding the null- - * terminating byte. bcf_get_info_flag() returns 1 when flag is set or 0 - * if not. + * Returns negative value on error or the number of values (including + * missing values) put in *dst on success. bcf_get_info_string() returns + * on success the number of characters stored excluding the nul- + * terminating byte. bcf_get_info_flag() does not store anything in *dst + * but returns 1 if the flag is set or 0 if not. * - * List of return codes: - * -1 .. no such INFO tag defined in the header - * -2 .. clash between types defined in the header and encountered in the VCF record - * -3 .. tag is not present in the VCF record + * *dst will be reallocated if it is not big enough (i.e. *ndst is too + * small) or NULL on entry. The new size will be stored in *ndst. 
*/ #define bcf_get_info_int32(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_INT) #define bcf_get_info_float(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL) @@ -779,6 +804,33 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). #define bcf_get_info_flag(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_FLAG) int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type); + /// Put integer INFO values into an int64_t array + /** + * @param hdr: BCF header + * @param line: BCF record + * @param tag: INFO tag to retrieve + * @param dst: *dst is pointer to a memory location, can point to NULL + * @param ndst: pointer to the size of allocated memory + * @return >=0 on success + * -1 .. no such INFO tag defined in the header + * -2 .. clash between types defined in the header and encountered in the VCF record + * -3 .. tag is not present in the VCF record + * -4 .. the operation could not be completed (e.g. out of memory) + * + * Returns negative value on error or the number of values (including + * missing values) put in *dst on success. + * + * *dst will be reallocated if it is not big enough (i.e. *ndst is too + * small) or NULL on entry. The new size will be stored in *ndst. 
+ */ + static inline int bcf_get_info_int64(const bcf_hdr_t *hdr, bcf1_t *line, + const char *tag, int64_t **dst, + int *ndst) + { + return bcf_get_info_values(hdr, line, tag, + (void **) dst, ndst, BCF_HT_LONG); + } + /** * bcf_get_format_*() - same as bcf_get_info*() above * From 983244bbd0ea1e3087ccbe750edcbfbbe49a19d7 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 26 Sep 2019 12:46:05 +0100 Subject: [PATCH 23/23] Add NEWS update and README.large_positions.md file --- NEWS | 12 ++ README.large_positions.md | 231 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 243 insertions(+) create mode 100644 README.large_positions.md diff --git a/NEWS b/NEWS index b062f89dd..c7c548209 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,18 @@ Noteworthy changes in release a.b * Incompatible changes: Several functions and data types have been changed in this release, and the shared library soversion has been bumped. + - HTSlib now supports 64 bit reference positions. This means several + structures, function parameters, and return values have been made bigger + to allow larger values to be stored. While most code that uses + HTSlib interfaces should still build after this change, some alterations + may be needed - notably to printf() formats where the values of structure + members are being printed. + + Due to file format limitations, large positions are only supported + when reading and writing SAM and VCF files. + + See README.large_positions.md for more information. + - An extra field has been added to the kbitset_t struct so bitsets can be made smaller (and later enlarged) without involving memory allocation. diff --git a/README.large_positions.md b/README.large_positions.md new file mode 100644 index 000000000..f639c48d9 --- /dev/null +++ b/README.large_positions.md @@ -0,0 +1,231 @@ +# HTSlib 64 bit reference positions + +HTSlib version 1.10 onwards internally use 64 bit reference positions. 
This +is to support analysis of species like axolotl, tulip and marbled lungfish +which have, or are expected to have, chromosomes longer than two gigabases. + +# File format support + +Currently 64 bit positions can only be stored in SAM and VCF format files. +Binary BAM, CRAM and BCF cannot be used due to limitations in the formats +themselves. As SAM and VCF are text formats, they have no limit on the +size of numeric values. + +# Compatibility issues to check + +Various data structure members, function parameters, and return values have +been expanded from 32 to 64 bits. As a result, some changes may be needed to +code that uses the library, even if it does not support long references. + +## Variadic functions taking format strings + +The type of various structure members (e.g. `bam1_core_t::pos`) and return +values from some functions (e.g. `bam_cigar2rlen()`) have been changed to +`hts_pos_t`, which is a 64-bit signed integer. Using these in 32-bit +code will generally work (as long as the stored positions are within range), +however care needs to be taken when these values are passed directly +to functions like `printf()` which take a variable-length argument list and +a format string. + +Header file `htslib/hts.h` defines macro `PRIhts_pos` which can be +used in `printf()` format strings to get the correct format specifier for +an `hts_pos_t` value. 
Code that needs to print positions should be +changed from: + +```c +printf("Position is %d\n", bam->core.pos); +``` + +to: + +```c +printf("Position is %"PRIhts_pos"\n", bam->core.pos); +``` + +If for some reason compatibility with older versions of HTSlib (which do +not have `hts_pos_t` or `PRIhts_pos`) is needed, the value can be cast to +`int64_t` and printed as an explicitly 64-bit value: + +```c +#include <inttypes.h> // For PRId64 and int64_t + +printf("Position is %" PRId64 "\n", (int64_t) bam->core.pos); +``` + +Passing incorrect types to variadic functions like `printf()` can lead +to incorrect behaviour and security risks, so it is important to track down +and fix all of the places where this may happen. Modern C compilers like +gcc (version 3.0 onwards) and clang can check `printf()` and `scanf()` +parameter types for compatibility against the format string. To +enable this, build code with `-Wall` or `-Wformat` and fix all the +reported warnings. + +Where functions that take `printf`-style format strings are implemented, +they should use the appropriate gcc attributes to enable format string +checking. `htslib/hts_defs.h` includes macros `HTS_FORMAT` and +`HTS_PRINTF_FMT` which can be used to provide the attribute declaration +in a portable way. For example, `test/sam.c` uses them for a function +that prints error messages: + +``` +void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) fail(const char *fmt, ...) { /* ... */ } +``` + +## Implicit type conversions + +Conversion of signed `int` or `int32_t` to `hts_pos_t` will always work. + +Conversion of `hts_pos_t` to `int` or `int32_t` will work as long as the value +converted is within the range that can be stored in the destination. + +Code that casts unsigned `uint32_t` values to signed with the expectation +that the result may be negative will no longer work as `hts_pos_t` can store +values over UINT32_MAX. Such code should be changed to use signed values. 
+ +Functions hts_parse_region() and hts_parse_reg64() return special value +`HTS_POS_MAX` for regions which extend to the end of the reference. +This value is slightly smaller than INT64_MAX, but should be larger than +any reference that is likely to be used. When cast to `int32_t` the +result should be `INT32_MAX`. + +# Upgrading code to work with 64 bit positions + +Variables used to store reference positions should be changed to +type `hts_pos_t`. Use `PRIhts_pos` in format strings when printing them. + +When converting positions stored in strings, use `strtoll()` in place of +`atoi()` or `strtol()` (which produces a 32 bit value on 64-bit Windows and +all 32-bit platforms). + +Programs which need to look up a reference sequence length from a `sam_hdr_t` +structure should use `sam_hdr_tid2len()` instead of the old +`sam_hdr_t::target_len` array (which is left as 32-bit for reasons of +compatibility). `sam_hdr_tid2len()` returns `hts_pos_t`, so works correctly +for large references. + +Various functions which take pointer arguments have new versions which +support `hts_pos_t *` arguments. Code supporting 64-bit positions should +use the new versions. These are: + +Original function | 64-bit version +------------------ | -------------------- +fai_fetch() | fai_fetch64() +fai_fetchqual() | fai_fetchqual64() +faidx_fetch_seq() | faidx_fetch_seq64() +faidx_fetch_qual() | faidx_fetch_qual64() +hts_parse_reg() | hts_parse_reg64() or hts_parse_region() +bam_plp_auto() | bam_plp64_auto() +bam_plp_next() | bam_plp64_next() +bam_mplp_auto() | bam_mplp64_auto() + +Limited support has been added for 64-bit INFO values in VCF files, for large +values in structural variant END tags. New functions `bcf_update_info_int64()` +and `bcf_get_info_int64()` can be used to set and fetch 64-bit INFO values. +They both take arrays of `int64_t`. `bcf_int64_missing` and +`bcf_int64_vector_end` can be used to set missing and vector end values in +these arrays. 
The INFO data is stored in the minimum size needed, so there +is no harm in using these functions to store smaller integer values. + +# Structure members that have changed size + +``` +File htslib/hts.h: + hts_pair32_t::begin + hts_pair32_t::end + + (typedef hts_pair_pos_t is provided as a better-named replacement for hts_pair32_t) + + hts_reglist_t::min_beg + hts_reglist_t::max_end + + hts_itr_t::beg + hts_itr_t::end + hts_itr_t::curr_beg + hts_itr_t::curr_end + +File htslib/regidx.h: + reg_t::start + reg_t::end + +File htslib/sam.h: + bam1_core_t::pos + bam1_core_t::mpos + bam1_core_t::isize + +File htslib/synced_bcf_reader.h: + bcf_sr_regions_t::start + bcf_sr_regions_t::end + bcf_sr_regions_t::prev_start + +File htslib/vcf.h: + bcf_idinfo_t::info + + bcf_info_t::v1::i + + bcf1_t::pos + bcf1_t::rlen +``` + +# Functions where parameters or the return value have changed size + +Functions are annotated as follows: + +* `[new]` The function has been added since version 1.9 +* `[parameters]` Function parameters have changed size +* `[return]` Function return value has changed size + +``` +File htslib/faidx.h: + + [new] fai_fetch64() + [new] fai_fetchqual64() + [new] faidx_fetch_seq64() + [new] faidx_fetch_qual64() + [new] fai_parse_region() + +File htslib/hts.h: + + [parameters] hts_idx_push() + [new] hts_parse_reg64() + [parameters] hts_itr_query() + [parameters] hts_reg2bin() + +File htslib/kstring.h: + + [new] kputll() + +File htslib/regidx.h: + + [parameters] regidx_overlap() + +File htslib/sam.h: + + [new] sam_hdr_tid2len() + [return] bam_cigar2rlen() + [return] bam_endpos() + [parameters] bam_itr_queryi() + [parameters] sam_itr_queryi() + [new] bam_plp64_next() + [new] bam_plp64_auto() + [new] bam_mplp64_auto() + [parameters] sam_cap_mapq() + [parameters] sam_prob_realn() + +File htslib/synced_bcf_reader.h: + + [parameters] bcf_sr_seek() + [parameters] bcf_sr_regions_overlap() + +File htslib/tbx.h: + + [parameters] tbx_readrec() + +File htslib/vcf.h: + + 
[parameters] bcf_readrec() + [new] bcf_update_info_int64() + [new] bcf_get_info_int64() + [return] bcf_dec_int1() + [return] bcf_dec_typed_int1() + +```