Skip to content

Commit 2f6ce23

Browse files
committed
update to 1.0.0
1 parent b4795a1 commit 2f6ce23

File tree

8 files changed

+788
-337
lines changed

8 files changed

+788
-337
lines changed

treerec/_README

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ The code in `tskit/kastore` is from [kastore/c](https://github.com/tskit-dev/kas
88

99
**Changelog:**
1010

11+
- *30 November 2025*: updated to tskit 1.0.0
1112
- *24 September 2025*: updated to tskit C_1.2.0 (=1.0.0b2) (still kastore C_2.1.1)
1213
- *15 April 2025*: updated to tskit C_1.1.4 (=0.6.2) (still kastore C_2.1.1)
1314
- *7 November 2023*: updated to tskit C_1.1.2 (=0.5.5) (still kastore C_2.1.1)

treerec/tskit/core.c

Lines changed: 74 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,14 @@ tsk_strerror_internal(int err)
584584
ret = "Must have at least one allele when specifying an allele map. "
585585
"(TSK_ERR_ZERO_ALLELES)";
586586
break;
587+
case TSK_ERR_BAD_ALLELE_LENGTH:
588+
ret = "Alleles used when decoding alignments must have length one. "
589+
"(TSK_ERR_BAD_ALLELE_LENGTH)";
590+
break;
591+
case TSK_ERR_MISSING_CHAR_COLLISION:
592+
ret = "Alleles used when decoding alignments must not match the missing "
593+
"data character. (TSK_ERR_MISSING_CHAR_COLLISION)";
594+
break;
587595

588596
/* Distance metric errors */
589597
case TSK_ERR_SAMPLE_SIZE_MISMATCH:
@@ -1260,16 +1268,16 @@ tsk_avl_tree_int_ordered_nodes(const tsk_avl_tree_int_t *self, tsk_avl_node_int_
12601268
}
12611269

12621270
// Bit Array implementation. Allows us to store unsigned integers in a compact manner.
1263-
// Currently implemented as an array of 32-bit unsigned integers for ease of counting.
1271+
// Currently implemented as an array of 32-bit unsigned integers.
12641272

12651273
int
1266-
tsk_bit_array_init(tsk_bit_array_t *self, tsk_size_t num_bits, tsk_size_t length)
1274+
tsk_bitset_init(tsk_bitset_t *self, tsk_size_t num_bits, tsk_size_t length)
12671275
{
12681276
int ret = 0;
12691277

1270-
self->size = (num_bits >> TSK_BIT_ARRAY_CHUNK)
1271-
+ (num_bits % TSK_BIT_ARRAY_NUM_BITS ? 1 : 0);
1272-
self->data = tsk_calloc(self->size * length, sizeof(*self->data));
1278+
self->row_len = (num_bits / TSK_BITSET_BITS) + (num_bits % TSK_BITSET_BITS ? 1 : 0);
1279+
self->len = length;
1280+
self->data = tsk_calloc(self->row_len * length, sizeof(*self->data));
12731281
if (self->data == NULL) {
12741282
ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
12751283
goto out;
@@ -1278,96 +1286,111 @@ tsk_bit_array_init(tsk_bit_array_t *self, tsk_size_t num_bits, tsk_size_t length
12781286
return ret;
12791287
}
12801288

1281-
void
1282-
tsk_bit_array_get_row(const tsk_bit_array_t *self, tsk_size_t row, tsk_bit_array_t *out)
1283-
{
1284-
out->size = self->size;
1285-
out->data = self->data + (row * self->size);
1286-
}
1289+
#define BITSET_DATA_ROW(bs, row) ((bs)->data + (row) * (bs)->row_len)
12871290

12881291
void
1289-
tsk_bit_array_intersect(
1290-
const tsk_bit_array_t *self, const tsk_bit_array_t *other, tsk_bit_array_t *out)
1292+
tsk_bitset_intersect(const tsk_bitset_t *self, tsk_size_t self_row,
1293+
const tsk_bitset_t *other, tsk_size_t other_row, tsk_bitset_t *out)
12911294
{
1292-
for (tsk_size_t i = 0; i < self->size; i++) {
1293-
out->data[i] = self->data[i] & other->data[i];
1295+
const tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, self_row);
1296+
const tsk_bitset_val_t *restrict other_d = BITSET_DATA_ROW(other, other_row);
1297+
tsk_bitset_val_t *restrict out_d = out->data;
1298+
for (tsk_size_t i = 0; i < self->row_len; i++) {
1299+
out_d[i] = self_d[i] & other_d[i];
12941300
}
12951301
}
12961302

12971303
void
1298-
tsk_bit_array_subtract(tsk_bit_array_t *self, const tsk_bit_array_t *other)
1304+
tsk_bitset_subtract(tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other,
1305+
tsk_size_t other_row)
12991306
{
1300-
for (tsk_size_t i = 0; i < self->size; i++) {
1301-
self->data[i] &= ~(other->data[i]);
1307+
tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, self_row);
1308+
const tsk_bitset_val_t *restrict other_d = BITSET_DATA_ROW(other, other_row);
1309+
for (tsk_size_t i = 0; i < self->row_len; i++) {
1310+
self_d[i] &= ~(other_d[i]);
13021311
}
13031312
}
13041313

13051314
void
1306-
tsk_bit_array_add(tsk_bit_array_t *self, const tsk_bit_array_t *other)
1315+
tsk_bitset_union(tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other,
1316+
tsk_size_t other_row)
13071317
{
1308-
for (tsk_size_t i = 0; i < self->size; i++) {
1309-
self->data[i] |= other->data[i];
1318+
tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, self_row);
1319+
const tsk_bitset_val_t *restrict other_d = BITSET_DATA_ROW(other, other_row);
1320+
for (tsk_size_t i = 0; i < self->row_len; i++) {
1321+
self_d[i] |= other_d[i];
13101322
}
13111323
}
13121324

13131325
void
1314-
tsk_bit_array_add_bit(tsk_bit_array_t *self, const tsk_bit_array_value_t bit)
1326+
tsk_bitset_set_bit(tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit)
13151327
{
1316-
tsk_bit_array_value_t i = bit >> TSK_BIT_ARRAY_CHUNK;
1317-
self->data[i] |= (tsk_bit_array_value_t) 1 << (bit - (TSK_BIT_ARRAY_NUM_BITS * i));
1328+
tsk_bitset_val_t i = (bit / TSK_BITSET_BITS);
1329+
*(BITSET_DATA_ROW(self, row) + i) |= (tsk_bitset_val_t) 1
1330+
<< (bit - (TSK_BITSET_BITS * i));
13181331
}
13191332

13201333
bool
1321-
tsk_bit_array_contains(const tsk_bit_array_t *self, const tsk_bit_array_value_t bit)
1334+
tsk_bitset_contains(const tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit)
13221335
{
1323-
tsk_bit_array_value_t i = bit >> TSK_BIT_ARRAY_CHUNK;
1324-
return self->data[i]
1325-
& ((tsk_bit_array_value_t) 1 << (bit - (TSK_BIT_ARRAY_NUM_BITS * i)));
1336+
tsk_bitset_val_t i = (bit / TSK_BITSET_BITS);
1337+
return *(BITSET_DATA_ROW(self, row) + i)
1338+
& ((tsk_bitset_val_t) 1 << (bit - (TSK_BITSET_BITS * i)));
13261339
}
13271340

1328-
tsk_size_t
1329-
tsk_bit_array_count(const tsk_bit_array_t *self)
1341+
static inline uint32_t
1342+
popcount(tsk_bitset_val_t v)
13301343
{
1331-
// Utilizes 12 operations per bit array. NB this only works on 32 bit integers.
1344+
// Utilizes 12 operations per chunk. NB this only works on 32 bit integers.
13321345
// Taken from:
13331346
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
13341347
// There's a nice breakdown of this algorithm here:
13351348
// https://stackoverflow.com/a/109025
1336-
// Could probably do better with explicit SIMD (instead of SWAR), but not as
1337-
// portable: https://arxiv.org/pdf/1611.07612.pdf
13381349
//
1339-
// There is one solution to explore further, which uses __builtin_popcountll.
1340-
// This option is relatively simple, but requires a 64 bit bit array and also
1341-
// involves some compiler flag plumbing (-mpopcnt)
1350+
// The gcc/clang compiler flag will -mpopcnt will convert this code to a
1351+
// popcnt instruction (most if not all modern CPUs will support this). The
1352+
// popcnt instruction will yield some speed improvements, which depend on
1353+
// the tree sequence.
1354+
//
1355+
// NB: 32bit counting is typically faster than 64bit counting for this task.
1356+
// (at least on x86-64)
1357+
1358+
v = v - ((v >> 1) & 0x55555555);
1359+
v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
1360+
return (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
1361+
}
13421362

1343-
tsk_bit_array_value_t tmp;
1344-
tsk_size_t i, count = 0;
1363+
tsk_size_t
1364+
tsk_bitset_count(const tsk_bitset_t *self, tsk_size_t row)
1365+
{
1366+
tsk_size_t i = 0;
1367+
tsk_size_t count = 0;
1368+
const tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, row);
13451369

1346-
for (i = 0; i < self->size; i++) {
1347-
tmp = self->data[i] - ((self->data[i] >> 1) & 0x55555555);
1348-
tmp = (tmp & 0x33333333) + ((tmp >> 2) & 0x33333333);
1349-
count += (((tmp + (tmp >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
1370+
for (i = 0; i < self->row_len; i++) {
1371+
count += popcount(self_d[i]);
13501372
}
13511373
return count;
13521374
}
13531375

13541376
void
1355-
tsk_bit_array_get_items(
1356-
const tsk_bit_array_t *self, tsk_id_t *items, tsk_size_t *n_items)
1377+
tsk_bitset_get_items(
1378+
const tsk_bitset_t *self, tsk_size_t row, tsk_id_t *items, tsk_size_t *n_items)
13571379
{
13581380
// Get the items stored in the row of a bitset.
1359-
// Uses a de Bruijn sequence lookup table to determine the lowest bit set. See the
1360-
// wikipedia article for more info: https://w.wiki/BYiF
1381+
// Uses a de Bruijn sequence lookup table to determine the lowest bit set.
1382+
// See the wikipedia article for more info: https://w.wiki/BYiF
13611383

13621384
tsk_size_t i, n, off;
1363-
tsk_bit_array_value_t v, lsb; // least significant bit
1385+
tsk_bitset_val_t v, lsb; // least significant bit
13641386
static const tsk_id_t lookup[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25,
13651387
17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
1388+
const tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, row);
13661389

13671390
n = 0;
1368-
for (i = 0; i < self->size; i++) {
1369-
v = self->data[i];
1370-
off = i * ((tsk_size_t) TSK_BIT_ARRAY_NUM_BITS);
1391+
for (i = 0; i < self->row_len; i++) {
1392+
v = self_d[i];
1393+
off = i * TSK_BITSET_BITS;
13711394
if (v == 0) {
13721395
continue;
13731396
}
@@ -1381,7 +1404,7 @@ tsk_bit_array_get_items(
13811404
}
13821405

13831406
void
1384-
tsk_bit_array_free(tsk_bit_array_t *self)
1407+
tsk_bitset_free(tsk_bitset_t *self)
13851408
{
13861409
tsk_safe_free(self->data);
13871410
}

treerec/tskit/core.h

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ sizes and types of externally visible structs.
147147
The library minor version. Incremented when non-breaking backward-compatible changes
148148
to the API or ABI are introduced, i.e., the addition of a new function.
149149
*/
150-
#define TSK_VERSION_MINOR 2
150+
#define TSK_VERSION_MINOR 3
151151
/**
152152
The library patch version. Incremented when any changes not relevant to the
153153
to the API or ABI are introduced, i.e., internal refactors of bugfixes.
@@ -803,6 +803,14 @@ More than 2147483647 alleles were specified.
803803
A user-specified allele map was used, but it contained zero alleles.
804804
*/
805805
#define TSK_ERR_ZERO_ALLELES -1103
806+
/**
807+
An allele used when decoding alignments had length other than one.
808+
*/
809+
#define TSK_ERR_BAD_ALLELE_LENGTH -1104
810+
/**
811+
An allele used when decoding alignments matched the missing data character.
812+
*/
813+
#define TSK_ERR_MISSING_CHAR_COLLISION -1105
806814
/** @} */
807815

808816
/**
@@ -1104,29 +1112,31 @@ FILE *tsk_get_debug_stream(void);
11041112

11051113
/* Bit Array functionality */
11061114

1107-
typedef uint32_t tsk_bit_array_value_t;
1115+
// define a 32-bit chunk size for our bitsets.
1116+
// this means we'll be able to hold 32 distinct items in each 32 bit uint
1117+
#define TSK_BITSET_BITS ((tsk_size_t) 32)
1118+
typedef uint32_t tsk_bitset_val_t;
1119+
11081120
typedef struct {
1109-
tsk_size_t size; // Number of chunks per row
1110-
tsk_bit_array_value_t *data; // Array data
1111-
} tsk_bit_array_t;
1112-
1113-
#define TSK_BIT_ARRAY_CHUNK 5U
1114-
#define TSK_BIT_ARRAY_NUM_BITS (1U << TSK_BIT_ARRAY_CHUNK)
1115-
1116-
int tsk_bit_array_init(tsk_bit_array_t *self, tsk_size_t num_bits, tsk_size_t length);
1117-
void tsk_bit_array_free(tsk_bit_array_t *self);
1118-
void tsk_bit_array_get_row(
1119-
const tsk_bit_array_t *self, tsk_size_t row, tsk_bit_array_t *out);
1120-
void tsk_bit_array_intersect(
1121-
const tsk_bit_array_t *self, const tsk_bit_array_t *other, tsk_bit_array_t *out);
1122-
void tsk_bit_array_subtract(tsk_bit_array_t *self, const tsk_bit_array_t *other);
1123-
void tsk_bit_array_add(tsk_bit_array_t *self, const tsk_bit_array_t *other);
1124-
void tsk_bit_array_add_bit(tsk_bit_array_t *self, const tsk_bit_array_value_t bit);
1125-
bool tsk_bit_array_contains(
1126-
const tsk_bit_array_t *self, const tsk_bit_array_value_t bit);
1127-
tsk_size_t tsk_bit_array_count(const tsk_bit_array_t *self);
1128-
void tsk_bit_array_get_items(
1129-
const tsk_bit_array_t *self, tsk_id_t *items, tsk_size_t *n_items);
1121+
tsk_size_t row_len; // Number of size TSK_BITSET_BITS chunks per row
1122+
tsk_size_t len; // Number of rows
1123+
tsk_bitset_val_t *data;
1124+
} tsk_bitset_t;
1125+
1126+
int tsk_bitset_init(tsk_bitset_t *self, tsk_size_t num_bits, tsk_size_t length);
1127+
void tsk_bitset_free(tsk_bitset_t *self);
1128+
void tsk_bitset_intersect(const tsk_bitset_t *self, tsk_size_t self_row,
1129+
const tsk_bitset_t *other, tsk_size_t other_row, tsk_bitset_t *out);
1130+
void tsk_bitset_subtract(tsk_bitset_t *self, tsk_size_t self_row,
1131+
const tsk_bitset_t *other, tsk_size_t other_row);
1132+
void tsk_bitset_union(tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other,
1133+
tsk_size_t other_row);
1134+
void tsk_bitset_set_bit(tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit);
1135+
bool tsk_bitset_contains(
1136+
const tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit);
1137+
tsk_size_t tsk_bitset_count(const tsk_bitset_t *self, tsk_size_t row);
1138+
void tsk_bitset_get_items(
1139+
const tsk_bitset_t *self, tsk_size_t row, tsk_id_t *items, tsk_size_t *n_items);
11301140

11311141
#ifdef __cplusplus
11321142
}

0 commit comments

Comments
 (0)