From 0a6e0e7983a7835bed7594c3527f46b53c2c70da Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 18 Dec 2025 10:43:20 +0300 Subject: [PATCH 01/18] frev: drop SET_ATTRIB Instead, backport and use CLEAR_ATTRIB (R >= 4.5). --- src/data.table.h | 1 + src/utils.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/data.table.h b/src/data.table.h index 0bfcb09d8f..0fb796eb05 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -15,6 +15,7 @@ #endif #if R_VERSION < R_Version(4, 5, 0) # define isDataFrame(x) isFrame(x) // #6180 +# define CLEAR_ATTRIB(x) SET_ATTRIB(x, R_NilValue) #endif #include #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT diff --git a/src/utils.c b/src/utils.c index fa95c60e78..1608f90202 100644 --- a/src/utils.c +++ b/src/utils.c @@ -625,7 +625,7 @@ SEXP frev(SEXP x, SEXP copyArg) { SEXP levels = PROTECT(getAttrib(x, R_LevelsSymbol)); nprotect += 2; // swipe attributes from x - SET_ATTRIB(x, R_NilValue); + CLEAR_ATTRIB(x); setAttrib(x, R_NamesSymbol, names); setAttrib(x, R_ClassSymbol, klass); setAttrib(x, R_LevelsSymbol, levels); From 2e0c3893a3607ab7813df03a4f330e6eb8300108 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 18 Dec 2025 11:02:40 +0300 Subject: [PATCH 02/18] mergeIndexAttrib: drop SET_ATTRIB Use SHALLOW_DUPLICATE_ATTRIB (R >= 3.3) for the simple case. Also, Backport ANY_ATTRIB (R >= 4.5) instead of testing !isNull(ATTRIB(.)). --- src/data.table.h | 1 + src/mergelist.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index 0fb796eb05..111ae33738 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -16,6 +16,7 @@ #if R_VERSION < R_Version(4, 5, 0) # define isDataFrame(x) isFrame(x) // #6180 # define CLEAR_ATTRIB(x) SET_ATTRIB(x, R_NilValue) +# define ANY_ATTRIB(x) (!(isNull(ATTRIB(x)))) #endif #include #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT diff --git a/src/mergelist.c b/src/mergelist.c index 51f28d224a..aed75e4249 100644 --- a/src/mergelist.c +++ b/src/mergelist.c @@ -22,11 +22,11 @@ void mergeIndexAttrib(SEXP to, SEXP from) { internal_error(__func__, "'to' must be integer() already"); // # nocov if (isNull(from)) return; - SEXP t = ATTRIB(to), f = ATTRIB(from); - if (isNull(t)) // target has no attributes -> overwrite - SET_ATTRIB(to, shallow_duplicate(f)); + if (!ANY_ATTRIB(to)) // target has no attributes -> overwrite + SHALLOW_DUPLICATE_ATTRIB(to, from); else { - for (t = ATTRIB(to); CDR(t) != R_NilValue; t = CDR(t)); // traverse to end of attributes list of to + SEXP t = ATTRIB(to), f = ATTRIB(from); + for (; CDR(t) != R_NilValue; t = CDR(t)); // traverse to end of attributes list of to SETCDR(t, shallow_duplicate(f)); } } From 58d586b0857523200c7e98cdd278398bb7dc0509 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 18 Dec 2025 11:08:56 +0300 Subject: [PATCH 03/18] cbindlist: use ANY_ATTRIB --- src/mergelist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mergelist.c b/src/mergelist.c index aed75e4249..5ecb45609c 100644 --- a/src/mergelist.c +++ b/src/mergelist.c @@ -84,7 +84,7 @@ SEXP cbindlist(SEXP x, SEXP copyArg) { key = getAttrib(thisx, sym_sorted); UNPROTECT(protecti); // thisnames, thisxcol } - if (isNull(ATTRIB(index))) + if (!ANY_ATTRIB(index)) setAttrib(ans, sym_index, R_NilValue); setAttrib(ans, R_NamesSymbol, names); setAttrib(ans, sym_sorted, key); From 402f0d55452d75470f3dfa5c6c9258af70a0a5ed Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 18 Dec 2025 11:13:10 +0300 Subject: [PATCH 04/18] nafillR: use ANY_ATTRIB --- src/nafill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nafill.c b/src/nafill.c index 5c9568efb5..ff2a7fc344 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -218,7 +218,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S if (!binplace) { for (R_len_t i=0; i Date: Thu, 18 Dec 2025 20:55:22 +0300 Subject: [PATCH 05/18] Backport R_mapAttrib --- src/data.table.h | 8 ++++++++ src/utils.c | 17 +++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/src/data.table.h b/src/data.table.h index 111ae33738..2f10368487 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -105,6 +105,11 @@ } # define R_resizeVector(x, newlen) R_resizeVector_(x, newlen) #endif +// TODO(R>=4.6.0): remove the SVN revision check +#if R_VERSION < R_Version(4, 6, 0) || R_SVN_REVISION < 89194 +# define BACKPORT_MAP_ATTRIB +# define R_mapAttrib(x, fun, ctx) R_mapAttrib_(x, fun, ctx) +#endif // init.c extern SEXP char_integer64; @@ -345,6 +350,9 @@ SEXP R_allocResizableVector_(SEXPTYPE type, R_xlen_t maxlen); SEXP R_duplicateAsResizable_(SEXP x); void R_resizeVector_(SEXP x, R_xlen_t newlen); #endif +#ifdef BACKPORT_MAP_ATTRIB +SEXP R_mapAttrib_(SEXP x, SEXP (*fun)(SEXP key, SEXP val, void *ctx), void *ctx); +#endif // types.c char *end(char *start); diff --git a/src/utils.c b/src/utils.c index 1608f90202..5df924e928 100644 --- a/src/utils.c +++ b/src/utils.c @@ -672,3 +672,20 @@ void R_resizeVector_(SEXP x, R_xlen_t newlen) { SETLENGTH(x, newlen); } #endif + +#ifdef BACKPORT_MAP_ATTRIB +SEXP R_mapAttrib_(SEXP x, SEXP (*fun)(SEXP key, SEXP val, void *ctx), void *ctx) { + PROTECT_INDEX i; + SEXP a = ATTRIB(x); + PROTECT_WITH_INDEX(a, &i); + + SEXP ret = NULL; + for (; !isNull(a); REPROTECT(a = CDR(a), i)) { + ret = fun(TAG(a), CAR(a), ctx); + if (ret) break; + } + + UNPROTECT(1); + return ret; +} +#endif From ff280b5f1f7cf0170b917de32f676600886d3418 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 18 Dec 2025 23:16:24 +0300 Subject: [PATCH 06/18] anySpecialStatic: switch to R_mapAttrib --- src/dogroups.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/dogroups.c b/src/dogroups.c index 373242516f..7423e7c00a 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -3,6 +3,8 @@ #include #include +static SEXP attribWalker(SEXP key, SEXP val, void *ctx); + static bool anySpecialStatic(SEXP x, hashtab * specials) { // Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd // Static because these are like C static arrays which are the same memory for each group; e.g., dogroups @@ -54,15 +56,18 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) { list_el = VECTOR_ELT(x,i); if (anySpecialStatic(list_el, specials)) return true; - for(attribs = ATTRIB(list_el); attribs != R_NilValue; attribs = CDR(attribs)) { - if (anySpecialStatic(CAR(attribs), specials)) - return true; // #4936 - } + if (R_mapAttrib(list_el, attribWalker, specials)) + return true; // #4936 } } return false; } +static SEXP attribWalker(SEXP key, SEXP val, void *specials) { + (void)key; + return anySpecialStatic(val, specials) ? R_NilValue : NULL; +} + SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg, SEXP showProgressArg) { R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp; From e4f178819aa9d825694258973fbcf1eb49f8c316 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Thu, 18 Dec 2025 23:56:02 +0300 Subject: [PATCH 07/18] dogroups: construct rownames anew Instead of trying to walk ATTRIB in search of the compact 'rownames' attribute to modify, install it anew, take note of the returned reference to the value being installed (a different one!) and modify that. --- src/dogroups.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/dogroups.c b/src/dogroups.c index 7423e7c00a..21c12618a1 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -72,7 +72,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX { R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp; int nprotect=0; - SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, target, source; + SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, s, RHS, target, source; Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE; const bool verbose = LOGICAL(verboseArg)[0]==1; double tstart=0, tblock[10]={0}; int nblock[10]={0}; // For verbose printing, tstart is updated each block @@ -135,12 +135,15 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX R_LockBinding(install(".I"), env); SEXP dtnames = PROTECT(getAttrib(dt, R_NamesSymbol)); nprotect++; // added here to fix #91 - `:=` did not issue recycling warning during "by" - // fetch rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to + + // override rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to // non data.table aware package that uses rownames - for (s = ATTRIB(SD); s != R_NilValue && TAG(s)!=R_RowNamesSymbol; s = CDR(s)); // getAttrib0 basically but that's hidden in attrib.c; #loop_counter_not_local_scope_ok - if (s==R_NilValue) error(_("row.names attribute of .SD not found")); - rownames = CAR(s); - if (!isInteger(rownames) || LENGTH(rownames)!=2 || INTEGER(rownames)[0]!=NA_INTEGER) error(_("row.names of .SD isn't integer length 2 with NA as first item; i.e., .set_row_names(). [%s %d %d]"),type2char(TYPEOF(rownames)),LENGTH(rownames),INTEGER(rownames)[0]); + SEXP rownames; + PROTECT_INDEX rownamesi; + PROTECT_WITH_INDEX(rownames = allocVector(INTSXP, 2), &rownamesi); nprotect++; + INTEGER(rownames)[0] = NA_INTEGER; + INTEGER(rownames)[1] = -maxGrpSize; + REPROTECT(rownames = setAttrib(SD, R_RowNamesSymbol, rownames), rownamesi); // fetch names of .SD and prepare symbols. In case they are copied-on-write by user assigning to those variables // using <- in j (which is valid, useful and tested), they are repointed to the .SD cols for each group. From 3c6675717f3e8c68b0f07784586d81fe998f9719 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Fri, 19 Dec 2025 00:14:12 +0300 Subject: [PATCH 08/18] mergeIndexAttrib: switch to R_mapAttrib --- src/mergelist.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/mergelist.c b/src/mergelist.c index 5ecb45609c..90854ae824 100644 --- a/src/mergelist.c +++ b/src/mergelist.c @@ -17,6 +17,12 @@ SEXP copyCols(SEXP x, SEXP cols) { return R_NilValue; } +static SEXP setDuplicateOneAttrib(SEXP key, SEXP val, void *x) { + setAttrib(x, PROTECT(key), PROTECT(shallow_duplicate(val))); + UNPROTECT(2); + return NULL; // continue +} + void mergeIndexAttrib(SEXP to, SEXP from) { if (!isInteger(to) || LENGTH(to)!=0) internal_error(__func__, "'to' must be integer() already"); // # nocov @@ -24,11 +30,8 @@ void mergeIndexAttrib(SEXP to, SEXP from) { return; if (!ANY_ATTRIB(to)) // target has no attributes -> overwrite SHALLOW_DUPLICATE_ATTRIB(to, from); - else { - SEXP t = ATTRIB(to), f = ATTRIB(from); - for (; CDR(t) != R_NilValue; t = CDR(t)); // traverse to end of attributes list of to - SETCDR(t, shallow_duplicate(f)); - } + else + R_mapAttrib(from, setDuplicateOneAttrib, to); } SEXP cbindlist(SEXP x, SEXP copyArg) { From 2c10c371e64ac5362ad49a1f7891450bd8c5651c Mon Sep 17 00:00:00 2001 From: Ivan K Date: Fri, 19 Dec 2025 00:42:22 +0300 Subject: [PATCH 09/18] assign: factor out index fixup Instead of walking the attribute list directly, use R_mapAttrib(). Create a hash table of index names instead of relying on chin() and a temporary string vector. Move all temporary allocations onto the R heap. --- src/assign.c | 195 ++++++++++++++++++++++++++++----------------------- 1 file changed, 106 insertions(+), 89 deletions(-) diff --git a/src/assign.c b/src/assign.c index 849cb08f2a..fb79eab327 100644 --- a/src/assign.c +++ b/src/assign.c @@ -256,6 +256,99 @@ SEXP selfrefokwrapper(SEXP x, SEXP verbose) { return ScalarInteger(_selfrefok(x,FALSE,LOGICAL(verbose)[0])); } +struct attrib_name_ctx { + hashtab *indexNames; + SEXP index, assignedNames; + R_xlen_t len; + bool verbose; +}; +static SEXP getOneAttribName(SEXP key, SEXP val, void *ctx_) { + (void)val; + struct attrib_name_ctx *ctx = ctx_; + if (ctx->indexNames) + hash_set(ctx->indexNames, PRINTNAME(key), 1); + else + ctx->len++; + return NULL; +} + +static SEXP fixIndexAttrib(SEXP tag, SEXP value, void *ctx_) { + const struct attrib_name_ctx *ctx = ctx_; + + hashtab *indexNames = ctx->indexNames; + SEXP index = ctx->index, assignedNames = ctx->assignedNames; + R_xlen_t indexLength = xlength(value); + bool verbose = ctx->verbose; + + const char *tc1, *c1; + tc1 = c1 = CHAR(PRINTNAME(tag)); // the index name; e.g. "__col1__col2" + + if (*tc1!='_' || *(tc1+1)!='_') { + // fix for #1396 + if (verbose) { + Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1); + } + setAttrib(index, tag, R_NilValue); + return NULL; + } + + tc1 += 2; // tc1 always marks the start of a key column + if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov + + void *vmax = vmaxget(); + // check the position of the first appearance of an assigned column in the index. + // the new index will be truncated to this position. + size_t newKeyLength = strlen(c1); + char *s4 = R_alloc(newKeyLength + 3, 1); + memcpy(s4, c1, newKeyLength); + memcpy(s4 + newKeyLength, "__", 3); + + for(int i = 0; i < xlength(assignedNames); i++){ + const char *tc2 = CHAR(STRING_ELT(assignedNames, i)); + void *vmax2 = vmaxget(); + size_t tc2_len = strlen(tc2); + char *s5 = R_alloc(tc2_len + 5, 1); //4 * '_' + \0 + memcpy(s5, "__", 2); + memcpy(s5 + 2, tc2, tc2_len); + memcpy(s5 + 2 + tc2_len, "__", 3); + tc2 = strstr(s4, s5); + if(tc2 && (tc2 - s4 < newKeyLength)){ // new column is part of key; match is before last match + newKeyLength = tc2 - s4; + } + vmaxset(vmax2); + } + + s4[newKeyLength] = '\0'; // truncate the new key to the new length + if(newKeyLength == 0){ // no valid key column remains. Drop the key + setAttrib(index, tag, R_NilValue); + hash_set(indexNames, PRINTNAME(tag), 0); + if (verbose) { + Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); + } + } else if(newKeyLength < strlen(c1)) { + SEXP s4Str = PROTECT(mkChar(s4)); + if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372) + !hash_lookup(indexNames, s4Str, 0)) { // index with shortened name not present yet + setAttrib(index, installChar(s4Str), value); + hash_set(indexNames, PRINTNAME(tag), 0); + setAttrib(index, tag, R_NilValue); + hash_set(indexNames, s4Str, 1); + if (verbose) + Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4+2); + } else { // indexLength > 0 || shortened name present already + // indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372) + // shortened name already present indicates that index needs to be dropped to avoid duplicate indices. + setAttrib(index, tag, R_NilValue); + hash_set(indexNames, tag, 0); + if (verbose) + Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); + } + UNPROTECT(1); // s4Str + } //else: index is not affected by assign: nothing to be done + vmaxset(vmax); + return NULL; +} + int *_Last_updated = NULL; SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) @@ -265,11 +358,11 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // cols : column names or numbers corresponding to the values to set // rows : row numbers to assign R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength; - SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames, indexNames; + SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames; bool verbose=GetVerbose(); int ndelete=0; // how many columns are being deleted const char *c1, *tc1, *tc2; - int *buf, indexNo; + int *buf; if (isNull(dt)) error(_("assign has been passed a NULL dt")); if (TYPEOF(dt) != VECSXP) error(_("dt passed to assign isn't type VECSXP")); if (islocked(dt)) @@ -549,93 +642,17 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) } index = getAttrib(dt, install("index")); if (index != R_NilValue) { - s = ATTRIB(index); - indexNo = 0; - // get a vector with all index names - PROTECT(indexNames = allocVector(STRSXP, xlength(s))); protecti++; - while(s != R_NilValue){ - SET_STRING_ELT(indexNames, indexNo, PRINTNAME(TAG(s))); - indexNo++; - s = CDR(s); - } - s = ATTRIB(index); // reset to first element - indexNo = 0; - while(s != R_NilValue) { - a = TAG(s); - indexLength = xlength(CAR(s)); - tc1 = c1 = CHAR(PRINTNAME(a)); // the index name; e.g. "__col1__col2" - if (*tc1!='_' || *(tc1+1)!='_') { - // fix for #1396 - if (verbose) { - Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1); - } - setAttrib(index, a, R_NilValue); - indexNo++; - s = CDR(s); - continue; // with next index - } - tc1 += 2; // tc1 always marks the start of a key column - if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov - // check the position of the first appearance of an assigned column in the index. - // the new index will be truncated to this position. - char *s4 = malloc(strlen(c1) + 3); - if (!s4) { - internal_error(__func__, "Couldn't allocate memory for s4"); // # nocov - } - memcpy(s4, c1, strlen(c1)); - memset(s4 + strlen(c1), '\0', 1); - strcat(s4, "__"); // add trailing '__' to newKey so we can search for pattern '__colName__' also at the end of the index. - int newKeyLength = strlen(c1); - for(int i = 0; i < xlength(assignedNames); i++){ - tc2 = CHAR(STRING_ELT(assignedNames, i)); - char *s5 = malloc(strlen(tc2) + 5); //4 * '_' + \0 - if (!s5) { - free(s4); // # nocov - internal_error(__func__, "Couldn't allocate memory for s5"); // # nocov - } - memset(s5, '_', 2); - memset(s5 + 2, '\0', 1); - strcat(s5, tc2); - strcat(s5, "__"); - tc2 = strstr(s4, s5); - if(tc2 == NULL){ // column is not part of key - free(s5); - continue; - } - if(tc2 - s4 < newKeyLength){ // new column match is before last match - newKeyLength = tc2 - s4; - } - free(s5); - } - memset(s4 + newKeyLength, '\0', 1); // truncate the new key to the new length - if(newKeyLength == 0){ // no valid key column remains. Drop the key - setAttrib(index, a, R_NilValue); - SET_STRING_ELT(indexNames, indexNo, NA_STRING); - if (verbose) { - Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); - } - } else if(newKeyLength < strlen(c1)) { - SEXP s4Str = PROTECT(mkString(s4)); - if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372) - LOGICAL(chin(s4Str, indexNames))[0] == 0) {// index with shortened name not present yet - SET_TAG(s, install(s4)); - SET_STRING_ELT(indexNames, indexNo, mkChar(s4)); - if (verbose) - Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4 + 2); - } else { // indexLength > 0 || shortened name present already - // indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372) - // shortened name already present indicates that index needs to be dropped to avoid duplicate indices. - setAttrib(index, a, R_NilValue); - SET_STRING_ELT(indexNames, indexNo, NA_STRING); - if (verbose) - Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); - } - UNPROTECT(1); // s4Str - } //else: index is not affected by assign: nothing to be done - free(s4); - indexNo ++; - s = CDR(s); - } + struct attrib_name_ctx ctx = { 0, }; + R_mapAttrib(index, getOneAttribName, &ctx); // how many attributes? + hashtab *h = hash_create(ctx.len); + PROTECT(h->prot); + ctx.indexNames = h; + R_mapAttrib(index, getOneAttribName, &ctx); // now remember the names + ctx.index = index; + ctx.assignedNames = assignedNames; + ctx.verbose = verbose; + R_mapAttrib(index, fixIndexAttrib, &ctx); // adjust indices as needed + UNPROTECT(1); // h } if (ndelete) { // delete any columns assigned NULL (there was a 'continue' earlier in loop above) From e1fa0a63b32fe968cd4cfbbae9364a7055d1b136 Mon Sep 17 00:00:00 2001 From: HughParsonage Date: Sun, 21 Dec 2025 22:13:00 +0300 Subject: [PATCH 10/18] assign: drop indexLength --- src/assign.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/assign.c b/src/assign.c index fb79eab327..94c27d598f 100644 --- a/src/assign.c +++ b/src/assign.c @@ -357,7 +357,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // newcolnames : add these columns (if any) // cols : column names or numbers corresponding to the values to set // rows : row numbers to assign - R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength; + R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum; SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames; bool verbose=GetVerbose(); int ndelete=0; // how many columns are being deleted From 09b1f7eee884e4925244fd7f2379be63d6b0e929 Mon Sep 17 00:00:00 2001 From: HughParsonage Date: Sun, 21 Dec 2025 22:22:03 +0300 Subject: [PATCH 11/18] assign: fix index unmarking --- src/assign.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/assign.c b/src/assign.c index 94c27d598f..235875ac9d 100644 --- a/src/assign.c +++ b/src/assign.c @@ -339,7 +339,7 @@ static SEXP fixIndexAttrib(SEXP tag, SEXP value, void *ctx_) { // indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372) // shortened name already present indicates that index needs to be dropped to avoid duplicate indices. setAttrib(index, tag, R_NilValue); - hash_set(indexNames, tag, 0); + hash_set(indexNames, PRINTNAME(tag), 0); if (verbose) Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); } From 1a408bf6087e49e7aee6fe86630d80e5c4dbcd68 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 21 Dec 2025 22:46:43 +0300 Subject: [PATCH 12/18] Comments, better field names --- src/assign.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/assign.c b/src/assign.c index 235875ac9d..bc07cf2c69 100644 --- a/src/assign.c +++ b/src/assign.c @@ -257,21 +257,25 @@ SEXP selfrefokwrapper(SEXP x, SEXP verbose) { } struct attrib_name_ctx { - hashtab *indexNames; - SEXP index, assignedNames; - R_xlen_t len; + hashtab *indexNames; // stores a 1/0 mark for every CHARSXP index name + R_xlen_t indexNamesLen; // how much memory to allocate for the hash? + SEXP index; // attr(DT, "index") + SEXP assignedNames; // STRSXP vector of variable names just assigned bool verbose; }; + +// Mark each CHARSXP attribute name with a 1 inside the hash, or count them to find out the allocation size. static SEXP getOneAttribName(SEXP key, SEXP val, void *ctx_) { (void)val; struct attrib_name_ctx *ctx = ctx_; if (ctx->indexNames) hash_set(ctx->indexNames, PRINTNAME(key), 1); else - ctx->len++; + ctx->indexNamesLen++; return NULL; } +// For a given index, find out if it sorts a column that has just been assigned. If so, shorten the index (if an equivalent one doesn't already exist) or remove it altogether. static SEXP fixIndexAttrib(SEXP tag, SEXP value, void *ctx_) { const struct attrib_name_ctx *ctx = ctx_; @@ -644,7 +648,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) if (index != R_NilValue) { struct attrib_name_ctx ctx = { 0, }; R_mapAttrib(index, getOneAttribName, &ctx); // how many attributes? - hashtab *h = hash_create(ctx.len); + hashtab *h = hash_create(ctx.indexNamesLen); PROTECT(h->prot); ctx.indexNames = h; R_mapAttrib(index, getOneAttribName, &ctx); // now remember the names From 7381ec842be7de710e6237a6aff31ebfa607c0ef Mon Sep 17 00:00:00 2001 From: aitap Date: Sat, 3 Jan 2026 20:13:16 +0000 Subject: [PATCH 13/18] Update src/dogroups.c Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> --- src/dogroups.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dogroups.c b/src/dogroups.c index e8fbd56aa9..92e8e069b9 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -41,7 +41,7 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) { // with PR#4164 started to copy input list columns too much. Hence PR#4655 in v1.13.2 moved that copy here just where it is needed. // Currently the marker is negative truelength. These specials are protected by us here and before we release them // we restore the true truelength for when R starts to use vector truelength. - SEXP attribs, list_el; + SEXP list_el; const int n = length(x); // use length() not LENGTH() because isNewList() is true for NULL if (n==0) From 987c613f61fff3a509ee09bc2c1e206fd1b78d6a Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 4 Jan 2026 00:34:00 +0300 Subject: [PATCH 14/18] mapAttrib: protect the attribute value Otherwise the callback could remove the attribute and end up with the value unprotected. Protect the attribute tag as well for uniformity. Co-Authored-By: HughParsonage --- src/utils.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/utils.c b/src/utils.c index 43aef0134e..c3d311cdb7 100644 --- a/src/utils.c +++ b/src/utils.c @@ -686,7 +686,8 @@ SEXP R_mapAttrib_(SEXP x, SEXP (*fun)(SEXP key, SEXP val, void *ctx), void *ctx) SEXP ret = NULL; for (; !isNull(a); REPROTECT(a = CDR(a), i)) { - ret = fun(TAG(a), CAR(a), ctx); + ret = fun(PROTECT(TAG(a)), PROTECT(CAR(a)), ctx); + UNPROTECT(2); if (ret) break; } From ca0e85ce455fb508ddc35654d7bbfdd6835d3d8e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Sun, 4 Jan 2026 00:37:53 +0300 Subject: [PATCH 15/18] dogroups: look up rownames using mapAttrib This solution is closer to the working approach previously taken by the code. --- src/dogroups.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/dogroups.c b/src/dogroups.c index 92e8e069b9..8a787173ca 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -67,6 +67,12 @@ static SEXP attribWalker(SEXP key, SEXP val, void *specials) { return anySpecialStatic(val, specials) ? R_NilValue : NULL; } +static SEXP findRowNames(SEXP key, SEXP val, void *data) { + (void)data; + if (key == R_RowNamesSymbol) return val; + return NULL; +} + SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg, SEXP showProgressArg) { R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp; @@ -137,12 +143,9 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX // override rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to // non data.table aware package that uses rownames - SEXP rownames; - PROTECT_INDEX rownamesi; - PROTECT_WITH_INDEX(rownames = allocVector(INTSXP, 2), &rownamesi); nprotect++; - INTEGER(rownames)[0] = NA_INTEGER; - INTEGER(rownames)[1] = -maxGrpSize; - REPROTECT(rownames = setAttrib(SD, R_RowNamesSymbol, rownames), rownamesi); + SEXP rownames = R_mapAttrib(SD, findRowNames, NULL); + if (rownames == NULL) error(_("row.names attribute of .SD not found")); + if (!isInteger(rownames) || LENGTH(rownames)!=2 || INTEGER(rownames)[0]!=NA_INTEGER) error(_("row.names of .SD isn't integer length 2 with NA as first item; i.e., .set_row_names(). [%s %d %d]"),type2char(TYPEOF(rownames)),LENGTH(rownames),INTEGER(rownames)[0]); // fetch names of .SD and prepare symbols. In case they are copied-on-write by user assigning to those variables // using <- in j (which is valid, useful and tested), they are repointed to the .SD cols for each group. From 012cf3a20a2f2cd32f5f027852b24def4f1f4f31 Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 4 Jan 2026 00:48:11 +0300 Subject: [PATCH 16/18] Fix comment, function name --- src/assign.c | 2 +- src/dogroups.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/assign.c b/src/assign.c index bc07cf2c69..4cc8bccb13 100644 --- a/src/assign.c +++ b/src/assign.c @@ -257,7 +257,7 @@ SEXP selfrefokwrapper(SEXP x, SEXP verbose) { } struct attrib_name_ctx { - hashtab *indexNames; // stores a 1/0 mark for every CHARSXP index name + hashtab *indexNames; // stores a 1 for every CHARSXP index name in use, 0 for removed R_xlen_t indexNamesLen; // how much memory to allocate for the hash? SEXP index; // attr(DT, "index") SEXP assignedNames; // STRSXP vector of variable names just assigned diff --git a/src/dogroups.c b/src/dogroups.c index 8a787173ca..935817032f 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -3,7 +3,7 @@ #include #include -static SEXP attribWalker(SEXP key, SEXP val, void *ctx); +static SEXP anySpecialAttribute(SEXP key, SEXP val, void *ctx); static bool anySpecialStatic(SEXP x, hashtab * specials) { // Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd @@ -55,14 +55,14 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) { list_el = VECTOR_ELT(x,i); if (anySpecialStatic(list_el, specials)) return true; - if (R_mapAttrib(list_el, attribWalker, specials)) + if (R_mapAttrib(list_el, anySpecialAttribute, specials)) return true; // #4936 } } return false; } -static SEXP attribWalker(SEXP key, SEXP val, void *specials) { +static SEXP anySpecialAttribute(SEXP key, SEXP val, void *specials) { (void)key; return anySpecialStatic(val, specials) ? R_NilValue : NULL; } From b1daaabf2e5d83c2cf8bb3095195059a8fc6e3ad Mon Sep 17 00:00:00 2001 From: Ivan K Date: Mon, 5 Jan 2026 00:31:52 +0300 Subject: [PATCH 17/18] Protect the newly found rownames attribute --- src/dogroups.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dogroups.c b/src/dogroups.c index 935817032f..ce5453c75c 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -143,7 +143,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX // override rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to // non data.table aware package that uses rownames - SEXP rownames = R_mapAttrib(SD, findRowNames, NULL); + SEXP rownames = PROTECT(R_mapAttrib(SD, findRowNames, NULL)); nprotect++; if (rownames == NULL) error(_("row.names attribute of .SD not found")); if (!isInteger(rownames) || LENGTH(rownames)!=2 || INTEGER(rownames)[0]!=NA_INTEGER) error(_("row.names of .SD isn't integer length 2 with NA as first item; i.e., .set_row_names(). [%s %d %d]"),type2char(TYPEOF(rownames)),LENGTH(rownames),INTEGER(rownames)[0]); From b4f57028df48139897567779fd283a7ab79a6751 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 8 Jan 2026 22:50:23 -0800 Subject: [PATCH 18/18] add NEWS entry --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 256c7450ac..f2d994bf12 100644 --- a/NEWS.md +++ b/NEWS.md @@ -26,6 +26,8 @@ 3. Vignettes are now built using `litedown` instead of `knitr`, [#6394](https://github.com/Rdatatable/data.table/issues/6394). Thanks @jangorecki for the suggestion and @ben-schwen and @aitap for the implementation. +4. Removed use of non-API macros `ATTRIB`, `SET_ATTRIB`, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks @aitap for the continued assiduous work here. + ### BUG FIXES 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix.