diff --git a/tree/ntuple/inc/ROOT/RField/RFieldFundamental.hxx b/tree/ntuple/inc/ROOT/RField/RFieldFundamental.hxx index c0b66225023b3..6318eb86757de 100644 --- a/tree/ntuple/inc/ROOT/RField/RFieldFundamental.hxx +++ b/tree/ntuple/inc/ROOT/RField/RFieldFundamental.hxx @@ -375,8 +375,11 @@ protected: { const auto r = Base::GetColumnRepresentatives(); const auto n = r.size(); + // Note that we don't assume that we have 0 columns here, as this function may be called to extend the + // column representations of this field. + const auto first = fAvailableColumns.size(); fAvailableColumns.reserve(n); - for (std::uint16_t i = 0; i < n; ++i) { + for (auto i = first; i < n; ++i) { auto &column = fAvailableColumns.emplace_back(ROOT::Internal::RColumn::Create(r[i][0], 0, i)); if (r[i][0] == ROOT::ENTupleColumnType::kReal32Trunc) { column->SetBitsOnStorage(fBitWidth); diff --git a/tree/ntuple/inc/ROOT/RFieldBase.hxx b/tree/ntuple/inc/ROOT/RFieldBase.hxx index 9478022407360..a89b40b3e44dc 100644 --- a/tree/ntuple/inc/ROOT/RFieldBase.hxx +++ b/tree/ntuple/inc/ROOT/RFieldBase.hxx @@ -56,6 +56,8 @@ void CallFlushColumnsOnField(RFieldBase &); void CallCommitClusterOnField(RFieldBase &); void CallConnectPageSinkOnField(RFieldBase &, ROOT::Internal::RPageSink &, ROOT::NTupleSize_t firstEntry = 0); void CallConnectPageSourceOnField(RFieldBase &, ROOT::Internal::RPageSource &); +void CallConnectExtendedColumnsToPageSinkOnField(RFieldBase &, ROOT::Internal::RPageSink &, + ROOT::NTupleSize_t firstEntry); ROOT::RResult> CallFieldBaseCreate(const std::string &fieldName, const std::string &typeName, const ROOT::RCreateFieldOptions &options, const ROOT::RNTupleDescriptor *desc, ROOT::DescriptorId_t fieldId); @@ -86,6 +88,8 @@ class RFieldBase { friend void Internal::CallCommitClusterOnField(RFieldBase &); friend void Internal::CallConnectPageSinkOnField(RFieldBase &, ROOT::Internal::RPageSink &, ROOT::NTupleSize_t); friend void Internal::CallConnectPageSourceOnField(RFieldBase &, ROOT::Internal::RPageSource &); + friend void + Internal::CallConnectExtendedColumnsToPageSinkOnField(RFieldBase &, ROOT::Internal::RPageSink &, ROOT::NTupleSize_t); friend ROOT::RResult> Internal::CallFieldBaseCreate(const std::string &fieldName, const std::string &typeName, const ROOT::RCreateFieldOptions &options, const ROOT::RNTupleDescriptor *desc, @@ -261,6 +265,9 @@ private: /// calling this function. For subfields, a field ID may or may not be set. If the field ID is unset, it will be /// determined using the page source descriptor, based on the parent field ID and the subfield name. void ConnectPageSource(ROOT::Internal::RPageSource &pageSource); + /// Similar to ConnectPageSink, but only used to connect new columns that were added via late model extension. + /// The field must be already connected to the sink. + void ConnectExtendedColumnsToPageSink(ROOT::Internal::RPageSink &pageSink, ROOT::NTupleSize_t firstEntry = 0); void SetArtificial() { @@ -357,8 +364,13 @@ protected: GenerateColumnsImpl<0, ColumnCppTs...>(GetColumnRepresentations().GetSerializationDefault(), 0); } else { const auto N = fColumnRepresentatives.size(); - fAvailableColumns.reserve(N * sizeof...(ColumnCppTs)); - for (unsigned i = 0; i < N; ++i) { + constexpr auto cardinality = sizeof...(ColumnCppTs); + static_assert(cardinality > 0, "GenerateColumnsImpl must be called with at least 1 type argument"); + fAvailableColumns.reserve(N * cardinality); + // Note that we don't assume that we have 0 columns here, as this function may be called to extend the + // column representations of this field. + const auto first = fAvailableColumns.size(); + for (auto i = first / cardinality; i < N; ++i) { GenerateColumnsImpl<0, ColumnCppTs...>(fColumnRepresentatives[i].get(), i); } } @@ -958,6 +970,7 @@ struct RFieldRepresentationModifier { const auto N = field.fColumnRepresentatives[0].get().size(); R__ASSERT(N >= 1 && N <= 2); R__ASSERT(field.fPrincipalColumn); + R__ASSERT(newRepresentationIdx * N < field.fAvailableColumns.size()); field.fPrincipalColumn = field.fAvailableColumns[newRepresentationIdx * N].get(); if (field.fAuxiliaryColumn) { R__ASSERT(N == 2); diff --git a/tree/ntuple/inc/ROOT/RNTupleFillContext.hxx b/tree/ntuple/inc/ROOT/RNTupleFillContext.hxx index 87e25905987d1..bfc104c1a8d2c 100644 --- a/tree/ntuple/inc/ROOT/RNTupleFillContext.hxx +++ b/tree/ntuple/inc/ROOT/RNTupleFillContext.hxx @@ -31,6 +31,11 @@ namespace ROOT { +namespace Internal { +// Used for testing +RPageSink &GetWriterSink(ROOT::RNTupleWriter &writer); +} // namespace Internal + // clang-format off /** \class ROOT::RNTupleFillContext @@ -49,6 +54,7 @@ sequential writing, please refer to RNTupleWriter. class RNTupleFillContext { friend class ROOT::RNTupleWriter; friend class RNTupleParallelWriter; + friend Internal::RPageSink &Internal::GetWriterSink(RNTupleWriter &); private: /// The page sink's parallel page compression scheduler if IMT is on. diff --git a/tree/ntuple/inc/ROOT/RNTupleModel.hxx b/tree/ntuple/inc/ROOT/RNTupleModel.hxx index ca2e5e0af569c..404a9de800542 100644 --- a/tree/ntuple/inc/ROOT/RNTupleModel.hxx +++ b/tree/ntuple/inc/ROOT/RNTupleModel.hxx @@ -402,11 +402,18 @@ You will not normally use this directly; see `RNTupleModel::RUpdater` instead. */ // clang-format on struct RNTupleModelChangeset { + struct RAddedColumnRepr { + ROOT::RFieldBase *fField = nullptr; + std::uint16_t fNumNewColumns = 0; + }; + RNTupleModel &fModel; /// Points to the fields in fModel that were added as part of an updater transaction std::vector fAddedFields; /// Points to the projected fields in fModel that were added as part of an updater transaction std::vector fAddedProjectedFields; + /// Points to fields in fModel that had new column representations appended to them. + std::vector fAddedColumnReprs; RNTupleModelChangeset(RNTupleModel &model) : fModel(model) {} bool IsEmpty() const { return fAddedFields.empty() && fAddedProjectedFields.empty(); } @@ -420,6 +427,9 @@ struct RNTupleModelChangeset { /// \see RNTupleModel::AddProjectedField() ROOT::RResult AddProjectedField(std::unique_ptr field, RNTupleModel::FieldMappingFunc_t mapping); + + ROOT::RResult + AddColumnRepr(ROOT::RFieldBase *field, const ROOT::RFieldBase::RColumnRepresentations::Selection_t &reprs); }; } // namespace Internal diff --git a/tree/ntuple/inc/ROOT/RNTupleWriter.hxx b/tree/ntuple/inc/ROOT/RNTupleWriter.hxx index 3535d2a6c72ef..892339a8a4aa2 100644 --- a/tree/ntuple/inc/ROOT/RNTupleWriter.hxx +++ b/tree/ntuple/inc/ROOT/RNTupleWriter.hxx @@ -121,6 +121,7 @@ class RNTupleWriter { friend std::unique_ptr Experimental::RNTupleWriter_Append(std::unique_ptr model, std::string_view ntuplePath, ROOT::Experimental::RFile &file, const ROOT::RNTupleWriteOptions &options); + friend Internal::RPageSink &Internal::GetWriterSink(RNTupleWriter &); private: RNTupleFillContext fFillContext; diff --git a/tree/ntuple/src/RFieldBase.cxx b/tree/ntuple/src/RFieldBase.cxx index a8c79c2f80553..a3179c46c78ad 100644 --- a/tree/ntuple/src/RFieldBase.cxx +++ b/tree/ntuple/src/RFieldBase.cxx @@ -85,6 +85,11 @@ void ROOT::Internal::CallConnectPageSourceOnField(RFieldBase &field, ROOT::Inter { field.ConnectPageSource(source); } +void ROOT::Internal::CallConnectExtendedColumnsToPageSinkOnField(RFieldBase &field, ROOT::Internal::RPageSink &sink, + ROOT::NTupleSize_t firstEntry) +{ + field.ConnectExtendedColumnsToPageSink(sink, firstEntry); +} ROOT::RResult> ROOT::Internal::CallFieldBaseCreate(const std::string &fieldName, const std::string &typeName, @@ -825,8 +830,17 @@ ROOT::RFieldBase::RColumnRepresentations::Selection_t ROOT::RFieldBase::GetColum void ROOT::RFieldBase::SetColumnRepresentatives(const RColumnRepresentations::Selection_t &representatives) { - if (fState != EState::kUnconnected) - throw RException(R__FAIL("cannot set column representative once field is connected")); + const auto isNewSelectionASuperset = [&]() { + if (representatives.size() < fColumnRepresentatives.size()) + return false; + for (auto i = 0u; i < fColumnRepresentatives.size(); ++i) + if (representatives[i] != fColumnRepresentatives[i].get()) + return false; + return true; + }; + if (fState != EState::kUnconnected && !(fState == EState::kConnectedToSink && isNewSelectionASuperset())) + throw RException(R__FAIL("cannot set column representative once field is connected unless the new selection is a " + "superset of the previous")); const auto &validTypes = GetColumnRepresentations().GetSerializationTypes(); fColumnRepresentatives.clear(); fColumnRepresentatives.reserve(representatives.size()); @@ -944,6 +958,23 @@ void ROOT::RFieldBase::ConnectPageSink(ROOT::Internal::RPageSink &pageSink, ROOT fState = EState::kConnectedToSink; } +void ROOT::RFieldBase::ConnectExtendedColumnsToPageSink(ROOT::Internal::RPageSink &pageSink, + ROOT::NTupleSize_t firstEntry) +{ + if (fState != EState::kConnectedToSink) + throw RException( + R__FAIL("invalid attempt to connect extended columns of a field that is not connected to a sink")); + + const auto nPrev = fAvailableColumns.size(); + GenerateColumns(); + + for (auto i = nPrev, len = fAvailableColumns.size(); i < len; ++i) { + auto &column = fAvailableColumns[i]; + auto firstElementIndex = (column->GetIndex() == 0) ? EntryToColumnElementIndex(firstEntry) : 0; + column->ConnectPageSink(fOnDiskId, pageSink, firstElementIndex); + } +} + void ROOT::RFieldBase::ConnectPageSource(ROOT::Internal::RPageSource &pageSource) { if (dynamic_cast(this)) { diff --git a/tree/ntuple/src/RNTupleModel.cxx b/tree/ntuple/src/RNTupleModel.cxx index e2fdeddd199fc..9e01f4af37424 100644 --- a/tree/ntuple/src/RNTupleModel.cxx +++ b/tree/ntuple/src/RNTupleModel.cxx @@ -287,6 +287,27 @@ ROOT::RNTupleModel::RUpdater::AddProjectedField(std::unique_ptr ROOT::Internal::RNTupleModelChangeset::AddColumnRepr( + ROOT::RFieldBase *field, const ROOT::RFieldBase::RColumnRepresentations::Selection_t &newReprs) +{ + auto reprs = field->GetColumnRepresentatives(); + const auto nPrev = reprs.size(); + for (auto newRepr : newReprs) + // NOTE: we don't need to check for duplicates because SetColumnRepresentatives will do that for us. + reprs.push_back(newRepr); + + field->SetColumnRepresentatives(reprs); + const auto nNew = field->GetColumnRepresentatives().size(); + assert(reprs.size() > 0 && reprs[0].size() <= 2); + const auto cardinality = static_cast(reprs[0].size()); + assert(nNew >= nPrev); + std::uint16_t nAdded = (nNew - nPrev) * cardinality; + if (nAdded > 0) + fAddedColumnReprs.push_back(RAddedColumnRepr{field, nAdded}); + + return RResult::Success(); +} + void ROOT::RNTupleModel::EnsureValidFieldName(std::string_view fieldName) { RResult nameValid = ROOT::Internal::EnsureValidNameForRNTuple(fieldName, "Field"); diff --git a/tree/ntuple/src/RNTupleWriter.cxx b/tree/ntuple/src/RNTupleWriter.cxx index bfa43770dcc61..4933e99ef35a0 100644 --- a/tree/ntuple/src/RNTupleWriter.cxx +++ b/tree/ntuple/src/RNTupleWriter.cxx @@ -160,3 +160,8 @@ ROOT::Experimental::RNTupleWriter_Append(std::unique_ptr mod auto sink = std::make_unique(ntupleBasename, file, ntupleDir, options); return ROOT::RNTupleWriter::Create(std::move(model), std::move(sink), options); } + +ROOT::Internal::RPageSink &ROOT::Internal::GetWriterSink(ROOT::RNTupleWriter &writer) +{ + return *writer.fFillContext.fSink; +} diff --git a/tree/ntuple/src/RPageSinkBuf.cxx b/tree/ntuple/src/RPageSinkBuf.cxx index 4d4843686cc59..5d2fd63f995ff 100644 --- a/tree/ntuple/src/RPageSinkBuf.cxx +++ b/tree/ntuple/src/RPageSinkBuf.cxx @@ -125,12 +125,20 @@ void ROOT::Internal::RPageSinkBuf::UpdateSchema(const ROOT::Internal::RNTupleMod GetProjectedFieldsOfModel(*fInnerModel).Add(std::move(cloned), fieldMap); return p; }; + auto cloneAddColumnRepr = [&](const RNTupleModelChangeset::RAddedColumnRepr &repr) { + auto &innerField = fInnerModel->GetMutableField(repr.fField->GetFieldName()); + innerField.SetColumnRepresentatives(repr.fField->GetColumnRepresentatives()); + ROOT::Internal::CallConnectExtendedColumnsToPageSinkOnField(innerField, *this, firstEntry); + return repr; + }; RNTupleModelChangeset innerChangeset{*fInnerModel}; fInnerModel->Unfreeze(); std::transform(changeset.fAddedFields.cbegin(), changeset.fAddedFields.cend(), std::back_inserter(innerChangeset.fAddedFields), cloneAddField); std::transform(changeset.fAddedProjectedFields.cbegin(), changeset.fAddedProjectedFields.cend(), std::back_inserter(innerChangeset.fAddedProjectedFields), cloneAddProjectedField); + std::transform(changeset.fAddedColumnReprs.cbegin(), changeset.fAddedColumnReprs.cend(), + std::back_inserter(innerChangeset.fAddedColumnReprs), cloneAddColumnRepr); fInnerModel->Freeze(); fInnerSink->UpdateSchema(innerChangeset, firstEntry); } diff --git a/tree/ntuple/src/RPageStorage.cxx b/tree/ntuple/src/RPageStorage.cxx index f4aeca7c594b2..11e56a75d8beb 100644 --- a/tree/ntuple/src/RPageStorage.cxx +++ b/tree/ntuple/src/RPageStorage.cxx @@ -868,6 +868,9 @@ void ROOT::Internal::RPagePersistentSink::UpdateSchema(const ROOT::Internal::RNT for (const auto &descendant : *f) nNewPhysicalColumns += getNColumns(descendant); } + for (auto added : changeset.fAddedColumnReprs) { + nNewPhysicalColumns += added.fNumNewColumns; + } fDescriptorBuilder.ShiftAliasColumns(nNewPhysicalColumns); } @@ -913,6 +916,9 @@ void ROOT::Internal::RPagePersistentSink::UpdateSchema(const ROOT::Internal::RNT for (auto &descendant : *f) addProjectedField(descendant); } + for (const auto& [f, _] : changeset.fAddedColumnReprs) { + ROOT::Internal::CallConnectExtendedColumnsToPageSinkOnField(*f, *this, firstEntry); + } const auto nColumns = descriptor.GetNPhysicalColumns(); fOpenColumnRanges.reserve(fOpenColumnRanges.size() + (nColumns - nColumnsBeforeUpdate)); diff --git a/tree/ntuple/test/ntuple_modelext.cxx b/tree/ntuple/test/ntuple_modelext.cxx index 9ac90f2446af8..dbe5d1ad68333 100644 --- a/tree/ntuple/test/ntuple_modelext.cxx +++ b/tree/ntuple/test/ntuple_modelext.cxx @@ -852,3 +852,318 @@ R"({ // clang-format on EXPECT_EQ(expect, os.str()); } + +TEST(RNTuple, LateColumnExtension) +{ + FileRaii fileGuard("test_ntuple_latecolumnext.ntuple"); + + auto model = RNTupleModel::Create(); + auto pF = model->MakeField("f"); // this has column representation {kSplitReal32}. + + auto writer = ROOT::RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard.GetPath()); + for (int i = 0; i < 100; ++i) { + *pF = i; + writer->Fill(); + } + auto &modelRef = const_cast(writer->GetModel()); + modelRef.Unfreeze(); + + auto &field = modelRef.GetMutableField("f"); + ROOT::Internal::RNTupleModelChangeset changeset{modelRef}; + changeset.AddColumnRepr(&field, {{ROOT::ENTupleColumnType::kReal32}}); + + modelRef.Freeze(); + + auto &sink = ROOT::Internal::GetWriterSink(*writer); + sink.UpdateSchema(changeset, 0); + + // Keep writing 50 entries with the old active representation, then switch it. + for (int i = 0; i < 100; ++i) { + if (i == 50) { + writer->CommitCluster(); + ROOT::Internal::RFieldRepresentationModifier::SetPrimaryColumnRepresentation(field, 1); + } + *pF = 100 + i; + writer->Fill(); + } + + writer.reset(); + + auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath()); + // check that we can read data fine + auto vF = reader->GetView("f"); + EXPECT_EQ(reader->GetNEntries(), 200); + for (auto idx : reader->GetEntryRange()) { + EXPECT_FLOAT_EQ(vF(idx), idx); + } + // check that metadata is correct + const auto &desc = reader->GetDescriptor(); + auto fieldId = desc.FindFieldId("f"); + const auto &fdesc = desc.GetFieldDescriptor(fieldId); + ASSERT_EQ(fdesc.GetLogicalColumnIds().size(), 2); + EXPECT_EQ(fdesc.GetColumnCardinality(), 1); + const auto &col1Desc = desc.GetColumnDescriptor(fdesc.GetLogicalColumnIds()[0]); + const auto &col2Desc = desc.GetColumnDescriptor(fdesc.GetLogicalColumnIds()[1]); + EXPECT_EQ(col1Desc.GetType(), ROOT::ENTupleColumnType::kSplitReal32); + EXPECT_EQ(col2Desc.GetType(), ROOT::ENTupleColumnType::kReal32); + EXPECT_EQ(col1Desc.GetFieldId(), fieldId); + EXPECT_EQ(col2Desc.GetFieldId(), fieldId); + EXPECT_EQ(col1Desc.GetRepresentationIndex(), 0); + EXPECT_EQ(col2Desc.GetRepresentationIndex(), 1); + EXPECT_EQ(col1Desc.GetIndex(), 0); + EXPECT_EQ(col2Desc.GetIndex(), 0); + EXPECT_EQ(col1Desc.GetFirstElementIndex(), 0); + EXPECT_EQ(col2Desc.GetFirstElementIndex(), 0); + + const auto cluster1Id = desc.FindClusterId(0, 0); + const auto &cluster1Desc = desc.GetClusterDescriptor(cluster1Id); + ASSERT_TRUE(cluster1Desc.ContainsColumn(0)); + ASSERT_TRUE(cluster1Desc.ContainsColumn(1)); + EXPECT_FALSE(cluster1Desc.GetColumnRange(0).IsSuppressed()); + EXPECT_TRUE(cluster1Desc.GetColumnRange(1).IsSuppressed()); + EXPECT_EQ(cluster1Desc.GetFirstEntryIndex(), 0); + EXPECT_EQ(cluster1Desc.GetNEntries(), 150); + + const auto cluster2Id = desc.FindNextClusterId(cluster1Id); + const auto &cluster2Desc = desc.GetClusterDescriptor(cluster2Id); + ASSERT_TRUE(cluster2Desc.ContainsColumn(0)); + ASSERT_TRUE(cluster2Desc.ContainsColumn(1)); + EXPECT_TRUE(cluster2Desc.GetColumnRange(0).IsSuppressed()); + EXPECT_FALSE(cluster2Desc.GetColumnRange(1).IsSuppressed()); + EXPECT_EQ(cluster2Desc.GetFirstEntryIndex(), 150); + EXPECT_EQ(cluster2Desc.GetNEntries(), 50); +} + +TEST(RNTuple, LateColumnExtension2) +{ + FileRaii fileGuard("test_ntuple_latecolumnext2.ntuple"); + + auto model = RNTupleModel::Create(); + // this has representation {kSplitIndex64, kChar} + auto pS = model->MakeField("s"); + + auto writer = ROOT::RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard.GetPath()); + for (int i = 0; i < 100; ++i) { + *pS = std::to_string(i); + writer->Fill(); + } + auto &modelRef = const_cast(writer->GetModel()); + modelRef.Unfreeze(); + + auto &field = modelRef.GetMutableField("s"); + ROOT::Internal::RNTupleModelChangeset changeset{modelRef}; + changeset.AddColumnRepr(&field, {{ROOT::ENTupleColumnType::kIndex32, ROOT::ENTupleColumnType::kChar}}); + + modelRef.Freeze(); + + auto &sink = ROOT::Internal::GetWriterSink(*writer); + sink.UpdateSchema(changeset, 0); + + // Keep writing 50 entries with the old active representation, then switch it. + for (int i = 0; i < 100; ++i) { + if (i == 50) { + writer->CommitCluster(); + ROOT::Internal::RFieldRepresentationModifier::SetPrimaryColumnRepresentation(field, 1); + } + *pS = std::to_string(100 + i); + writer->Fill(); + } + + writer.reset(); + + auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath()); + // check that we can read data fine + auto vF = reader->GetView("s"); + EXPECT_EQ(reader->GetNEntries(), 200); + for (auto idx : reader->GetEntryRange()) { + EXPECT_EQ(vF(idx), std::to_string(idx)); + } + // check that metadata is correct + const auto &desc = reader->GetDescriptor(); + auto fieldId = desc.FindFieldId("s"); + const auto &fdesc = desc.GetFieldDescriptor(fieldId); + ASSERT_EQ(fdesc.GetLogicalColumnIds().size(), 4); + EXPECT_EQ(fdesc.GetColumnCardinality(), 2); + const auto &col1Desc = desc.GetColumnDescriptor(fdesc.GetLogicalColumnIds()[0]); + const auto &col2Desc = desc.GetColumnDescriptor(fdesc.GetLogicalColumnIds()[1]); + const auto &col3Desc = desc.GetColumnDescriptor(fdesc.GetLogicalColumnIds()[2]); + const auto &col4Desc = desc.GetColumnDescriptor(fdesc.GetLogicalColumnIds()[3]); + EXPECT_EQ(col1Desc.GetType(), ROOT::ENTupleColumnType::kSplitIndex64); + EXPECT_EQ(col2Desc.GetType(), ROOT::ENTupleColumnType::kChar); + EXPECT_EQ(col3Desc.GetType(), ROOT::ENTupleColumnType::kIndex32); + EXPECT_EQ(col4Desc.GetType(), ROOT::ENTupleColumnType::kChar); + EXPECT_EQ(col1Desc.GetRepresentationIndex(), 0); + EXPECT_EQ(col2Desc.GetRepresentationIndex(), 0); + EXPECT_EQ(col3Desc.GetRepresentationIndex(), 1); + EXPECT_EQ(col4Desc.GetRepresentationIndex(), 1); + EXPECT_EQ(col1Desc.GetIndex(), 0); + EXPECT_EQ(col2Desc.GetIndex(), 1); + EXPECT_EQ(col3Desc.GetIndex(), 0); + EXPECT_EQ(col4Desc.GetIndex(), 1); + EXPECT_EQ(col1Desc.GetFirstElementIndex(), 0); + EXPECT_EQ(col2Desc.GetFirstElementIndex(), 0); + EXPECT_EQ(col3Desc.GetFirstElementIndex(), 0); + EXPECT_EQ(col4Desc.GetFirstElementIndex(), 0); + + const auto cluster1Id = desc.FindClusterId(0, 0); + const auto &cluster1Desc = desc.GetClusterDescriptor(cluster1Id); + ASSERT_TRUE(cluster1Desc.ContainsColumn(0)); + ASSERT_TRUE(cluster1Desc.ContainsColumn(1)); + ASSERT_TRUE(cluster1Desc.ContainsColumn(2)); + ASSERT_TRUE(cluster1Desc.ContainsColumn(3)); + EXPECT_FALSE(cluster1Desc.GetColumnRange(0).IsSuppressed()); + EXPECT_FALSE(cluster1Desc.GetColumnRange(1).IsSuppressed()); + EXPECT_TRUE(cluster1Desc.GetColumnRange(2).IsSuppressed()); + EXPECT_TRUE(cluster1Desc.GetColumnRange(3).IsSuppressed()); + EXPECT_EQ(cluster1Desc.GetFirstEntryIndex(), 0); + EXPECT_EQ(cluster1Desc.GetNEntries(), 150); + + const auto cluster2Id = desc.FindNextClusterId(cluster1Id); + const auto &cluster2Desc = desc.GetClusterDescriptor(cluster2Id); + ASSERT_TRUE(cluster2Desc.ContainsColumn(0)); + ASSERT_TRUE(cluster2Desc.ContainsColumn(1)); + ASSERT_TRUE(cluster2Desc.ContainsColumn(2)); + ASSERT_TRUE(cluster2Desc.ContainsColumn(3)); + EXPECT_TRUE(cluster2Desc.GetColumnRange(0).IsSuppressed()); + EXPECT_TRUE(cluster2Desc.GetColumnRange(1).IsSuppressed()); + EXPECT_FALSE(cluster2Desc.GetColumnRange(2).IsSuppressed()); + EXPECT_FALSE(cluster2Desc.GetColumnRange(3).IsSuppressed()); + EXPECT_EQ(cluster2Desc.GetFirstEntryIndex(), 150); + EXPECT_EQ(cluster2Desc.GetNEntries(), 50); +} + +TEST(RNTuple, LateColumnExtensionDeferred) +{ + FileRaii fileGuard("test_ntuple_latecolumnext_deferred.ntuple"); + + auto model = RNTupleModel::Create(); + auto pF = model->MakeField("f"); + + auto writer = ROOT::RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard.GetPath()); + for (int i = 0; i < 50; ++i) { + *pF = i; + writer->Fill(); + } + + // Add a new field (no new columns) + { + auto updater = writer->CreateModelUpdater(); + updater->BeginUpdate(); + // this has representation {kSplitIndex64, kChar} + updater->AddField(std::make_unique>("s")); + updater->CommitUpdate(); + } + + auto pS = writer->GetModel().GetDefaultEntry().GetPtr("s"); + for (int i = 0; i < 50; ++i) { + *pF = 50 + i; + *pS = std::to_string(50 + i); + writer->Fill(); + } + + // Add a new column to the new field + auto &modelRef = const_cast(writer->GetModel()); + modelRef.Unfreeze(); + auto &field = modelRef.GetMutableField("s"); + ROOT::Internal::RNTupleModelChangeset changeset{modelRef}; + changeset.AddColumnRepr(&field, {{ROOT::ENTupleColumnType::kIndex32, ROOT::ENTupleColumnType::kChar}}); + modelRef.Freeze(); + auto &sink = ROOT::Internal::GetWriterSink(*writer); + sink.UpdateSchema(changeset, 50); + + // Keep writing 50 entries with the old active representation, then switch it. + for (int i = 0; i < 100; ++i) { + if (i == 50) { + writer->CommitCluster(); + ROOT::Internal::RFieldRepresentationModifier::SetPrimaryColumnRepresentation(field, 1); + } + *pF = 100 + i; + *pS = std::to_string(100 + i); + writer->Fill(); + } + + writer.reset(); + + auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath()); + EXPECT_EQ(reader->GetNEntries(), 200); + + // check that we can read data fine + auto prF = reader->GetModel().GetDefaultEntry().GetPtr("f"); + auto prS = reader->GetModel().GetDefaultEntry().GetPtr("s"); + for (auto idx : reader->GetEntryRange()) { + reader->LoadEntry(idx); + EXPECT_FLOAT_EQ(*prF, idx); + EXPECT_EQ(*prS, idx >= 50 ? std::to_string(idx) : ""); + } + + // check that metadata is correct + const auto &desc = reader->GetDescriptor(); + auto fieldFId = desc.FindFieldId("f"); + const auto &ffdesc = desc.GetFieldDescriptor(fieldFId); + ASSERT_EQ(ffdesc.GetLogicalColumnIds().size(), 1); + EXPECT_EQ(ffdesc.GetColumnCardinality(), 1); + auto fieldSId = desc.FindFieldId("s"); + const auto &fsdesc = desc.GetFieldDescriptor(fieldSId); + ASSERT_EQ(fsdesc.GetLogicalColumnIds().size(), 4); + EXPECT_EQ(fsdesc.GetColumnCardinality(), 2); + const auto &col1Desc = desc.GetColumnDescriptor(ffdesc.GetLogicalColumnIds()[0]); + const auto &col2Desc = desc.GetColumnDescriptor(fsdesc.GetLogicalColumnIds()[0]); + const auto &col3Desc = desc.GetColumnDescriptor(fsdesc.GetLogicalColumnIds()[1]); + const auto &col4Desc = desc.GetColumnDescriptor(fsdesc.GetLogicalColumnIds()[2]); + const auto &col5Desc = desc.GetColumnDescriptor(fsdesc.GetLogicalColumnIds()[3]); + EXPECT_EQ(col1Desc.GetType(), ROOT::ENTupleColumnType::kSplitReal32); + EXPECT_EQ(col2Desc.GetType(), ROOT::ENTupleColumnType::kSplitIndex64); + EXPECT_EQ(col3Desc.GetType(), ROOT::ENTupleColumnType::kChar); + EXPECT_EQ(col4Desc.GetType(), ROOT::ENTupleColumnType::kIndex32); + EXPECT_EQ(col5Desc.GetType(), ROOT::ENTupleColumnType::kChar); + EXPECT_EQ(col1Desc.GetFieldId(), fieldFId); + EXPECT_EQ(col2Desc.GetFieldId(), fieldSId); + EXPECT_EQ(col3Desc.GetFieldId(), fieldSId); + EXPECT_EQ(col4Desc.GetFieldId(), fieldSId); + EXPECT_EQ(col5Desc.GetFieldId(), fieldSId); + EXPECT_EQ(col1Desc.GetRepresentationIndex(), 0); + EXPECT_EQ(col2Desc.GetRepresentationIndex(), 0); + EXPECT_EQ(col3Desc.GetRepresentationIndex(), 0); + EXPECT_EQ(col4Desc.GetRepresentationIndex(), 1); + EXPECT_EQ(col5Desc.GetRepresentationIndex(), 1); + EXPECT_EQ(col1Desc.GetIndex(), 0); + EXPECT_EQ(col2Desc.GetIndex(), 0); + EXPECT_EQ(col3Desc.GetIndex(), 1); + EXPECT_EQ(col4Desc.GetIndex(), 0); + EXPECT_EQ(col5Desc.GetIndex(), 1); + EXPECT_EQ(col1Desc.GetFirstElementIndex(), 0); + EXPECT_EQ(col2Desc.GetFirstElementIndex(), 50); + EXPECT_EQ(col3Desc.GetFirstElementIndex(), 0); // string data column is never deferred + EXPECT_EQ(col4Desc.GetFirstElementIndex(), 50); + EXPECT_EQ(col5Desc.GetFirstElementIndex(), 0); // string data column is never deferred + + const auto cluster1Id = desc.FindClusterId(0, 0); + const auto &cluster1Desc = desc.GetClusterDescriptor(cluster1Id); + ASSERT_TRUE(cluster1Desc.ContainsColumn(0)); + ASSERT_TRUE(cluster1Desc.ContainsColumn(1)); + ASSERT_TRUE(cluster1Desc.ContainsColumn(2)); + ASSERT_TRUE(cluster1Desc.ContainsColumn(3)); + ASSERT_TRUE(cluster1Desc.ContainsColumn(4)); + EXPECT_FALSE(cluster1Desc.GetColumnRange(0).IsSuppressed()); + EXPECT_FALSE(cluster1Desc.GetColumnRange(1).IsSuppressed()); + EXPECT_FALSE(cluster1Desc.GetColumnRange(2).IsSuppressed()); + EXPECT_TRUE(cluster1Desc.GetColumnRange(3).IsSuppressed()); + EXPECT_TRUE(cluster1Desc.GetColumnRange(4).IsSuppressed()); + EXPECT_EQ(cluster1Desc.GetFirstEntryIndex(), 0); + EXPECT_EQ(cluster1Desc.GetNEntries(), 150); + + const auto cluster2Id = desc.FindNextClusterId(cluster1Id); + const auto &cluster2Desc = desc.GetClusterDescriptor(cluster2Id); + ASSERT_TRUE(cluster2Desc.ContainsColumn(0)); + ASSERT_TRUE(cluster2Desc.ContainsColumn(1)); + ASSERT_TRUE(cluster2Desc.ContainsColumn(2)); + ASSERT_TRUE(cluster2Desc.ContainsColumn(3)); + ASSERT_TRUE(cluster2Desc.ContainsColumn(4)); + EXPECT_FALSE(cluster2Desc.GetColumnRange(0).IsSuppressed()); + EXPECT_TRUE(cluster2Desc.GetColumnRange(1).IsSuppressed()); + EXPECT_TRUE(cluster2Desc.GetColumnRange(2).IsSuppressed()); + EXPECT_FALSE(cluster2Desc.GetColumnRange(3).IsSuppressed()); + EXPECT_FALSE(cluster2Desc.GetColumnRange(4).IsSuppressed()); + EXPECT_EQ(cluster2Desc.GetFirstEntryIndex(), 150); + EXPECT_EQ(cluster2Desc.GetNEntries(), 50); +}