From 7ff3d9bb43004c5a384b2fdb3b0766cceb4dcf5e Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Tue, 9 Dec 2025 06:32:17 +0000 Subject: [PATCH] Update vendored DuckDB sources to d1dc88f950 --- .../extension/parquet/column_writer.cpp | 8 +- .../writer/templated_column_writer.hpp | 16 +++- src/duckdb/src/common/local_file_system.cpp | 23 +++++ .../src/execution/index/art/base_node.cpp | 4 +- src/duckdb/src/execution/index/art/prefix.cpp | 13 +-- .../src/execution/index/bound_index.cpp | 93 ++++++++++++++----- .../src/execution/index/unbound_index.cpp | 31 +++++-- .../csv_scanner/scanner/base_scanner.cpp | 4 + .../scanner/string_value_scanner.cpp | 64 +++++++------ .../csv_scanner/sniffer/csv_sniffer.cpp | 5 +- .../csv_scanner/sniffer/type_detection.cpp | 18 ++-- .../execution/physical_plan/plan_window.cpp | 14 ++- .../src/function/table/system/duckdb_log.cpp | 3 + .../function/table/system/test_all_types.cpp | 6 +- .../function/table/version/pragma_version.cpp | 6 +- .../src/include/duckdb/common/limits.hpp | 6 +- .../duckdb/common/local_file_system.hpp | 2 + .../execution/index/art/art_operator.hpp | 2 + .../duckdb/execution/index/art/prefix.hpp | 4 +- .../duckdb/execution/index/bound_index.hpp | 2 +- .../duckdb/execution/index/unbound_index.hpp | 48 ++++++++-- .../operator/csv_scanner/base_scanner.hpp | 16 +++- .../csv_scanner/sniffer/csv_sniffer.hpp | 1 + .../csv_scanner/string_value_scanner.hpp | 3 +- .../include/duckdb/main/extension_entries.hpp | 1 + .../parser/parsed_data/sample_options.hpp | 3 + .../subquery/flatten_dependent_join.hpp | 2 +- .../planner/subquery/rewrite_cte_scan.hpp | 4 +- .../storage/table/validity_column_data.hpp | 2 + .../src/main/extension/extension_alias.cpp | 1 + .../transform/helpers/transform_sample.cpp | 5 +- .../planner/binder/statement/bind_insert.cpp | 27 ++++-- .../subquery/flatten_dependent_join.cpp | 81 +++++++++++----- .../src/planner/subquery/rewrite_cte_scan.cpp | 8 +- src/duckdb/src/storage/table/column_data.cpp | 1 - .../table/column_data_checkpointer.cpp | 2 +- .../storage/table/standard_column_data.cpp | 10 +- .../storage/table/validity_column_data.cpp | 17 ++++ 38 files changed, 393 insertions(+), 163 deletions(-) diff --git a/src/duckdb/extension/parquet/column_writer.cpp b/src/duckdb/extension/parquet/column_writer.cpp index 7983b6a23..63fc1cd57 100644 --- a/src/duckdb/extension/parquet/column_writer.cpp +++ b/src/duckdb/extension/parquet/column_writer.cpp @@ -534,10 +534,10 @@ ColumnWriter::CreateWriterRecursive(ClientContext &context, ParquetWriter &write template <> struct NumericLimits { static constexpr float Minimum() { - return std::numeric_limits::lowest(); + return NumericLimits::Minimum(); }; static constexpr float Maximum() { - return std::numeric_limits::max(); + return NumericLimits::Maximum(); }; static constexpr bool IsSigned() { return std::is_signed::value; @@ -550,10 +550,10 @@ struct NumericLimits { template <> struct NumericLimits { static constexpr double Minimum() { - return std::numeric_limits::lowest(); + return NumericLimits::Minimum(); }; static constexpr double Maximum() { - return std::numeric_limits::max(); + return NumericLimits::Maximum(); }; static constexpr bool IsSigned() { return std::is_signed::value; diff --git a/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp b/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp index c035bba43..0ed8543d3 100644 --- a/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp +++ b/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp @@ -126,7 +126,8 @@ class StandardColumnWriter : public PrimitiveColumnWriter { public: unique_ptr InitializeWriteState(duckdb_parquet::RowGroup &row_group) override { auto result = make_uniq>(writer, row_group, row_group.columns.size()); - result->encoding = duckdb_parquet::Encoding::RLE_DICTIONARY; + result->encoding = writer.GetParquetVersion() == ParquetVersion::V1 ? duckdb_parquet::Encoding::PLAIN_DICTIONARY + : duckdb_parquet::Encoding::RLE_DICTIONARY; RegisterToRowGroup(row_group); return std::move(result); } @@ -150,6 +151,8 @@ class StandardColumnWriter : public PrimitiveColumnWriter { } page_state.dbp_encoder.FinishWrite(temp_writer); break; + case duckdb_parquet::Encoding::PLAIN_DICTIONARY: + // PLAIN_DICTIONARY can be treated the same as RLE_DICTIONARY case duckdb_parquet::Encoding::RLE_DICTIONARY: D_ASSERT(page_state.dict_bit_width != 0); if (!page_state.dict_written_value) { @@ -265,7 +268,8 @@ class StandardColumnWriter : public PrimitiveColumnWriter { bool HasDictionary(PrimitiveColumnWriterState &state_p) override { auto &state = state_p.Cast>(); - return state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY; + return state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY || + state.encoding == duckdb_parquet::Encoding::PLAIN_DICTIONARY; } idx_t DictionarySize(PrimitiveColumnWriterState &state_p) override { @@ -285,7 +289,8 @@ class StandardColumnWriter : public PrimitiveColumnWriter { void FlushDictionary(PrimitiveColumnWriterState &state_p, ColumnWriterStatistics *stats) override { auto &state = state_p.Cast>(); - D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY); + D_ASSERT(state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY || + state.encoding == duckdb_parquet::Encoding::PLAIN_DICTIONARY); if (writer.EnableBloomFilters()) { state.bloom_filter = @@ -310,7 +315,8 @@ class StandardColumnWriter : public PrimitiveColumnWriter { idx_t GetRowSize(const Vector &vector, const idx_t index, const PrimitiveColumnWriterState &state_p) const override { auto &state = state_p.Cast>(); - if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY) { + if (state.encoding == duckdb_parquet::Encoding::RLE_DICTIONARY || + state.encoding == duckdb_parquet::Encoding::PLAIN_DICTIONARY) { return (state.key_bit_width + 7) / 8; } else { return OP::template GetRowSize(vector, index); @@ -328,6 +334,8 @@ class StandardColumnWriter : public PrimitiveColumnWriter { const auto *data_ptr = FlatVector::GetData(input_column); switch (page_state.encoding) { + case duckdb_parquet::Encoding::PLAIN_DICTIONARY: + // PLAIN_DICTIONARY can be treated the same as RLE_DICTIONARY case duckdb_parquet::Encoding::RLE_DICTIONARY: { idx_t r = chunk_start; if (!page_state.dict_written_value) { diff --git a/src/duckdb/src/common/local_file_system.cpp b/src/duckdb/src/common/local_file_system.cpp index 8733e0162..bb7a7b04d 100644 --- a/src/duckdb/src/common/local_file_system.cpp +++ b/src/duckdb/src/common/local_file_system.cpp @@ -1283,6 +1283,29 @@ bool LocalFileSystem::OnDiskFile(FileHandle &handle) { return true; } +string LocalFileSystem::GetVersionTag(FileHandle &handle) { + // TODO: Fix using FileSystem::Stats for v1.5, which should also fix it for Windows +#ifdef _WIN32 + return ""; +#else + int fd = handle.Cast().fd; + struct stat s; + if (fstat(fd, &s) == -1) { + throw IOException("Failed to get file size for file \"%s\": %s", {{"errno", std::to_string(errno)}}, + handle.path, strerror(errno)); + } + + // dev/ino should be enough, but to guard against in-place writes we also add file size and modification time + uint64_t version_tag[4]; + Store(NumericCast(s.st_dev), data_ptr_cast(&version_tag[0])); + Store(NumericCast(s.st_ino), data_ptr_cast(&version_tag[1])); + Store(NumericCast(s.st_size), data_ptr_cast(&version_tag[2])); + Store(Timestamp::FromEpochSeconds(s.st_mtime).value, data_ptr_cast(&version_tag[3])); + + return string(char_ptr_cast(version_tag), sizeof(uint64_t) * 4); +#endif +} + void LocalFileSystem::Seek(FileHandle &handle, idx_t location) { if (!CanSeek()) { throw IOException("Cannot seek in files of this type"); diff --git a/src/duckdb/src/execution/index/art/base_node.cpp b/src/duckdb/src/execution/index/art/base_node.cpp index 94d5c0fe1..a59297c2c 100644 --- a/src/duckdb/src/execution/index/art/base_node.cpp +++ b/src/duckdb/src/execution/index/art/base_node.cpp @@ -95,7 +95,9 @@ void Node4::DeleteChild(ART &art, Node &node, Node &parent, const uint8_t byte, auto prev_node4_status = node.GetGateStatus(); Node::FreeNode(art, node); - Prefix::Concat(art, parent, node, child, remaining_byte, prev_node4_status); + // Propagate both the prev_node_4 status and the general gate status (if the gate was earlier on), + // since the concatenation logic depends on both. + Prefix::Concat(art, parent, node, child, remaining_byte, prev_node4_status, status); } void Node4::ShrinkNode16(ART &art, Node &node4, Node &node16) { diff --git a/src/duckdb/src/execution/index/art/prefix.cpp b/src/duckdb/src/execution/index/art/prefix.cpp index 1d7861135..148f68d0f 100644 --- a/src/duckdb/src/execution/index/art/prefix.cpp +++ b/src/duckdb/src/execution/index/art/prefix.cpp @@ -65,8 +65,8 @@ void Prefix::New(ART &art, reference &ref, const ARTKey &key, const idx_t } } -void Prefix::Concat(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte, - const GateStatus node4_status) { +void Prefix::Concat(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte, const GateStatus node4_status, + const GateStatus status) { // We have four situations from which we enter here: // 1: PREFIX (parent) - Node4 (prev_node4) - PREFIX (child) - INLINED_LEAF, or // 2: PREFIX (parent) - Node4 (prev_node4) - INLINED_LEAF (child), or @@ -90,10 +90,7 @@ void Prefix::Concat(ART &art, Node &parent, Node &node4, const Node child, uint8 ConcatChildIsGate(art, parent, node4, child, byte); return; } - - auto inside_gate = parent.GetGateStatus() == GateStatus::GATE_SET; - ConcatInternal(art, parent, node4, child, byte, inside_gate); - return; + ConcatInternal(art, parent, node4, child, byte, status); } void Prefix::Reduce(ART &art, Node &node, const idx_t pos) { @@ -286,9 +283,9 @@ Prefix Prefix::GetTail(ART &art, const Node &node) { } void Prefix::ConcatInternal(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte, - const bool inside_gate) { + const GateStatus status) { if (child.GetType() == NType::LEAF_INLINED) { - if (inside_gate) { + if (status == GateStatus::GATE_SET) { if (parent.GetType() == NType::PREFIX) { // The parent only contained the Node4, so we can now inline 'all the way up', // and the gate is no longer nested. diff --git a/src/duckdb/src/execution/index/bound_index.cpp b/src/duckdb/src/execution/index/bound_index.cpp index f7ba1e041..8573da471 100644 --- a/src/duckdb/src/execution/index/bound_index.cpp +++ b/src/duckdb/src/execution/index/bound_index.cpp @@ -1,11 +1,13 @@ #include "duckdb/execution/index/bound_index.hpp" +#include "duckdb/common/array.hpp" #include "duckdb/common/radix.hpp" #include "duckdb/common/serializer/serializer.hpp" #include "duckdb/planner/expression/bound_columnref_expression.hpp" #include "duckdb/planner/expression/bound_reference_expression.hpp" #include "duckdb/planner/expression_iterator.hpp" #include "duckdb/storage/table/append_state.hpp" +#include "duckdb/common/types/selection_vector.hpp" namespace duckdb { @@ -154,39 +156,80 @@ string BoundIndex::AppendRowError(DataChunk &input, idx_t index) { return error; } -void BoundIndex::ApplyBufferedReplays(const vector &table_types, - vector &buffered_replays, +namespace { + +struct BufferedReplayState { + optional_ptr buffer = nullptr; + ColumnDataScanState scan_state; + DataChunk current_chunk; + bool scan_initialized = false; +}; +} // namespace + +void BoundIndex::ApplyBufferedReplays(const vector &table_types, BufferedIndexReplays &buffered_replays, const vector &mapped_column_ids) { - for (auto &replay : buffered_replays) { - ColumnDataScanState state; - auto &buffered_data = *replay.data; - buffered_data.InitializeScan(state); - - DataChunk scan_chunk; - buffered_data.InitializeScanChunk(scan_chunk); - DataChunk table_chunk; - table_chunk.InitializeEmpty(table_types); - - while (buffered_data.Scan(state, scan_chunk)) { - for (idx_t i = 0; i < scan_chunk.ColumnCount() - 1; i++) { - auto col_id = mapped_column_ids[i].GetPrimaryIndex(); - table_chunk.data[col_id].Reference(scan_chunk.data[i]); + if (!buffered_replays.HasBufferedReplays()) { + return; + } + + // We have two replay states: one for inserts and one for deletes. These are indexed into using the + // replay_type. Both scans are interleaved, so the state maintains the position of each scan. + array replay_states; + DataChunk table_chunk; + table_chunk.InitializeEmpty(table_types); + + for (const auto &replay_range : buffered_replays.ranges) { + const auto type_idx = static_cast(replay_range.type); + auto &state = replay_states[type_idx]; + + // Initialize the scan state if necessary. Take ownership of buffered operations, since we won't need + // them after replaying anyways. + if (!state.scan_initialized) { + state.buffer = buffered_replays.GetBuffer(replay_range.type); + state.buffer->InitializeScan(state.scan_state); + state.buffer->InitializeScanChunk(state.current_chunk); + state.scan_initialized = true; + } + + idx_t current_row = replay_range.start; + while (current_row < replay_range.end) { + // Scan the next DataChunk from the ColumnDataCollection buffer if the current row is on or after + // that chunk's starting row index. + if (current_row >= state.scan_state.next_row_index) { + if (!state.buffer->Scan(state.scan_state, state.current_chunk)) { + throw InternalException("Buffered index data exhausted during replay"); + } } - table_chunk.SetCardinality(scan_chunk.size()); - switch (replay.type) { - case BufferedIndexReplay::INSERT_ENTRY: { - IndexAppendInfo index_append_info(IndexAppendMode::INSERT_DUPLICATES, nullptr); - auto error = Append(table_chunk, scan_chunk.data.back(), index_append_info); + // We need to process the remaining rows in the current chunk, which is the minimum of the available + // rows in the chunk and the remaining rows in the current range. + const auto offset_in_chunk = current_row - state.scan_state.current_row_index; + const auto available_in_chunk = state.current_chunk.size() - offset_in_chunk; + // [start, end) in ReplayRange is [inclusive, exclusive). + const auto range_remaining = replay_range.end - current_row; + const auto rows_to_process = MinValue(available_in_chunk, range_remaining); + + SelectionVector sel(offset_in_chunk, rows_to_process); + + for (idx_t col_idx = 0; col_idx < state.current_chunk.ColumnCount() - 1; col_idx++) { + const auto col_id = mapped_column_ids[col_idx].GetPrimaryIndex(); + table_chunk.data[col_id].Reference(state.current_chunk.data[col_idx]); + table_chunk.data[col_id].Slice(sel, rows_to_process); + } + table_chunk.SetCardinality(rows_to_process); + Vector row_ids(state.current_chunk.data.back(), sel, rows_to_process); + + if (replay_range.type == BufferedIndexReplay::INSERT_ENTRY) { + IndexAppendInfo append_info(IndexAppendMode::INSERT_DUPLICATES, nullptr); + const auto error = Append(table_chunk, row_ids, append_info); if (error.HasError()) { throw InternalException("error while applying buffered appends: " + error.Message()); } + current_row += rows_to_process; continue; } - case BufferedIndexReplay::DEL_ENTRY: { - Delete(table_chunk, scan_chunk.data.back()); - } - } + Delete(table_chunk, row_ids); + current_row += rows_to_process; } } } diff --git a/src/duckdb/src/execution/index/unbound_index.cpp b/src/duckdb/src/execution/index/unbound_index.cpp index 2fd628323..bd8e707a9 100644 --- a/src/duckdb/src/execution/index/unbound_index.cpp +++ b/src/duckdb/src/execution/index/unbound_index.cpp @@ -8,10 +8,6 @@ namespace duckdb { -BufferedIndexData::BufferedIndexData(BufferedIndexReplay replay_type, unique_ptr data_p) - : type(replay_type), data(std::move(data_p)) { -} - UnboundIndex::UnboundIndex(unique_ptr create_info, IndexStorageInfo storage_info_p, TableIOManager &table_io_manager, AttachedDatabase &db) : Index(create_info->Cast().column_ids, table_io_manager, db), create_info(std::move(create_info)), @@ -40,15 +36,13 @@ void UnboundIndex::CommitDrop() { } void UnboundIndex::BufferChunk(DataChunk &index_column_chunk, Vector &row_ids, - const vector &mapped_column_ids_p, BufferedIndexReplay replay_type) { + const vector &mapped_column_ids_p, const BufferedIndexReplay replay_type) { D_ASSERT(!column_ids.empty()); auto types = index_column_chunk.GetTypes(); // column types types.push_back(LogicalType::ROW_TYPE); auto &allocator = Allocator::Get(db); - BufferedIndexData buffered_data(replay_type, make_uniq(allocator, types)); - //! First time we are buffering data, canonical column_id mapping is stored. //! This should be a sorted list of all the physical offsets of Indexed columns on this table. if (mapped_column_ids.empty()) { @@ -56,7 +50,7 @@ void UnboundIndex::BufferChunk(DataChunk &index_column_chunk, Vector &row_ids, } D_ASSERT(mapped_column_ids == mapped_column_ids_p); - // Combined chunk has all the indexed columns and rowids. + // combined_chunk has all the indexed columns according to mapped_column_ids ordering, as well as a rowid column. DataChunk combined_chunk; combined_chunk.InitializeEmpty(types); for (idx_t i = 0; i < index_column_chunk.ColumnCount(); i++) { @@ -64,8 +58,25 @@ void UnboundIndex::BufferChunk(DataChunk &index_column_chunk, Vector &row_ids, } combined_chunk.data.back().Reference(row_ids); combined_chunk.SetCardinality(index_column_chunk.size()); - buffered_data.data->Append(combined_chunk); - buffered_replays.emplace_back(std::move(buffered_data)); + + auto &buffer = buffered_replays.GetBuffer(replay_type); + if (buffer == nullptr) { + buffer = make_uniq(allocator, types); + } + // The starting index of the buffer range is the size of the buffer. + const idx_t start = buffer->Count(); + const idx_t end = start + combined_chunk.size(); + auto &ranges = buffered_replays.ranges; + + if (ranges.empty() || ranges.back().type != replay_type) { + // If there are no buffered ranges, or the replay types don't match, append a new range. + ranges.emplace_back(replay_type, start, end); + buffer->Append(combined_chunk); + return; + } + // Otherwise merge the range with the previous one. + ranges.back().end = end; + buffer->Append(combined_chunk); } } // namespace duckdb diff --git a/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp b/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp index 1e186fa97..179c5bcbf 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp @@ -26,6 +26,10 @@ BaseScanner::BaseScanner(shared_ptr buffer_manager_p, shared_p } } +void BaseScanner::Print() const { + state_machine->Print(); +} + string BaseScanner::RemoveSeparator(const char *value_ptr, const idx_t size, char thousands_separator) { string result; result.reserve(size); diff --git a/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 5ed14a992..a495d22a6 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -22,7 +22,7 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m idx_t result_size_p, idx_t buffer_position, CSVErrorHandler &error_hander_p, CSVIterator &iterator_p, bool store_line_size_p, shared_ptr csv_file_scan_p, idx_t &lines_read_p, bool sniffing_p, - string path_p, idx_t scan_id) + const string &path_p, idx_t scan_id, bool &used_unstrictness) : ScannerResult(states, state_machine, result_size_p), number_of_columns(NumericCast(state_machine.dialect_options.num_cols)), null_padding(state_machine.options.null_padding), ignore_errors(state_machine.options.ignore_errors.GetValue()), @@ -30,8 +30,8 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m ? 0 : state_machine.dialect_options.state_machine_options.delimiter.GetValue().size() - 1), error_handler(error_hander_p), iterator(iterator_p), store_line_size(store_line_size_p), - csv_file_scan(std::move(csv_file_scan_p)), lines_read(lines_read_p), - current_errors(scan_id, state_machine.options.IgnoreErrors()), sniffing(sniffing_p), path(std::move(path_p)) { + csv_file_scan(std::move(csv_file_scan_p)), lines_read(lines_read_p), used_unstrictness(used_unstrictness), + current_errors(scan_id, state_machine.options.IgnoreErrors()), sniffing(sniffing_p), path(path_p) { // Vector information D_ASSERT(number_of_columns > 0); if (!buffer_handle) { @@ -154,23 +154,26 @@ inline bool IsValueNull(const char *null_str_ptr, const char *value_ptr, const i } bool StringValueResult::HandleTooManyColumnsError(const char *value_ptr, const idx_t size) { - if (cur_col_id >= number_of_columns && state_machine.state_machine_options.strict_mode.GetValue()) { - bool error = true; - if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) { - // we make an exception if the first over-value is null - bool is_value_null = false; - for (idx_t i = 0; i < null_str_count; i++) { - is_value_null = is_value_null || IsValueNull(null_str_ptr[i], value_ptr, size); + if (cur_col_id >= number_of_columns) { + if (state_machine.state_machine_options.strict_mode.GetValue()) { + bool error = true; + if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) { + // we make an exception if the first over-value is null + bool is_value_null = false; + for (idx_t i = 0; i < null_str_count; i++) { + is_value_null = is_value_null || IsValueNull(null_str_ptr[i], value_ptr, size); + } + error = !is_value_null; } - error = !is_value_null; - } - if (error) { - // We error pointing to the current value error. - current_errors.Insert(TOO_MANY_COLUMNS, cur_col_id, chunk_col_id, last_position); - cur_col_id++; + if (error) { + // We error pointing to the current value error. + current_errors.Insert(TOO_MANY_COLUMNS, cur_col_id, chunk_col_id, last_position); + cur_col_id++; + } + // We had an error + return true; } - // We had an error - return true; + used_unstrictness = true; } return false; } @@ -231,6 +234,7 @@ void StringValueResult::AddValueToVector(const char *value_ptr, idx_t size, bool } if (cur_col_id >= number_of_columns) { if (!state_machine.state_machine_options.strict_mode.GetValue()) { + used_unstrictness = true; return; } bool error = true; @@ -549,6 +553,7 @@ void StringValueResult::AddPossiblyEscapedValue(StringValueResult &result, const } if (result.cur_col_id >= result.number_of_columns && !result.state_machine.state_machine_options.strict_mode.GetValue()) { + result.used_unstrictness = true; return; } if (!result.HandleTooManyColumnsError(value_ptr, length)) { @@ -980,7 +985,7 @@ StringValueScanner::StringValueScanner(idx_t scanner_idx_p, const shared_ptrcontext), result_size, iterator.pos.buffer_pos, *error_handler, iterator, buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing, - buffer_manager->GetFilePath(), scanner_idx_p), + buffer_manager->GetFilePath(), scanner_idx_p, used_unstrictness), start_pos(0) { if (scanner_idx == 0 && csv_file_scan) { lines_read += csv_file_scan->skipped_rows; @@ -997,7 +1002,7 @@ StringValueScanner::StringValueScanner(const shared_ptr &buffe result(states, *state_machine, cur_buffer_handle, Allocator::DefaultAllocator(), result_size, iterator.pos.buffer_pos, *error_handler, iterator, buffer_manager->context.client_data->debug_set_max_line_length, csv_file_scan, lines_read, sniffing, - buffer_manager->GetFilePath(), 0), + buffer_manager->GetFilePath(), 0, used_unstrictness), start_pos(0) { if (scanner_idx == 0 && csv_file_scan) { lines_read += csv_file_scan->skipped_rows; @@ -1939,14 +1944,17 @@ void StringValueScanner::FinalizeChunkProcess() { if (result.current_errors.HandleErrors(result)) { result.number_of_rows++; } - if (states.IsQuotedCurrent() && !found_error && - state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) { - type = UNTERMINATED_QUOTES; - // If we finish the execution of a buffer, and we end in a quoted state, it means we have unterminated - // quotes - result.current_errors.Insert(type, result.cur_col_id, result.chunk_col_id, result.last_position); - if (result.current_errors.HandleErrors(result)) { - result.number_of_rows++; + if (states.IsQuotedCurrent() && !found_error) { + if (state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) { + type = UNTERMINATED_QUOTES; + // If we finish the execution of a buffer, and we end in a quoted state, it means we have unterminated + // quotes + result.current_errors.Insert(type, result.cur_col_id, result.chunk_col_id, result.last_position); + if (result.current_errors.HandleErrors(result)) { + result.number_of_rows++; + } + } else { + used_unstrictness = true; } } if (!iterator.done) { diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp index bcaed8e5f..9cf087871 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp @@ -14,7 +14,7 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, const MultiFileOptions &file auto &logical_type = format_template.first; best_format_candidates[logical_type].clear(); } - // Initialize max columns found to either 0 or however many were set + // Initialize max columns found to either 0, or however many were set max_columns_found = set_columns.Size(); error_handler = make_shared_ptr(options.ignore_errors.GetValue()); detection_error_handler = make_shared_ptr(true); @@ -193,7 +193,8 @@ SnifferResult CSVSniffer::SniffCSV(const bool force_match) { buffer_manager->ResetBufferManager(); } buffer_manager->sniffing = false; - if (best_candidate->error_handler->AnyErrors() && !options.ignore_errors.GetValue()) { + if (best_candidate->error_handler->AnyErrors() && !options.ignore_errors.GetValue() && + best_candidate->state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) { best_candidate->error_handler->ErrorIfTypeExists(MAXIMUM_LINE_SIZE); } D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size()); diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp index 2df6bbe8c..347c0fd1b 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp @@ -462,24 +462,30 @@ void CSVSniffer::DetectTypes() { idx_t varchar_cols = 0; for (idx_t col = 0; col < info_sql_types_candidates.size(); col++) { auto &col_type_candidates = info_sql_types_candidates[col]; - // check number of varchar columns + // check the number of varchar columns const auto &col_type = col_type_candidates.back(); if (col_type == LogicalType::VARCHAR) { varchar_cols++; } } - // it's good if the dialect creates more non-varchar columns, but only if we sacrifice < 30% of - // best_num_cols. + // it's good if the dialect creates more non-varchar columns + const bool has_less_varchar_cols = varchar_cols < min_varchar_cols; + // but only if we sacrifice < 30% of best_num_cols. + const bool acceptable_best_num_cols = + static_cast(info_sql_types_candidates.size()) > static_cast(max_columns_found) * 0.7; const idx_t number_of_errors = candidate->error_handler->GetSize(); - if (!best_candidate || (varchar_cols(info_sql_types_candidates.size())>( - static_cast(max_columns_found) * 0.7) && - (!options.ignore_errors.GetValue() || number_of_errors < min_errors))) { + const bool better_strictness = best_candidate_is_strict ? !candidate->used_unstrictness : true; + const bool acceptable_candidate = has_less_varchar_cols && acceptable_best_num_cols && better_strictness; + // If we escaped an unquoted character when strict is false. + if (!best_candidate || + (acceptable_candidate && (!options.ignore_errors.GetValue() || number_of_errors < min_errors))) { min_errors = number_of_errors; best_header_row.clear(); // we have a new best_options candidate best_candidate = std::move(candidate); min_varchar_cols = varchar_cols; + best_candidate_is_strict = !best_candidate->used_unstrictness; best_sql_types_candidates_per_column_idx = info_sql_types_candidates; for (auto &format_candidate : format_candidates) { best_format_candidates[format_candidate.first] = format_candidate.second.format; diff --git a/src/duckdb/src/execution/physical_plan/plan_window.cpp b/src/duckdb/src/execution/physical_plan/plan_window.cpp index c9cab9e8c..ace9b5c1a 100644 --- a/src/duckdb/src/execution/physical_plan/plan_window.cpp +++ b/src/duckdb/src/execution/physical_plan/plan_window.cpp @@ -2,13 +2,11 @@ #include "duckdb/execution/operator/aggregate/physical_window.hpp" #include "duckdb/execution/operator/projection/physical_projection.hpp" #include "duckdb/execution/physical_plan_generator.hpp" -#include "duckdb/main/client_context.hpp" +#include "duckdb/main/client_config.hpp" #include "duckdb/planner/expression/bound_reference_expression.hpp" #include "duckdb/planner/expression/bound_window_expression.hpp" #include "duckdb/planner/operator/logical_window.hpp" -#include - namespace duckdb { PhysicalOperator &PhysicalPlanGenerator::CreatePlan(LogicalWindow &op) { @@ -44,12 +42,12 @@ PhysicalOperator &PhysicalPlanGenerator::CreatePlan(LogicalWindow &op) { // Process the window functions by sharing the partition/order definitions unordered_map projection_map; vector> window_expressions; - idx_t blocking_count = 0; + idx_t streaming_count = 0; auto output_pos = input_width; while (!blocking_windows.empty() || !streaming_windows.empty()) { - const bool process_streaming = blocking_windows.empty(); - auto &remaining = process_streaming ? streaming_windows : blocking_windows; - blocking_count += process_streaming ? 0 : 1; + const bool process_blocking = streaming_windows.empty(); + auto &remaining = process_blocking ? blocking_windows : streaming_windows; + streaming_count += process_blocking ? 0 : 1; // Find all functions that share the partitioning of the first remaining expression auto over_idx = remaining[0]; @@ -122,7 +120,7 @@ PhysicalOperator &PhysicalPlanGenerator::CreatePlan(LogicalWindow &op) { } // Chain the new window operator on top of the plan - if (i < blocking_count) { + if (i >= streaming_count) { auto &window = Make(types, std::move(select_list), op.estimated_cardinality); window.children.push_back(plan); plan = window; diff --git a/src/duckdb/src/function/table/system/duckdb_log.cpp b/src/duckdb/src/function/table/system/duckdb_log.cpp index f84cb405a..96c35853f 100644 --- a/src/duckdb/src/function/table/system/duckdb_log.cpp +++ b/src/duckdb/src/function/table/system/duckdb_log.cpp @@ -62,6 +62,9 @@ unique_ptr DuckDBLogBindReplace(ClientContext &context, TableFunctionB bool denormalized_table = false; auto denormalized_table_setting = input.named_parameters.find("denormalized_table"); if (denormalized_table_setting != input.named_parameters.end()) { + if (denormalized_table_setting->second.IsNull()) { + throw InvalidInputException("denormalized_table cannot be NULL"); + } denormalized_table = denormalized_table_setting->second.GetValue(); } diff --git a/src/duckdb/src/function/table/system/test_all_types.cpp b/src/duckdb/src/function/table/system/test_all_types.cpp index 1e0c0ced3..c1c126019 100644 --- a/src/duckdb/src/function/table/system/test_all_types.cpp +++ b/src/duckdb/src/function/table/system/test_all_types.cpp @@ -69,8 +69,10 @@ vector TestAllTypesFun::GetTestTypes(const bool use_large_enum, const result.emplace_back(LogicalType::TIMESTAMP_TZ, "timestamp_tz"); // More complex numeric types. - result.emplace_back(LogicalType::FLOAT, "float"); - result.emplace_back(LogicalType::DOUBLE, "double"); + result.emplace_back(LogicalType::FLOAT, "float", Value::FLOAT(std::numeric_limits::lowest()), + Value::FLOAT(std::numeric_limits::max())); + result.emplace_back(LogicalType::DOUBLE, "double", Value::DOUBLE(std::numeric_limits::lowest()), + Value::DOUBLE(std::numeric_limits::max())); result.emplace_back(LogicalType::DECIMAL(4, 1), "dec_4_1"); result.emplace_back(LogicalType::DECIMAL(9, 4), "dec_9_4"); result.emplace_back(LogicalType::DECIMAL(18, 6), "dec_18_6"); diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 40f702062..f3d4134e9 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "3-dev136" +#define DUCKDB_PATCH_VERSION "3-dev237" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 4 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.4.3-dev136" +#define DUCKDB_VERSION "v1.4.3-dev237" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "136dd6ada5" +#define DUCKDB_SOURCE_ID "d1dc88f950" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb/common/limits.hpp b/src/duckdb/src/include/duckdb/common/limits.hpp index 0662579ef..67a98daf0 100644 --- a/src/duckdb/src/include/duckdb/common/limits.hpp +++ b/src/duckdb/src/include/duckdb/common/limits.hpp @@ -24,10 +24,12 @@ namespace duckdb { template struct NumericLimits { static constexpr T Minimum() { - return std::numeric_limits::lowest(); + return std::numeric_limits::has_infinity ? -std::numeric_limits::infinity() + : std::numeric_limits::lowest(); } static constexpr T Maximum() { - return std::numeric_limits::max(); + return std::numeric_limits::has_infinity ? std::numeric_limits::infinity() + : std::numeric_limits::max(); } static constexpr bool IsSigned() { return std::is_signed::value; diff --git a/src/duckdb/src/include/duckdb/common/local_file_system.hpp b/src/duckdb/src/include/duckdb/common/local_file_system.hpp index 8b3f7aaf2..354886e50 100644 --- a/src/duckdb/src/include/duckdb/common/local_file_system.hpp +++ b/src/duckdb/src/include/duckdb/common/local_file_system.hpp @@ -38,6 +38,8 @@ class LocalFileSystem : public FileSystem { int64_t GetFileSize(FileHandle &handle) override; //! Returns the file last modified time of a file handle, returns timespec with zero on all attributes on error timestamp_t GetLastModifiedTime(FileHandle &handle) override; + //! Returns a tag that uniquely identifies the version of the file + string GetVersionTag(FileHandle &handle) override; //! Returns the file last modified time of a file handle, returns timespec with zero on all attributes on error FileType GetFileType(FileHandle &handle) override; //! Truncate a file to a maximum size of new_size, new_size should be smaller than or equal to the current size of diff --git a/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp b/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp index 0a71c61f4..a71d9362a 100644 --- a/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp @@ -256,6 +256,8 @@ class ARTOperator { if (parent.get().GetType() == NType::PREFIX) { // We might have to compress: // PREFIX (greatgrandparent) - Node4 (grandparent) - PREFIX - INLINED_LEAF. + // The parent does not have to be passed in, as it is a child of the possibly being compressed N4. + // Then, when we delete that child, we also free it. Node::DeleteChild(art, grandparent, greatgrandparent, current_key.get()[grandparent_depth], status, row_id); return; diff --git a/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp b/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp index 835e32c0f..109497f2a 100644 --- a/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp @@ -48,7 +48,7 @@ class Prefix { //! Concatenates parent -> prev_node4 -> child. static void Concat(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte, - const GateStatus node4_status); + const GateStatus node4_status, const GateStatus status); //! Removes up to pos bytes from the prefix. //! Shifts all subsequent bytes by pos. Frees empty nodes. @@ -72,7 +72,7 @@ class Prefix { static Prefix GetTail(ART &art, const Node &node); static void ConcatInternal(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte, - const bool inside_gate); + const GateStatus status); static void ConcatNode4WasGate(ART &art, Node &node4, const Node child, uint8_t byte); static void ConcatChildIsGate(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte); static void ConcatOutsideGate(ART &art, Node &parent, Node &node4, const Node child, uint8_t byte); diff --git a/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp b/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp index 92f4f323c..d09e664e5 100644 --- a/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp @@ -169,7 +169,7 @@ class BoundIndex : public Index { //! Replay index insert and delete operations buffered during WAL replay. //! table_types has the physical types of the table in the order they appear, not logical (no generated columns). //! mapped_column_ids contains the sorted order of Indexed physical column ID's (see unbound_index.hpp comments). - void ApplyBufferedReplays(const vector &table_types, vector &buffered_replays, + void ApplyBufferedReplays(const vector &table_types, BufferedIndexReplays &buffered_replays, const vector &mapped_column_ids); protected: diff --git a/src/duckdb/src/include/duckdb/execution/index/unbound_index.hpp b/src/duckdb/src/include/duckdb/execution/index/unbound_index.hpp index 30c5917f4..0ca4aa9d2 100644 --- a/src/duckdb/src/include/duckdb/execution/index/unbound_index.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/unbound_index.hpp @@ -18,11 +18,43 @@ class ColumnDataCollection; enum class BufferedIndexReplay : uint8_t { INSERT_ENTRY = 0, DEL_ENTRY = 1 }; -struct BufferedIndexData { +struct ReplayRange { BufferedIndexReplay type; - unique_ptr data; + // [start, end) - start is inclusive, end is exclusive for the range within the ColumnDataCollection + // buffer for operations to replay for this range. + idx_t start; + idx_t end; + explicit ReplayRange(const BufferedIndexReplay replay_type, const idx_t start_p, const idx_t end_p) + : type(replay_type), start(start_p), end(end_p) { + } +}; + +// All inserts and deletes to be replayed are stored in their respective buffers. +// Since the inserts and deletes may be interleaved, however, ranges stores the ordering of operations +// and their offsets in the respective buffer. +// Simple example: +// ranges[0] - INSERT_ENTRY, [0,6) +// ranges[1] - DEL_ENTRY, [0,3) +// ranges[2] - INSERT_ENTRY [6,12) +// So even though the buffered_inserts has all the insert data from [0,12), ranges gives us the intervals for +// replaying the index operations in the right order. +struct BufferedIndexReplays { + vector ranges; + unique_ptr buffered_inserts; + unique_ptr buffered_deletes; + + BufferedIndexReplays() = default; + + unique_ptr &GetBuffer(const BufferedIndexReplay replay_type) { + if (replay_type == BufferedIndexReplay::INSERT_ENTRY) { + return buffered_inserts; + } + return buffered_deletes; + } - BufferedIndexData(BufferedIndexReplay replay_type, unique_ptr data_p); + bool HasBufferedReplays() const { + return !ranges.empty(); + } }; class UnboundIndex final : public Index { @@ -31,8 +63,9 @@ class UnboundIndex final : public Index { unique_ptr create_info; //! The serialized storage information of the index. IndexStorageInfo storage_info; - //! Buffer for WAL replays. - vector buffered_replays; + + //! Buffered for index operations during WAL replay. They are replayed upon index binding. + BufferedIndexReplays buffered_replays; //! Maps the column IDs in the buffered replays to a physical table offset. //! For example, column [i] in a buffered ColumnDataCollection is the data for an Indexed column with @@ -78,12 +111,13 @@ class UnboundIndex final : public Index { void BufferChunk(DataChunk &index_column_chunk, Vector &row_ids, const vector &mapped_column_ids_p, BufferedIndexReplay replay_type); bool HasBufferedReplays() const { - return !buffered_replays.empty(); + return buffered_replays.HasBufferedReplays(); } - vector &GetBufferedReplays() { + BufferedIndexReplays &GetBufferedReplays() { return buffered_replays; } + const vector &GetMappedColumnIds() const { return mapped_column_ids; } diff --git a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp index 2a123827c..5e36690ee 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp @@ -121,6 +121,8 @@ class BaseScanner { virtual ~BaseScanner() = default; + void Print() const; + //! Returns true if the scanner is finished bool FinishedFile() const; @@ -164,10 +166,15 @@ class BaseScanner { //! States CSVStates states; + //! If the scanner ever entered a quoted state bool ever_quoted = false; + //! If the scanner ever entered an escaped state. bool ever_escaped = false; + //! If the scanner ever used advantage of the non-strict mode. + bool used_unstrictness = false; + //! Shared pointer to the buffer_manager, this is shared across multiple scanners shared_ptr buffer_manager; @@ -302,6 +309,9 @@ class BaseScanner { !state_machine->dialect_options.state_machine_options.strict_mode.GetValue())) { // We only set the ever escaped variable if this is either a quote char OR strict mode is off ever_escaped = true; + if (states.states[0] == CSVState::UNQUOTED_ESCAPE) { + used_unstrictness = true; + } } ever_quoted = true; T::SetQuoted(result, iterator.pos.buffer_pos); @@ -332,11 +342,15 @@ class BaseScanner { break; } case CSVState::ESCAPE: - case CSVState::UNQUOTED_ESCAPE: case CSVState::ESCAPED_RETURN: T::SetEscaped(result); iterator.pos.buffer_pos++; break; + case CSVState::UNQUOTED_ESCAPE: + T::SetEscaped(result); + iterator.pos.buffer_pos++; + used_unstrictness = true; + break; case CSVState::STANDARD: { iterator.pos.buffer_pos++; while (iterator.pos.buffer_pos + 8 < to_pos) { diff --git a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp index 5b985e05c..8ab081ae3 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp @@ -116,6 +116,7 @@ class CSVSniffer { //! Highest number of columns found idx_t max_columns_found = 0; idx_t max_columns_found_error = 0; + bool best_candidate_is_strict = false; //! Current Candidates being considered vector> candidates; //! Reference to original CSV Options, it will be modified as a result of the sniffer. diff --git a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index bacabfc4f..158c7ec12 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -176,7 +176,7 @@ class StringValueResult : public ScannerResult { const shared_ptr &buffer_handle, Allocator &buffer_allocator, idx_t result_size_p, idx_t buffer_position, CSVErrorHandler &error_handler, CSVIterator &iterator, bool store_line_size, shared_ptr csv_file_scan, idx_t &lines_read, bool sniffing, - string path, idx_t scan_id); + const string &path, idx_t scan_id, bool &used_unstrictness); ~StringValueResult(); @@ -225,6 +225,7 @@ class StringValueResult : public ScannerResult { shared_ptr csv_file_scan; idx_t &lines_read; + bool &used_unstrictness; //! Information regarding projected columns unsafe_unique_array projected_columns; bool projecting_columns = false; diff --git a/src/duckdb/src/include/duckdb/main/extension_entries.hpp b/src/duckdb/src/include/duckdb/main/extension_entries.hpp index e3d3d80d6..1d5a8510e 100644 --- a/src/duckdb/src/include/duckdb/main/extension_entries.hpp +++ b/src/duckdb/src/include/duckdb/main/extension_entries.hpp @@ -1046,6 +1046,7 @@ static constexpr ExtensionEntry EXTENSION_SETTINGS[] = { {"http_timeout", "httpfs"}, {"httpfs_client_implementation", "httpfs"}, {"iceberg_via_aws_sdk_for_catalog_interactions", "iceberg"}, + {"merge_http_secret_into_s3_request", "httpfs"}, {"mysql_bit1_as_boolean", "mysql_scanner"}, {"mysql_debug_show_queries", "mysql_scanner"}, {"mysql_experimental_filter_pushdown", "mysql_scanner"}, diff --git a/src/duckdb/src/include/duckdb/parser/parsed_data/sample_options.hpp b/src/duckdb/src/include/duckdb/parser/parsed_data/sample_options.hpp index dadbcfe92..766345f0e 100644 --- a/src/duckdb/src/include/duckdb/parser/parsed_data/sample_options.hpp +++ b/src/duckdb/src/include/duckdb/parser/parsed_data/sample_options.hpp @@ -23,6 +23,9 @@ enum class SampleMethod : uint8_t { SYSTEM_SAMPLE = 0, BERNOULLI_SAMPLE = 1, RES string SampleMethodToString(SampleMethod method); class SampleOptions { +public: + // 1 billion rows should be enough. + static constexpr idx_t MAX_SAMPLE_ROWS = 1000000000; public: explicit SampleOptions(int64_t seed_ = -1); diff --git a/src/duckdb/src/include/duckdb/planner/subquery/flatten_dependent_join.hpp b/src/duckdb/src/include/duckdb/planner/subquery/flatten_dependent_join.hpp index 14ad4510c..5fa37d4ac 100644 --- a/src/duckdb/src/include/duckdb/planner/subquery/flatten_dependent_join.hpp +++ b/src/duckdb/src/include/duckdb/planner/subquery/flatten_dependent_join.hpp @@ -33,7 +33,7 @@ struct FlattenDependentJoins { bool parent_is_dependent_join = false); //! Mark entire subtree of Logical Operators as correlated by adding them to the has_correlated_expressions map. - bool MarkSubtreeCorrelated(LogicalOperator &op); + bool MarkSubtreeCorrelated(LogicalOperator &op, idx_t cte_index); //! Push the dependent join down a LogicalOperator unique_ptr PushDownDependentJoin(unique_ptr plan, diff --git a/src/duckdb/src/include/duckdb/planner/subquery/rewrite_cte_scan.hpp b/src/duckdb/src/include/duckdb/planner/subquery/rewrite_cte_scan.hpp index 72886f80e..323f3b9b4 100644 --- a/src/duckdb/src/include/duckdb/planner/subquery/rewrite_cte_scan.hpp +++ b/src/duckdb/src/include/duckdb/planner/subquery/rewrite_cte_scan.hpp @@ -17,13 +17,15 @@ namespace duckdb { //! Helper class to rewrite correlated cte scans within a single LogicalOperator class RewriteCTEScan : public LogicalOperatorVisitor { public: - RewriteCTEScan(idx_t table_index, const CorrelatedColumns &correlated_columns); + RewriteCTEScan(idx_t table_index, const CorrelatedColumns &correlated_columns, + bool rewrite_dependent_joins = false); void VisitOperator(LogicalOperator &op) override; private: idx_t table_index; const CorrelatedColumns &correlated_columns; + bool rewrite_dependent_joins = false; }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/storage/table/validity_column_data.hpp b/src/duckdb/src/include/duckdb/storage/table/validity_column_data.hpp index 286a5343b..25a016466 100644 --- a/src/duckdb/src/include/duckdb/storage/table/validity_column_data.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/validity_column_data.hpp @@ -23,6 +23,8 @@ class ValidityColumnData : public ColumnData { public: FilterPropagateResult CheckZonemap(ColumnScanState &state, TableFilter &filter) override; void AppendData(BaseStatistics &stats, ColumnAppendState &state, UnifiedVectorFormat &vdata, idx_t count) override; + void UpdateWithBase(TransactionData transaction, DataTable &data_table, idx_t column_index, Vector &update_vector, + row_t *row_ids, idx_t update_count, ColumnData &base); }; } // namespace duckdb diff --git a/src/duckdb/src/main/extension/extension_alias.cpp b/src/duckdb/src/main/extension/extension_alias.cpp index 81d3c1e1b..4a1ae7146 100644 --- a/src/duckdb/src/main/extension/extension_alias.cpp +++ b/src/duckdb/src/main/extension/extension_alias.cpp @@ -10,6 +10,7 @@ static const ExtensionAlias internal_aliases[] = {{"http", "httpfs"}, // httpfs {"postgres", "postgres_scanner"}, // postgres {"sqlite", "sqlite_scanner"}, // sqlite {"sqlite3", "sqlite_scanner"}, + {"uc_catalog", "unity_catalog"}, // old name for compatibility {nullptr, nullptr}}; idx_t ExtensionHelper::ExtensionAliasCount() { diff --git a/src/duckdb/src/parser/transform/helpers/transform_sample.cpp b/src/duckdb/src/parser/transform/helpers/transform_sample.cpp index bd1cc75a2..30d4748dd 100644 --- a/src/duckdb/src/parser/transform/helpers/transform_sample.cpp +++ b/src/duckdb/src/parser/transform/helpers/transform_sample.cpp @@ -44,8 +44,9 @@ unique_ptr Transformer::TransformSampleOptions(optional_ptr(); - if (rows < 0) { - throw ParserException("Sample rows %lld out of range, must be bigger than or equal to 0", rows); + if (rows < 0 || sample_value.GetValue() > SampleOptions::MAX_SAMPLE_ROWS) { + throw ParserException("Sample rows %lld out of range, must be between 0 and %lld", rows, + SampleOptions::MAX_SAMPLE_ROWS); } result->sample_size = Value::BIGINT(rows); result->method = SampleMethod::RESERVOIR_SAMPLE; diff --git a/src/duckdb/src/planner/binder/statement/bind_insert.cpp b/src/duckdb/src/planner/binder/statement/bind_insert.cpp index 8a9009562..96222f0c5 100644 --- a/src/duckdb/src/planner/binder/statement/bind_insert.cpp +++ b/src/duckdb/src/planner/binder/statement/bind_insert.cpp @@ -465,7 +465,11 @@ unique_ptr Binder::GenerateMergeInto(InsertStatement &stmt, if (on_conflict_info.action_type == OnConflictAction::REPLACE) { D_ASSERT(!on_conflict_info.set_info); - on_conflict_info.set_info = CreateSetInfoForReplace(table, stmt, storage_info); + // For BY POSITION, create explicit SET information + // For BY NAME, leave it empty and let bind_merge_into handle it automatically + if (stmt.column_order != InsertColumnOrder::INSERT_BY_NAME) { + on_conflict_info.set_info = CreateSetInfoForReplace(table, stmt, storage_info); + } on_conflict_info.action_type = OnConflictAction::UPDATE; } // now set up the merge actions @@ -484,16 +488,19 @@ unique_ptr Binder::GenerateMergeInto(InsertStatement &stmt, // when doing UPDATE set up the when matched action auto update_action = make_uniq(); update_action->action_type = MergeActionType::MERGE_UPDATE; - for (auto &col : on_conflict_info.set_info->expressions) { - vector> lambda_params; - DoUpdateSetQualify(col, table_name, lambda_params); - } - if (on_conflict_info.set_info->condition) { - vector> lambda_params; - DoUpdateSetQualify(on_conflict_info.set_info->condition, table_name, lambda_params); - update_action->condition = std::move(on_conflict_info.set_info->condition); + update_action->column_order = stmt.column_order; + if (on_conflict_info.set_info) { + for (auto &col : on_conflict_info.set_info->expressions) { + vector> lambda_params; + DoUpdateSetQualify(col, table_name, lambda_params); + } + if (on_conflict_info.set_info->condition) { + vector> lambda_params; + DoUpdateSetQualify(on_conflict_info.set_info->condition, table_name, lambda_params); + update_action->condition = std::move(on_conflict_info.set_info->condition); + } + update_action->update_info = std::move(on_conflict_info.set_info); } - update_action->update_info = std::move(on_conflict_info.set_info); merge_into->actions[MergeActionCondition::WHEN_MATCHED].push_back(std::move(update_action)); } diff --git a/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp b/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp index a4a00f185..a9169787d 100644 --- a/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp +++ b/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp @@ -236,6 +236,16 @@ bool FlattenDependentJoins::DetectCorrelatedExpressions(LogicalOperator &op, boo if (DetectCorrelatedExpressions(*child, lateral, new_lateral_depth, condition)) { has_correlation = true; } + + if (op.type == LogicalOperatorType::LOGICAL_MATERIALIZED_CTE && child_idx == 0) { + auto &setop = op.Cast(); + binder.recursive_ctes[setop.table_index] = &setop; + has_correlated_expressions[op] = has_correlation; + if (has_correlation) { + setop.correlated_columns = correlated_columns; + } + } + child_idx++; } @@ -261,6 +271,7 @@ bool FlattenDependentJoins::DetectCorrelatedExpressions(LogicalOperator &op, boo return true; } // Found a materialized CTE, subtree correlation depends on the CTE node + has_correlated_expressions[op] = has_correlated_expressions[*cte_node]; return has_correlated_expressions[*cte_node]; } // No CTE found: subtree is correlated @@ -279,47 +290,32 @@ bool FlattenDependentJoins::DetectCorrelatedExpressions(LogicalOperator &op, boo binder.recursive_ctes[setop.table_index] = &setop; if (has_correlation) { setop.correlated_columns = correlated_columns; - MarkSubtreeCorrelated(*op.children[1].get()); - } - } - - if (op.type == LogicalOperatorType::LOGICAL_MATERIALIZED_CTE) { - auto &setop = op.Cast(); - binder.recursive_ctes[setop.table_index] = &setop; - // only mark the entire subtree as correlated if the materializing side is correlated - auto entry = has_correlated_expressions.find(*op.children[0]); - if (entry != has_correlated_expressions.end()) { - if (has_correlation && entry->second) { - setop.correlated_columns = correlated_columns; - MarkSubtreeCorrelated(*op.children[1].get()); - } + MarkSubtreeCorrelated(*op.children[1].get(), setop.table_index); } } return has_correlation; } -bool FlattenDependentJoins::MarkSubtreeCorrelated(LogicalOperator &op) { +bool FlattenDependentJoins::MarkSubtreeCorrelated(LogicalOperator &op, idx_t cte_index) { // Do not mark base table scans as correlated auto entry = has_correlated_expressions.find(op); D_ASSERT(entry != has_correlated_expressions.end()); bool has_correlation = entry->second; for (auto &child : op.children) { - has_correlation |= MarkSubtreeCorrelated(*child.get()); + has_correlation |= MarkSubtreeCorrelated(*child.get(), cte_index); } if (op.type != LogicalOperatorType::LOGICAL_GET || op.children.size() == 1) { if (op.type == LogicalOperatorType::LOGICAL_CTE_REF) { // There may be multiple recursive CTEs. Only mark CTE_REFs as correlated, // IFF the CTE that we are reading from is correlated. auto &cteref = op.Cast(); - auto cte = binder.recursive_ctes.find(cteref.cte_index); - bool has_correlation = false; - if (cte != binder.recursive_ctes.end()) { - auto &rec_cte = cte->second->Cast(); - has_correlation = !rec_cte.correlated_columns.empty(); + if (cteref.cte_index != cte_index) { + has_correlated_expressions[op] = has_correlation; + return has_correlation; } - has_correlated_expressions[op] = has_correlation; - return has_correlation; + has_correlated_expressions[op] = true; + return true; } else { has_correlated_expressions[op] = has_correlation; } @@ -695,6 +691,42 @@ unique_ptr FlattenDependentJoins::PushDownDependentJoinInternal return plan; } } else if (join.join_type == JoinType::MARK) { + if (!left_has_correlation && right_has_correlation) { + // found a MARK join where the left side has no correlation + + ColumnBinding right_binding; + + // there may still be correlation on the right side that we have to deal with + // push into the right side if necessary or decorrelate it independently otherwise + plan->children[1] = PushDownDependentJoinInternal(std::move(plan->children[1]), + parent_propagate_null_values, lateral_depth); + right_binding = this->base_binding; + + // now push into the left side of the MARK join even though it has no correlation + // this is necessary to add the correlated columns to the column bindings and allow + // the join condition to be rewritten correctly + plan->children[0] = PushDownDependentJoinInternal(std::move(plan->children[0]), + parent_propagate_null_values, lateral_depth); + + auto left_binding = this->base_binding; + + // add the correlated columns to the join conditions + for (idx_t i = 0; i < correlated_columns.size(); i++) { + JoinCondition cond; + cond.left = make_uniq( + correlated_columns[i].type, + ColumnBinding(left_binding.table_index, left_binding.column_index + i)); + cond.right = make_uniq( + correlated_columns[i].type, + ColumnBinding(right_binding.table_index, right_binding.column_index + i)); + cond.comparison = ExpressionType::COMPARE_NOT_DISTINCT_FROM; + + auto &comparison_join = join.Cast(); + comparison_join.conditions.push_back(std::move(cond)); + } + return plan; + } + // push the child into the LHS plan->children[0] = PushDownDependentJoinInternal(std::move(plan->children[0]), parent_propagate_null_values, lateral_depth); @@ -1031,7 +1063,8 @@ unique_ptr FlattenDependentJoins::PushDownDependentJoinInternal } } - RewriteCTEScan cte_rewriter(table_index, correlated_columns); + RewriteCTEScan cte_rewriter(table_index, correlated_columns, + plan->type == LogicalOperatorType::LOGICAL_RECURSIVE_CTE); cte_rewriter.VisitOperator(*plan->children[1]); parent_propagate_null_values = false; diff --git a/src/duckdb/src/planner/subquery/rewrite_cte_scan.cpp b/src/duckdb/src/planner/subquery/rewrite_cte_scan.cpp index f846d9b36..7df4f13a8 100644 --- a/src/duckdb/src/planner/subquery/rewrite_cte_scan.cpp +++ b/src/duckdb/src/planner/subquery/rewrite_cte_scan.cpp @@ -14,8 +14,10 @@ namespace duckdb { -RewriteCTEScan::RewriteCTEScan(idx_t table_index, const CorrelatedColumns &correlated_columns) - : table_index(table_index), correlated_columns(correlated_columns) { +RewriteCTEScan::RewriteCTEScan(idx_t table_index, const CorrelatedColumns &correlated_columns, + bool rewrite_dependent_joins) + : table_index(table_index), correlated_columns(correlated_columns), + rewrite_dependent_joins(rewrite_dependent_joins) { } void RewriteCTEScan::VisitOperator(LogicalOperator &op) { @@ -29,7 +31,7 @@ void RewriteCTEScan::VisitOperator(LogicalOperator &op) { } cteref.correlated_columns += correlated_columns.size(); } - } else if (op.type == LogicalOperatorType::LOGICAL_DEPENDENT_JOIN) { + } else if (op.type == LogicalOperatorType::LOGICAL_DEPENDENT_JOIN && rewrite_dependent_joins) { // There is another DependentJoin below the correlated recursive CTE. // We have to add the correlated columns of the recursive CTE to the // set of columns of this operator. diff --git a/src/duckdb/src/storage/table/column_data.cpp b/src/duckdb/src/storage/table/column_data.cpp index a1aa2b1c9..cf664549b 100644 --- a/src/duckdb/src/storage/table/column_data.cpp +++ b/src/duckdb/src/storage/table/column_data.cpp @@ -588,7 +588,6 @@ void ColumnData::Update(TransactionData transaction, DataTable &data_table, idx_ Vector base_vector(type); ColumnScanState state; FetchUpdateData(state, row_ids, base_vector); - UpdateInternal(transaction, data_table, column_index, update_vector, row_ids, update_count, base_vector); } diff --git a/src/duckdb/src/storage/table/column_data_checkpointer.cpp b/src/duckdb/src/storage/table/column_data_checkpointer.cpp index 5334bd1af..198a7e249 100644 --- a/src/duckdb/src/storage/table/column_data_checkpointer.cpp +++ b/src/duckdb/src/storage/table/column_data_checkpointer.cpp @@ -363,7 +363,7 @@ void ColumnDataCheckpointer::WriteToDisk() { } bool ColumnDataCheckpointer::HasChanges(ColumnData &col_data) { - return col_data.HasChanges(); + return col_data.HasAnyChanges(); } void ColumnDataCheckpointer::WritePersistentSegments(ColumnCheckpointState &state) { diff --git a/src/duckdb/src/storage/table/standard_column_data.cpp b/src/duckdb/src/storage/table/standard_column_data.cpp index 6c6cdf3a3..42fe46cf2 100644 --- a/src/duckdb/src/storage/table/standard_column_data.cpp +++ b/src/duckdb/src/storage/table/standard_column_data.cpp @@ -170,12 +170,12 @@ void StandardColumnData::UpdateColumn(TransactionData transaction, DataTable &da const vector &column_path, Vector &update_vector, row_t *row_ids, idx_t update_count, idx_t depth) { if (depth >= column_path.size()) { - // update this column + // Update the column. ColumnData::Update(transaction, data_table, column_path[0], update_vector, row_ids, update_count); - } else { - // update the child column (i.e. the validity column) - validity.UpdateColumn(transaction, data_table, column_path, update_vector, row_ids, update_count, depth + 1); + return; } + // Update the child column, which is the validity column. + validity.UpdateWithBase(transaction, data_table, column_path[0], update_vector, row_ids, update_count, *this); } unique_ptr StandardColumnData::GetUpdateStatistics() { @@ -200,8 +200,8 @@ void StandardColumnData::FetchRow(TransactionData transaction, ColumnFetchState auto child_state = make_uniq(); state.child_states.push_back(std::move(child_state)); } - validity.FetchRow(transaction, *state.child_states[0], row_id, result, result_idx); ColumnData::FetchRow(transaction, state, row_id, result, result_idx); + validity.FetchRow(transaction, *state.child_states[0], row_id, result, result_idx); } void StandardColumnData::CommitDropColumn() { diff --git a/src/duckdb/src/storage/table/validity_column_data.cpp b/src/duckdb/src/storage/table/validity_column_data.cpp index fc8a9e1ea..6f6e32576 100644 --- a/src/duckdb/src/storage/table/validity_column_data.cpp +++ b/src/duckdb/src/storage/table/validity_column_data.cpp @@ -1,6 +1,7 @@ #include "duckdb/storage/table/validity_column_data.hpp" #include "duckdb/storage/table/scan_state.hpp" #include "duckdb/storage/table/update_segment.hpp" +#include "duckdb/storage/table/standard_column_data.hpp" namespace duckdb { @@ -13,6 +14,22 @@ FilterPropagateResult ValidityColumnData::CheckZonemap(ColumnScanState &state, T return FilterPropagateResult::NO_PRUNING_POSSIBLE; } +void ValidityColumnData::UpdateWithBase(TransactionData transaction, DataTable &data_table, idx_t column_index, + Vector &update_vector, row_t *row_ids, idx_t update_count, ColumnData &base) { + Vector base_vector(base.type); + ColumnScanState validity_scan_state; + FetchUpdateData(validity_scan_state, row_ids, base_vector); + + if (validity_scan_state.current->GetCompressionFunction().type == CompressionType::COMPRESSION_EMPTY) { + // The validity is actually covered by the data, so we read it to get the validity for UpdateInternal. + ColumnScanState data_scan_state; + auto fetch_count = base.Fetch(data_scan_state, row_ids[0], base_vector); + base_vector.Flatten(fetch_count); + } + + UpdateInternal(transaction, data_table, column_index, update_vector, row_ids, update_count, base_vector); +} + void ValidityColumnData::AppendData(BaseStatistics &stats, ColumnAppendState &state, UnifiedVectorFormat &vdata, idx_t count) { lock_guard l(stats_lock);