Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 39 additions & 39 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -347,48 +347,48 @@ set(DUCKDB_SRC_FILES
src/duckdb/third_party/zstd/dict/divsufsort.cpp
src/duckdb/third_party/zstd/dict/fastcover.cpp
src/duckdb/third_party/zstd/dict/zdict.cpp
src/duckdb/extension/core_functions/core_functions_extension.cpp
src/duckdb/extension/core_functions/function_list.cpp
src/duckdb/extension/core_functions/lambda_functions.cpp
src/duckdb/ub_extension_core_functions_aggregate_algebraic.cpp
src/duckdb/extension/core_functions/function_list.cpp
src/duckdb/extension/core_functions/core_functions_extension.cpp
src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp
src/duckdb/ub_extension_core_functions_aggregate_nested.cpp
src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp
src/duckdb/ub_extension_core_functions_aggregate_regression.cpp
src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp
src/duckdb/ub_extension_core_functions_scalar_generic.cpp
src/duckdb/ub_extension_core_functions_scalar_array.cpp
src/duckdb/ub_extension_core_functions_scalar_random.cpp
src/duckdb/ub_extension_core_functions_scalar_list.cpp
src/duckdb/ub_extension_core_functions_scalar_struct.cpp
src/duckdb/ub_extension_core_functions_scalar_date.cpp
src/duckdb/ub_extension_core_functions_scalar_enum.cpp
src/duckdb/ub_extension_core_functions_aggregate_algebraic.cpp
src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp
src/duckdb/ub_extension_core_functions_scalar_string.cpp
src/duckdb/ub_extension_core_functions_scalar_bit.cpp
src/duckdb/ub_extension_core_functions_scalar_operators.cpp
src/duckdb/ub_extension_core_functions_scalar_enum.cpp
src/duckdb/ub_extension_core_functions_scalar_map.cpp
src/duckdb/ub_extension_core_functions_scalar_random.cpp
src/duckdb/ub_extension_core_functions_scalar_math.cpp
src/duckdb/ub_extension_core_functions_scalar_string.cpp
src/duckdb/ub_extension_core_functions_scalar_union.cpp
src/duckdb/ub_extension_core_functions_scalar_generic.cpp
src/duckdb/ub_extension_core_functions_scalar_struct.cpp
src/duckdb/ub_extension_core_functions_scalar_list.cpp
src/duckdb/ub_extension_core_functions_scalar_array.cpp
src/duckdb/ub_extension_core_functions_scalar_debug.cpp
src/duckdb/ub_extension_core_functions_scalar_bit.cpp
src/duckdb/ub_extension_core_functions_scalar_blob.cpp
src/duckdb/ub_extension_core_functions_scalar_union.cpp
src/duckdb/ub_extension_core_functions_scalar_map.cpp
src/duckdb/extension/parquet/parquet_crypto.cpp
src/duckdb/extension/parquet/parquet_reader.cpp
src/duckdb/extension/parquet/parquet_metadata.cpp
src/duckdb/ub_extension_core_functions_scalar_date.cpp
src/duckdb/extension/parquet/parquet_writer.cpp
src/duckdb/extension/parquet/zstd_file_system.cpp
src/duckdb/extension/parquet/parquet_crypto.cpp
src/duckdb/extension/parquet/parquet_reader.cpp
src/duckdb/extension/parquet/parquet_timestamp.cpp
src/duckdb/extension/parquet/parquet_float16.cpp
src/duckdb/extension/parquet/parquet_statistics.cpp
src/duckdb/extension/parquet/parquet_multi_file_info.cpp
src/duckdb/extension/parquet/column_writer.cpp
src/duckdb/extension/parquet/column_reader.cpp
src/duckdb/extension/parquet/geo_parquet.cpp
src/duckdb/extension/parquet/parquet_file_metadata_cache.cpp
src/duckdb/extension/parquet/parquet_statistics.cpp
src/duckdb/extension/parquet/parquet_extension.cpp
src/duckdb/extension/parquet/parquet_float16.cpp
src/duckdb/extension/parquet/column_writer.cpp
src/duckdb/extension/parquet/parquet_file_metadata_cache.cpp
src/duckdb/extension/parquet/serialize_parquet.cpp
src/duckdb/extension/parquet/column_reader.cpp
src/duckdb/extension/parquet/parquet_metadata.cpp
src/duckdb/ub_extension_parquet_decoder.cpp
src/duckdb/ub_extension_parquet_writer.cpp
src/duckdb/ub_extension_parquet_reader.cpp
src/duckdb/ub_extension_parquet_reader_variant.cpp
src/duckdb/ub_extension_parquet_writer.cpp
src/duckdb/third_party/parquet/parquet_types.cpp
src/duckdb/third_party/thrift/thrift/protocol/TProtocol.cpp
src/duckdb/third_party/thrift/thrift/transport/TTransportException.cpp
Expand Down Expand Up @@ -427,32 +427,32 @@ set(DUCKDB_SRC_FILES
src/duckdb/third_party/brotli/enc/metablock.cpp
src/duckdb/third_party/brotli/enc/static_dict.cpp
src/duckdb/third_party/brotli/enc/utf8_util.cpp
src/duckdb/extension/icu/./icu-datetrunc.cpp
src/duckdb/extension/icu/./icu-timezone.cpp
src/duckdb/extension/icu/./icu-current.cpp
src/duckdb/extension/icu/./icu-list-range.cpp
src/duckdb/extension/icu/./icu-datefunc.cpp
src/duckdb/extension/icu/./icu-strptime.cpp
src/duckdb/extension/icu/./icu-dateadd.cpp
src/duckdb/extension/icu/./icu_extension.cpp
src/duckdb/extension/icu/./icu-timebucket.cpp
src/duckdb/extension/icu/./icu-table-range.cpp
src/duckdb/extension/icu/./icu-strptime.cpp
src/duckdb/extension/icu/./icu-datepart.cpp
src/duckdb/extension/icu/./icu-makedate.cpp
src/duckdb/extension/icu/./icu-datefunc.cpp
src/duckdb/extension/icu/./icu-current.cpp
src/duckdb/extension/icu/./icu-timezone.cpp
src/duckdb/extension/icu/./icu-table-range.cpp
src/duckdb/extension/icu/./icu-dateadd.cpp
src/duckdb/extension/icu/./icu-timebucket.cpp
src/duckdb/extension/icu/./icu-datesub.cpp
src/duckdb/extension/icu/./icu-datetrunc.cpp
src/duckdb/extension/icu/./icu-list-range.cpp
src/duckdb/ub_extension_icu_third_party_icu_common.cpp
src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp
src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp
src/duckdb/extension/json/json_reader.cpp
src/duckdb/extension/json/json_functions.cpp
src/duckdb/extension/json/json_extension.cpp
src/duckdb/extension/json/json_scan.cpp
src/duckdb/extension/json/json_multi_file_info.cpp
src/duckdb/extension/json/json_enums.cpp
src/duckdb/extension/json/serialize_json.cpp
src/duckdb/extension/json/json_common.cpp
src/duckdb/extension/json/json_scan.cpp
src/duckdb/extension/json/json_enums.cpp
src/duckdb/extension/json/json_reader.cpp
src/duckdb/extension/json/json_serializer.cpp
src/duckdb/extension/json/json_extension.cpp
src/duckdb/extension/json/json_deserializer.cpp
src/duckdb/extension/json/json_serializer.cpp
src/duckdb/ub_extension_json_json_functions.cpp)

set(JEMALLOC_SRC_FILES
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,7 @@ unique_ptr<FunctionData> CurrentSettingBind(ClientContext &context, ScalarFuncti
if (!context.TryGetCurrentSetting(key, val)) {
auto extension_name = Catalog::AutoloadExtensionByConfigName(context, key);
// If autoloader didn't throw, the config is now available
if (!context.TryGetCurrentSetting(key, val)) {
throw InternalException("Extension %s did not provide the '%s' config setting",
extension_name.ToStdString(), key);
}
context.TryGetCurrentSetting(key, val);
}

bound_function.return_type = val.type();
Expand Down
3 changes: 2 additions & 1 deletion src/duckdb/extension/icu/icu-strptime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,9 @@ struct ICUStrptime : public ICUDateFunc {
if (!error.empty()) {
throw InvalidInputException("Failed to parse format specifier %s: %s", format_string, error);
}
// If any format has UTC offsets, then we have to produce TSTZ
// If any format has UTC offsets or names, then we have to produce TSTZ
has_tz = has_tz || format.HasFormatSpecifier(StrTimeSpecifier::TZ_NAME);
has_tz = has_tz || format.HasFormatSpecifier(StrTimeSpecifier::UTC_OFFSET);
formats.emplace_back(format);
}
if (has_tz) {
Expand Down
6 changes: 2 additions & 4 deletions src/duckdb/extension/json/include/json_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "duckdb/common/operator/string_cast.hpp"
#include "duckdb/planner/expression/bound_function_expression.hpp"
#include "yyjson.hpp"
#include "duckdb/common/types/blob.hpp"

using namespace duckdb_yyjson; // NOLINT

Expand Down Expand Up @@ -228,11 +229,8 @@ struct JSONCommon {

static string FormatParseError(const char *data, idx_t length, yyjson_read_err &error, const string &extra = "") {
D_ASSERT(error.code != YYJSON_READ_SUCCESS);
// Go to blob so we can have a better error message for weird strings
auto blob = Value::BLOB(string(data, length));
// Truncate, so we don't print megabytes worth of JSON
string input = blob.ToString();
input = input.length() > 50 ? string(input.c_str(), 47) + "..." : input;
auto input = length > 50 ? string(data, 47) + "..." : string(data, length);
// Have to replace \r, otherwise output is unreadable
input = StringUtil::Replace(input, "\r", "\\r");
return StringUtil::Format("Malformed JSON at byte %lld of input: %s. %s Input: \"%s\"", error.pos, error.msg,
Expand Down
6 changes: 5 additions & 1 deletion src/duckdb/extension/json/json_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,11 @@ void JSONFunctions::RegisterSimpleCastFunctions(ExtensionLoader &loader) {
loader.RegisterCastFunction(LogicalType::LIST(LogicalType::JSON()), LogicalTypeId::VARCHAR, CastJSONListToVarchar,
json_list_to_varchar_cost);

// VARCHAR to JSON[] (also needs a special case otherwise get a VARCHAR -> VARCHAR[] cast first)
// JSON[] to JSON is allowed implicitly
loader.RegisterCastFunction(LogicalType::LIST(LogicalType::JSON()), LogicalType::JSON(), CastJSONListToVarchar,
100);

// VARCHAR to JSON[] (also needs a special case otherwise we get a VARCHAR -> VARCHAR[] cast first)
const auto varchar_to_json_list_cost =
CastFunctionSet::ImplicitCastCost(db, LogicalType::VARCHAR, LogicalType::LIST(LogicalType::JSON())) - 1;
BoundCastInfo varchar_to_json_list_info(CastVarcharToJSONList, nullptr, JSONFunctionLocalState::InitCastLocalState);
Expand Down
37 changes: 23 additions & 14 deletions src/duckdb/extension/parquet/column_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,9 +187,12 @@ void ColumnWriter::HandleRepeatLevels(ColumnWriterState &state, ColumnWriterStat
// no repeat levels without a parent node
return;
}
while (state.repetition_levels.size() < parent->repetition_levels.size()) {
state.repetition_levels.push_back(parent->repetition_levels[state.repetition_levels.size()]);
if (state.repetition_levels.size() >= parent->repetition_levels.size()) {
return;
}
state.repetition_levels.insert(state.repetition_levels.end(),
parent->repetition_levels.begin() + state.repetition_levels.size(),
parent->repetition_levels.end());
}

void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterState *parent, const ValidityMask &validity,
Expand All @@ -200,36 +203,41 @@ void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterStat
while (state.definition_levels.size() < parent->definition_levels.size()) {
idx_t current_index = state.definition_levels.size();
if (parent->definition_levels[current_index] != PARQUET_DEFINE_VALID) {
//! Inherit nulls from parent
state.definition_levels.push_back(parent->definition_levels[current_index]);
state.parent_null_count++;
} else if (validity.RowIsValid(vector_index)) {
//! Produce a non-null define
state.definition_levels.push_back(define_value);
} else {
//! Produce a null define
if (!can_have_nulls) {
throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
}
state.null_count++;
state.definition_levels.push_back(null_value);
}
D_ASSERT(parent->is_empty.empty() || current_index < parent->is_empty.size());
if (parent->is_empty.empty() || !parent->is_empty[current_index]) {
vector_index++;
}
}
return;
}

// no parent: set definition levels only from this validity mask
if (validity.AllValid()) {
state.definition_levels.insert(state.definition_levels.end(), count, define_value);
} else {
// no parent: set definition levels only from this validity mask
if (validity.AllValid()) {
state.definition_levels.insert(state.definition_levels.end(), count, define_value);
} else {
for (idx_t i = 0; i < count; i++) {
const auto is_null = !validity.RowIsValid(i);
state.definition_levels.emplace_back(is_null ? null_value : define_value);
state.null_count += is_null;
}
}
if (!can_have_nulls && state.null_count != 0) {
throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
for (idx_t i = 0; i < count; i++) {
const auto is_null = !validity.RowIsValid(i);
state.definition_levels.emplace_back(is_null ? null_value : define_value);
state.null_count += is_null;
}
}
if (!can_have_nulls && state.null_count != 0) {
throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
}
}

//===--------------------------------------------------------------------===//
Expand Down Expand Up @@ -368,6 +376,7 @@ ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::Schem
}
return map_column;
}

duckdb_parquet::SchemaElement schema_element;
schema_element.type = ParquetWriter::DuckDBTypeToParquetType(type);
schema_element.repetition_type = null_type;
Expand Down
2 changes: 1 addition & 1 deletion src/duckdb/extension/parquet/include/column_writer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class ColumnWriterState {

unsafe_vector<uint16_t> definition_levels;
unsafe_vector<uint16_t> repetition_levels;
vector<bool> is_empty;
unsafe_vector<uint8_t> is_empty;
idx_t parent_null_count = 0;
idx_t null_count = 0;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,25 @@
namespace duckdb {

class StringColumnReader : public ColumnReader {
//! Classification of the column's logical type (VARCHAR, JSON, or anything else),
//! computed by GetStringColumnType
enum class StringColumnType : uint8_t { VARCHAR, JSON, OTHER };

//! Maps a logical type onto its StringColumnType classification.
//! The JSON test runs first, so a type for which both tests would hold is classified as JSON;
//! anything that is neither JSON nor VARCHAR is reported as OTHER.
static StringColumnType GetStringColumnType(const LogicalType &type) {
	const bool is_json = type.IsJSONType();
	if (is_json) {
		return StringColumnType::JSON;
	}
	// Not JSON: distinguish plain VARCHAR from every other type
	return type.id() == LogicalTypeId::VARCHAR ? StringColumnType::VARCHAR : StringColumnType::OTHER;
}

public:
static constexpr const PhysicalType TYPE = PhysicalType::VARCHAR;

public:
StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema);
idx_t fixed_width_string_length;
const StringColumnType string_column_type;

public:
static void VerifyString(const char *str_data, uint32_t str_len, const bool isVarchar);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ class ArrayColumnWriter : public ListColumnWriter {
void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
bool vector_can_span_multiple_pages) override;
void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;

protected:
void WriteArrayState(ListColumnWriterState &state, idx_t array_size, uint16_t first_repeat_level,
idx_t define_value, const bool is_empty = false);
};

} // namespace duckdb
5 changes: 4 additions & 1 deletion src/duckdb/extension/parquet/parquet_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,10 @@ ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_d

auto file_meta_data = GetFileMetadata();
D_ASSERT(file_meta_data);
D_ASSERT(next_schema_idx < file_meta_data->schema.size());
if (next_schema_idx >= file_meta_data->schema.size()) {
throw InvalidInputException("Malformed Parquet schema in file \"%s\": invalid schema index %d", file.path,
next_schema_idx);
}
auto &s_ele = file_meta_data->schema[next_schema_idx];
auto this_idx = next_schema_idx;

Expand Down
12 changes: 5 additions & 7 deletions src/duckdb/extension/parquet/parquet_statistics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -395,23 +395,21 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
}
break;
case LogicalTypeId::VARCHAR: {
auto string_stats = StringStats::CreateEmpty(type);
auto string_stats = StringStats::CreateUnknown(type);
if (parquet_stats.__isset.min_value) {
StringColumnReader::VerifyString(parquet_stats.min_value.c_str(), parquet_stats.min_value.size(), true);
StringStats::Update(string_stats, parquet_stats.min_value);
StringStats::SetMin(string_stats, parquet_stats.min_value);
} else if (parquet_stats.__isset.min) {
StringColumnReader::VerifyString(parquet_stats.min.c_str(), parquet_stats.min.size(), true);
StringStats::Update(string_stats, parquet_stats.min);
StringStats::SetMin(string_stats, parquet_stats.min);
}
if (parquet_stats.__isset.max_value) {
StringColumnReader::VerifyString(parquet_stats.max_value.c_str(), parquet_stats.max_value.size(), true);
StringStats::Update(string_stats, parquet_stats.max_value);
StringStats::SetMax(string_stats, parquet_stats.max_value);
} else if (parquet_stats.__isset.max) {
StringColumnReader::VerifyString(parquet_stats.max.c_str(), parquet_stats.max.size(), true);
StringStats::Update(string_stats, parquet_stats.max);
StringStats::SetMax(string_stats, parquet_stats.max);
}
StringStats::SetContainsUnicode(string_stats);
StringStats::ResetMaxStringLength(string_stats);
row_group_stats = string_stats.ToUnique();
break;
}
Expand Down
21 changes: 17 additions & 4 deletions src/duckdb/extension/parquet/reader/string_column_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace duckdb {
// String Column Reader
//===--------------------------------------------------------------------===//
StringColumnReader::StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
: ColumnReader(reader, schema) {
: ColumnReader(reader, schema), string_column_type(GetStringColumnType(Type())) {
fixed_width_string_length = 0;
if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) {
fixed_width_string_length = schema.type_length;
Expand All @@ -26,13 +26,26 @@ void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len, co
size_t pos;
auto utf_type = Utf8Proc::Analyze(str_data, str_len, &reason, &pos);
if (utf_type == UnicodeType::INVALID) {
throw InvalidInputException("Invalid string encoding found in Parquet file: value \"" +
Blob::ToString(string_t(str_data, str_len)) + "\" is not valid UTF8!");
throw InvalidInputException("Invalid string encoding found in Parquet file: value \"%s\" is not valid UTF8!",
Blob::ToString(string_t(str_data, str_len)));
}
}

void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len) {
VerifyString(str_data, str_len, Type().id() == LogicalTypeId::VARCHAR);
switch (string_column_type) {
case StringColumnType::VARCHAR:
VerifyString(str_data, str_len, true);
break;
case StringColumnType::JSON: {
const auto error = StringUtil::ValidateJSON(str_data, str_len);
if (!error.empty()) {
throw InvalidInputException("Invalid JSON found in Parquet file: %s", error);
}
break;
}
default:
break;
}
}

class ParquetStringVectorBuffer : public VectorBuffer {
Expand Down
Loading