diff --git a/src/duckdb/extension/json/json_extension.cpp b/src/duckdb/extension/json/json_extension.cpp index d609fd82e..23ca6c014 100644 --- a/src/duckdb/extension/json/json_extension.cpp +++ b/src/duckdb/extension/json/json_extension.cpp @@ -68,7 +68,13 @@ void JsonExtension::Load(DuckDB &db) { // JSON copy function auto copy_fun = JSONFunctions::GetJSONCopyFunction(); - ExtensionUtil::RegisterFunction(db_instance, std::move(copy_fun)); + ExtensionUtil::RegisterFunction(db_instance, copy_fun); + copy_fun.extension = "ndjson"; + copy_fun.name = "ndjson"; + ExtensionUtil::RegisterFunction(db_instance, copy_fun); + copy_fun.extension = "jsonl"; + copy_fun.name = "jsonl"; + ExtensionUtil::RegisterFunction(db_instance, copy_fun); // JSON macro's for (idx_t index = 0; json_macros[index].name != nullptr; index++) { diff --git a/src/duckdb/src/catalog/catalog_set.cpp b/src/duckdb/src/catalog/catalog_set.cpp index f9e20758a..270efff1a 100644 --- a/src/duckdb/src/catalog/catalog_set.cpp +++ b/src/duckdb/src/catalog/catalog_set.cpp @@ -721,6 +721,11 @@ void CatalogSet::Scan(const std::function &callback) { } } +void CatalogSet::SetDefaultGenerator(unique_ptr defaults_p) { + lock_guard lock(catalog_lock); + defaults = std::move(defaults_p); +} + void CatalogSet::Verify(Catalog &catalog_p) { D_ASSERT(&catalog_p == &catalog); vector> entries; diff --git a/src/duckdb/src/catalog/duck_catalog.cpp b/src/duckdb/src/catalog/duck_catalog.cpp index 558a0fec9..d98c82547 100644 --- a/src/duckdb/src/catalog/duck_catalog.cpp +++ b/src/duckdb/src/catalog/duck_catalog.cpp @@ -30,6 +30,7 @@ void DuckCatalog::Initialize(bool load_builtin) { CreateSchemaInfo info; info.schema = DEFAULT_SCHEMA; info.internal = true; + info.on_conflict = OnCreateConflict::IGNORE_ON_CONFLICT; CreateSchema(data, info); if (load_builtin) { diff --git a/src/duckdb/src/common/enum_util.cpp b/src/duckdb/src/common/enum_util.cpp index c51440d53..3103e80e0 100644 --- a/src/duckdb/src/common/enum_util.cpp +++ b/src/duckdb/src/common/enum_util.cpp @@ -1032,19 +1032,20 @@ const StringUtil::EnumStringLiteral *GetDataFileTypeValues() { { static_cast(DataFileType::FILE_DOES_NOT_EXIST), "FILE_DOES_NOT_EXIST" }, { static_cast(DataFileType::DUCKDB_FILE), "DUCKDB_FILE" }, { static_cast(DataFileType::SQLITE_FILE), "SQLITE_FILE" }, - { static_cast(DataFileType::PARQUET_FILE), "PARQUET_FILE" } + { static_cast(DataFileType::PARQUET_FILE), "PARQUET_FILE" }, + { static_cast(DataFileType::UNKNOWN_FILE), "UNKNOWN_FILE" } }; return values; } template<> const char* EnumUtil::ToChars(DataFileType value) { - return StringUtil::EnumToString(GetDataFileTypeValues(), 4, "DataFileType", static_cast(value)); + return StringUtil::EnumToString(GetDataFileTypeValues(), 5, "DataFileType", static_cast(value)); } template<> DataFileType EnumUtil::FromString(const char *value) { - return static_cast(StringUtil::StringToEnum(GetDataFileTypeValues(), 4, "DataFileType", value)); + return static_cast(StringUtil::StringToEnum(GetDataFileTypeValues(), 5, "DataFileType", value)); } const StringUtil::EnumStringLiteral *GetDateCastResultValues() { diff --git a/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index a94adb48a..cbed64993 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -749,7 +749,7 @@ bool LineError::HandleErrors(StringValueResult &result) { default: throw InvalidInputException("CSV Error not allowed when inserting row"); } - result.error_handler.Error(csv_error); + result.error_handler.Error(csv_error, result.try_row); } if (is_error_in_line && scan_id != StringValueScanner::LINE_FINDER_ID) { if (result.sniffing) { @@ -777,7 +777,7 @@ void StringValueResult::NullPaddingQuotedNewlineCheck() const { // If we have null_padding set, we found a quoted new line, we are scanning the file in parallel; We error. LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); auto csv_error = CSVError::NullPaddingFail(state_machine.options, lines_per_batch, path); - error_handler.Error(csv_error); + error_handler.Error(csv_error, try_row); } } @@ -847,13 +847,13 @@ bool StringValueResult::AddRowInternal() { state_machine.options, cur_col_id - 1, lines_per_batch, borked_line, current_line_position.begin.GetGlobalPosition(requested_size, first_nl), last_position.GetGlobalPosition(requested_size, first_nl), path); - error_handler.Error(csv_error); + error_handler.Error(csv_error, try_row); } else { auto csv_error = CSVError::IncorrectColumnAmountError( state_machine.options, cur_col_id - 1, lines_per_batch, borked_line, current_line_position.begin.GetGlobalPosition(requested_size, first_nl), last_position.GetGlobalPosition(requested_size), path); - error_handler.Error(csv_error); + error_handler.Error(csv_error, try_row); } } // If we are here we ignore_errors, so we delete this line @@ -966,6 +966,7 @@ StringValueScanner::StringValueScanner(idx_t scanner_idx_p, const shared_ptrskipped_rows; } iterator.buffer_size = state_machine->options.buffer_size_option.GetValue(); + result.try_row = scanner_idx == LINE_FINDER_ID; } StringValueScanner::StringValueScanner(const shared_ptr &buffer_manager, @@ -1710,19 +1711,24 @@ bool StringValueScanner::IsRowValid(CSVIterator ¤t_iterator) const { return false; } constexpr idx_t result_size = 1; - auto scan_finder = make_uniq(StringValueScanner::LINE_FINDER_ID, buffer_manager, - state_machine_strict, make_shared_ptr(), - csv_file_scan, false, current_iterator, result_size); - auto &tuples = scan_finder->ParseChunk(); - current_iterator.pos = scan_finder->GetIteratorPosition(); - bool has_error = false; - if (tuples.current_errors.HasError()) { - if (tuples.current_errors.Size() != 1 || !tuples.current_errors.HasErrorType(MAXIMUM_LINE_SIZE)) { - // We ignore maximum line size errors - has_error = true; - } - } - return (tuples.number_of_rows == 1 || tuples.first_line_is_comment) && !has_error && tuples.borked_rows.empty(); + auto scan_finder = make_uniq(LINE_FINDER_ID, buffer_manager, state_machine_strict, + make_shared_ptr(), csv_file_scan, false, + current_iterator, result_size); + try { + auto &tuples = scan_finder->ParseChunk(); + current_iterator.pos = scan_finder->GetIteratorPosition(); + bool has_error = false; + if (tuples.current_errors.HasError()) { + if (tuples.current_errors.Size() != 1 || !tuples.current_errors.HasErrorType(MAXIMUM_LINE_SIZE)) { + // We ignore maximum line size errors + has_error = true; + } + } + return (tuples.number_of_rows == 1 || tuples.first_line_is_comment) && !has_error && tuples.borked_rows.empty(); + } catch (const Exception &e) { + return false; + } + return true; } ValidRowInfo StringValueScanner::TryRow(CSVState state, idx_t start_pos, idx_t end_pos) const { diff --git a/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp index c1bdda64c..d574acc49 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -60,7 +60,7 @@ void CSVErrorHandler::ThrowError(const CSVError &csv_error) { void CSVErrorHandler::Error(const CSVError &csv_error, bool force_error) { lock_guard parallel_lock(main_mutex); - if ((ignore_errors && !force_error) || (PrintLineNumber(csv_error) && !CanGetLine(csv_error.GetBoundaryIndex()))) { + if (!force_error && (ignore_errors || (PrintLineNumber(csv_error) && !CanGetLine(csv_error.GetBoundaryIndex())))) { // We store this error, we can't throw it now, or we are ignoring it errors.push_back(csv_error); return; diff --git a/src/duckdb/src/execution/operator/schema/physical_attach.cpp b/src/duckdb/src/execution/operator/schema/physical_attach.cpp index d3646f32b..4a6c7cb1c 100644 --- a/src/duckdb/src/execution/operator/schema/physical_attach.cpp +++ b/src/duckdb/src/execution/operator/schema/physical_attach.cpp @@ -62,20 +62,6 @@ SourceResultType PhysicalAttach::GetData(ExecutionContext &context, DataChunk &c } } - string extension = ""; - if (FileSystem::IsRemoteFile(path, extension)) { - if (!ExtensionHelper::TryAutoLoadExtension(context.client, extension)) { - throw MissingExtensionException("Attaching path '%s' requires extension '%s' to be loaded", path, - extension); - } - if (options.access_mode == AccessMode::AUTOMATIC) { - // Attaching of remote files gets bumped to READ_ONLY - // This is due to the fact that on most (all?) remote files writes to DB are not available - // and having this raised later is not super helpful - options.access_mode = AccessMode::READ_ONLY; - } - } - // Get the database type and attach the database. db_manager.GetDatabaseType(context.client, *info, config, options); auto attached_db = db_manager.AttachDatabase(context.client, *info, options); diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 0466de70b..adb9aebf8 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "0-dev3309" +#define DUCKDB_PATCH_VERSION "0-dev3365" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 3 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.3.0-dev3309" +#define DUCKDB_VERSION "v1.3.0-dev3365" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "027bc16ee8" +#define DUCKDB_SOURCE_ID "fda0ba6a7a" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_schema_entry.hpp b/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_schema_entry.hpp index 7936ad44f..c080d0fe3 100644 --- a/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_schema_entry.hpp +++ b/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_schema_entry.hpp @@ -1,7 +1,7 @@ //===----------------------------------------------------------------------===// // DuckDB // -// duckdb/catalog/catalog_entry/dschema_catalog_entry.hpp +// duckdb/catalog/catalog_entry/duck_schema_entry.hpp // // //===----------------------------------------------------------------------===// @@ -70,11 +70,10 @@ class DuckSchemaEntry : public SchemaCatalogEntry { void Verify(Catalog &catalog) override; -private: - void OnDropEntry(CatalogTransaction transaction, CatalogEntry &entry); - -private: //! Get the catalog set for the specified type CatalogSet &GetCatalogSet(CatalogType type); + +private: + void OnDropEntry(CatalogTransaction transaction, CatalogEntry &entry); }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/catalog/catalog_set.hpp b/src/duckdb/src/include/duckdb/catalog/catalog_set.hpp index 27364e773..7c99c7962 100644 --- a/src/duckdb/src/include/duckdb/catalog/catalog_set.hpp +++ b/src/duckdb/src/include/duckdb/catalog/catalog_set.hpp @@ -128,6 +128,9 @@ class CatalogSet { void Verify(Catalog &catalog); + //! Override the default generator - this should not be used after the catalog set has been used + void SetDefaultGenerator(unique_ptr defaults); + private: bool DropDependencies(CatalogTransaction transaction, const string &name, bool cascade, bool allow_drop_internal = false); diff --git a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index 85a798af5..bacabfc4f 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -217,6 +217,9 @@ class StringValueResult : public ScannerResult { bool added_last_line = false; bool quoted_new_line = false; + //! If we are trying a row or not when figuring out the next row to start from. + bool try_row = false; + unsafe_unique_array parse_types; vector names; @@ -376,7 +379,7 @@ class StringValueScanner : public BaseScanner { idx_t start_pos; //! Pointer to the previous buffer handle, necessary for over-buffer values shared_ptr previous_buffer_handle; - //! Strict state machine, is basically a state machine with rfc 4180 set to true, used to figure out new line. + //! Strict state machine is basically a state machine with rfc 4180 set to true, used to figure out a new line. shared_ptr state_machine_strict; }; diff --git a/src/duckdb/src/include/duckdb/main/attached_database.hpp b/src/duckdb/src/include/duckdb/main/attached_database.hpp index 218263b02..eb7d62149 100644 --- a/src/duckdb/src/include/duckdb/main/attached_database.hpp +++ b/src/duckdb/src/include/duckdb/main/attached_database.hpp @@ -56,11 +56,10 @@ class AttachedDatabase : public CatalogEntry { //! Create the built-in system database (without storage). explicit AttachedDatabase(DatabaseInstance &db, AttachedDatabaseType type = AttachedDatabaseType::SYSTEM_DATABASE); //! Create an attached database instance with the specified name and storage. - AttachedDatabase(DatabaseInstance &db, Catalog &catalog, string name, string file_path, - const AttachOptions &options); + AttachedDatabase(DatabaseInstance &db, Catalog &catalog, string name, string file_path, AttachOptions &options); //! Create an attached database instance with the specified storage extension. AttachedDatabase(DatabaseInstance &db, Catalog &catalog, StorageExtension &ext, ClientContext &context, string name, - const AttachInfo &info, const AttachOptions &options); + AttachInfo &info, AttachOptions &options); ~AttachedDatabase() override; //! Initializes the catalog and storage of the attached database. diff --git a/src/duckdb/src/include/duckdb/main/database.hpp b/src/duckdb/src/include/duckdb/main/database.hpp index e00c48a82..5dfc68fa7 100644 --- a/src/duckdb/src/include/duckdb/main/database.hpp +++ b/src/duckdb/src/include/duckdb/main/database.hpp @@ -75,8 +75,8 @@ class DatabaseInstance : public enable_shared_from_this { DUCKDB_API SettingLookupResult TryGetCurrentSetting(const string &key, Value &result) const; - unique_ptr CreateAttachedDatabase(ClientContext &context, const AttachInfo &info, - const AttachOptions &options); + unique_ptr CreateAttachedDatabase(ClientContext &context, AttachInfo &info, + AttachOptions &options); void AddExtensionInfo(const string &name, const ExtensionLoadedInfo &info); diff --git a/src/duckdb/src/include/duckdb/main/database_manager.hpp b/src/duckdb/src/include/duckdb/main/database_manager.hpp index 9b18c5d18..e6eb55a32 100644 --- a/src/duckdb/src/include/duckdb/main/database_manager.hpp +++ b/src/duckdb/src/include/duckdb/main/database_manager.hpp @@ -46,8 +46,7 @@ class DatabaseManager { //! Get an attached database by its name optional_ptr GetDatabase(ClientContext &context, const string &name); //! Attach a new database - optional_ptr AttachDatabase(ClientContext &context, const AttachInfo &info, - const AttachOptions &options); + optional_ptr AttachDatabase(ClientContext &context, AttachInfo &info, AttachOptions &options); //! Detach an existing database void DetachDatabase(ClientContext &context, const string &name, OnEntryNotFound if_not_found); //! Returns a reference to the system catalog diff --git a/src/duckdb/src/include/duckdb/parser/parsed_data/copy_info.hpp b/src/duckdb/src/include/duckdb/parser/parsed_data/copy_info.hpp index 46cb8358c..d3ed005db 100644 --- a/src/duckdb/src/include/duckdb/parser/parsed_data/copy_info.hpp +++ b/src/duckdb/src/include/duckdb/parser/parsed_data/copy_info.hpp @@ -13,6 +13,7 @@ #include "duckdb/common/unordered_map.hpp" #include "duckdb/common/types/value.hpp" #include "duckdb/common/case_insensitive_map.hpp" +#include "duckdb/parser/query_node.hpp" namespace duckdb { @@ -23,7 +24,7 @@ struct CopyInfo : public ParseInfo { static constexpr const ParseInfoType TYPE = ParseInfoType::COPY_INFO; public: - CopyInfo() : ParseInfo(TYPE), catalog(INVALID_CATALOG), schema(DEFAULT_SCHEMA) { + CopyInfo() : ParseInfo(TYPE), catalog(INVALID_CATALOG), schema(DEFAULT_SCHEMA), is_format_auto_detected(true) { } //! The catalog name to copy to/from @@ -38,15 +39,18 @@ struct CopyInfo : public ParseInfo { bool is_from; //! The file format of the external file string format; + //! If the format is manually set (i.e., via the format parameter) or was discovered by inspecting the file path + bool is_format_auto_detected; //! The file path to copy to/from string file_path; //! Set of (key, value) options case_insensitive_map_t> options; - // The SQL statement used instead of a table when copying data out to a file + //! The SQL statement used instead of a table when copying data out to a file unique_ptr select_statement; public: - static string CopyOptionsToString(const string &format, const case_insensitive_map_t> &options); + static string CopyOptionsToString(const string &format, bool is_format_auto_detected, + const case_insensitive_map_t> &options); public: unique_ptr Copy() const; diff --git a/src/duckdb/src/include/duckdb/storage/magic_bytes.hpp b/src/duckdb/src/include/duckdb/storage/magic_bytes.hpp index b6f7f68da..7e3a57881 100644 --- a/src/duckdb/src/include/duckdb/storage/magic_bytes.hpp +++ b/src/duckdb/src/include/duckdb/storage/magic_bytes.hpp @@ -17,7 +17,8 @@ enum class DataFileType : uint8_t { FILE_DOES_NOT_EXIST, // file does not exist DUCKDB_FILE, // duckdb database file SQLITE_FILE, // sqlite database file - PARQUET_FILE // parquet file + PARQUET_FILE, // parquet file + UNKNOWN_FILE // unknown file type }; class MagicBytes { diff --git a/src/duckdb/src/include/duckdb/storage/storage_extension.hpp b/src/duckdb/src/include/duckdb/storage/storage_extension.hpp index eece0b096..62d5a8028 100644 --- a/src/duckdb/src/include/duckdb/storage/storage_extension.hpp +++ b/src/duckdb/src/include/duckdb/storage/storage_extension.hpp @@ -49,4 +49,8 @@ class StorageExtension { } }; +struct OpenFileStorageExtension { + static unique_ptr Create(); +}; + } // namespace duckdb diff --git a/src/duckdb/src/main/attached_database.cpp b/src/duckdb/src/main/attached_database.cpp index 46809cc18..88ce0aedf 100644 --- a/src/duckdb/src/main/attached_database.cpp +++ b/src/duckdb/src/main/attached_database.cpp @@ -84,7 +84,7 @@ AttachedDatabase::AttachedDatabase(DatabaseInstance &db, AttachedDatabaseType ty } AttachedDatabase::AttachedDatabase(DatabaseInstance &db, Catalog &catalog_p, string name_p, string file_path_p, - const AttachOptions &options) + AttachOptions &options) : CatalogEntry(CatalogType::DATABASE_ENTRY, catalog_p, std::move(name_p)), db(db), parent_catalog(&catalog_p) { if (options.access_mode == AccessMode::READ_ONLY) { @@ -116,19 +116,17 @@ AttachedDatabase::AttachedDatabase(DatabaseInstance &db, Catalog &catalog_p, str } AttachedDatabase::AttachedDatabase(DatabaseInstance &db, Catalog &catalog_p, StorageExtension &storage_extension_p, - ClientContext &context, string name_p, const AttachInfo &info, - const AttachOptions &options) + ClientContext &context, string name_p, AttachInfo &info, AttachOptions &options) : CatalogEntry(CatalogType::DATABASE_ENTRY, catalog_p, std::move(name_p)), db(db), parent_catalog(&catalog_p), storage_extension(&storage_extension_p) { + StorageExtensionInfo *storage_info = storage_extension->storage_info.get(); + catalog = storage_extension->attach(storage_info, context, *this, name, info, options.access_mode); if (options.access_mode == AccessMode::READ_ONLY) { type = AttachedDatabaseType::READ_ONLY_DATABASE; } else { type = AttachedDatabaseType::READ_WRITE_DATABASE; } - - StorageExtensionInfo *storage_info = storage_extension->storage_info.get(); - catalog = storage_extension->attach(storage_info, context, *this, name, *info.Copy(), options.access_mode); if (!catalog) { throw InternalException("AttachedDatabase - attach function did not return a catalog"); } diff --git a/src/duckdb/src/main/database.cpp b/src/duckdb/src/main/database.cpp index 7e0e52d4e..ea8b0d7ab 100644 --- a/src/duckdb/src/main/database.cpp +++ b/src/duckdb/src/main/database.cpp @@ -49,6 +49,7 @@ DBConfig::DBConfig() { error_manager = make_uniq(); secret_manager = make_uniq(); http_util = make_uniq(); + storage_extensions["__open_file__"] = OpenFileStorageExtension::Create(); } DBConfig::DBConfig(bool read_only) : DBConfig::DBConfig() { @@ -171,8 +172,8 @@ ConnectionManager &ConnectionManager::Get(ClientContext &context) { return ConnectionManager::Get(DatabaseInstance::GetDatabase(context)); } -unique_ptr DatabaseInstance::CreateAttachedDatabase(ClientContext &context, const AttachInfo &info, - const AttachOptions &options) { +unique_ptr DatabaseInstance::CreateAttachedDatabase(ClientContext &context, AttachInfo &info, + AttachOptions &options) { unique_ptr attached_database; auto &catalog = Catalog::GetSystemCatalog(*this); @@ -314,12 +315,18 @@ void DatabaseInstance::Initialize(const char *database_path, DBConfig *user_conf // initialize the system catalog db_manager->InitializeSystemCatalog(); + if (config.options.database_type == "duckdb") { + config.options.database_type = string(); + } if (!config.options.database_type.empty()) { // if we are opening an extension database - load the extension if (!config.file_system) { throw InternalException("No file system!?"); } - ExtensionHelper::LoadExternalExtension(*this, *config.file_system, config.options.database_type); + auto entry = config.storage_extensions.find(config.options.database_type); + if (entry == config.storage_extensions.end()) { + ExtensionHelper::LoadExternalExtension(*this, *config.file_system, config.options.database_type); + } } LoadExtensionSettings(); diff --git a/src/duckdb/src/main/database_manager.cpp b/src/duckdb/src/main/database_manager.cpp index c1ac5c4fa..70b3a3207 100644 --- a/src/duckdb/src/main/database_manager.cpp +++ b/src/duckdb/src/main/database_manager.cpp @@ -39,11 +39,25 @@ optional_ptr DatabaseManager::GetDatabase(ClientContext &conte return reinterpret_cast(databases->GetEntry(context, name).get()); } -optional_ptr DatabaseManager::AttachDatabase(ClientContext &context, const AttachInfo &info, - const AttachOptions &options) { +optional_ptr DatabaseManager::AttachDatabase(ClientContext &context, AttachInfo &info, + AttachOptions &options) { if (AttachedDatabase::NameIsReserved(info.name)) { throw BinderException("Attached database name \"%s\" cannot be used because it is a reserved name", info.name); } + string extension = ""; + if (FileSystem::IsRemoteFile(info.path, extension)) { + if (!ExtensionHelper::TryAutoLoadExtension(context, extension)) { + throw MissingExtensionException("Attaching path '%s' requires extension '%s' to be loaded", info.path, + extension); + } + if (options.access_mode == AccessMode::AUTOMATIC) { + // Attaching of remote files gets bumped to READ_ONLY + // This is due to the fact that on most (all?) remote files writes to DB are not available + // and having this raised later is not super helpful + options.access_mode = AccessMode::READ_ONLY; + } + } + // now create the attached database auto &db = DatabaseInstance::GetDatabase(context); auto attached_db = db.CreateAttachedDatabase(context, info, options); diff --git a/src/duckdb/src/main/database_path_and_type.cpp b/src/duckdb/src/main/database_path_and_type.cpp index a231b4e27..80cc5a03c 100644 --- a/src/duckdb/src/main/database_path_and_type.cpp +++ b/src/duckdb/src/main/database_path_and_type.cpp @@ -2,6 +2,7 @@ #include "duckdb/main/extension_helper.hpp" #include "duckdb/storage/magic_bytes.hpp" +#include "duckdb/function/replacement_scan.hpp" namespace duckdb { @@ -17,10 +18,23 @@ void DBPathAndType::ExtractExtensionPrefix(string &path, string &db_type) { void DBPathAndType::CheckMagicBytes(FileSystem &fs, string &path, string &db_type) { // if there isn't - check the magic bytes of the file (if any) auto file_type = MagicBytes::CheckMagicBytes(fs, path); - if (file_type == DataFileType::SQLITE_FILE) { + db_type = string(); + switch (file_type) { + case DataFileType::SQLITE_FILE: db_type = "sqlite"; - } else { - db_type = ""; + break; + case DataFileType::PARQUET_FILE: + case DataFileType::UNKNOWN_FILE: { + // FIXME: we should get this from the registered replacement scans instead of hardcoding it here + vector supported_suffixes {"parquet", "csv", "json", "jsonl", "ndjson"}; + if (ReplacementScan::CanReplace(path, supported_suffixes)) { + db_type = "__open_file__"; + break; + } + break; + } + default: + break; } } diff --git a/src/duckdb/src/main/profiling_info.cpp b/src/duckdb/src/main/profiling_info.cpp index acca8c46c..644d342e6 100644 --- a/src/duckdb/src/main/profiling_info.cpp +++ b/src/duckdb/src/main/profiling_info.cpp @@ -39,6 +39,8 @@ ProfilingInfo::ProfilingInfo(const profiler_settings_t &n_settings, const idx_t profiler_settings_t ProfilingInfo::DefaultSettings() { return {MetricsType::QUERY_NAME, MetricsType::BLOCKED_THREAD_TIME, + MetricsType::SYSTEM_PEAK_BUFFER_MEMORY, + MetricsType::SYSTEM_PEAK_TEMP_DIR_SIZE, MetricsType::CPU_TIME, MetricsType::EXTRA_INFO, MetricsType::CUMULATIVE_CARDINALITY, diff --git a/src/duckdb/src/optimizer/deliminator.cpp b/src/duckdb/src/optimizer/deliminator.cpp index 254a57026..0d24635f6 100644 --- a/src/duckdb/src/optimizer/deliminator.cpp +++ b/src/duckdb/src/optimizer/deliminator.cpp @@ -349,6 +349,17 @@ bool Deliminator::RemoveInequalityJoinWithDelimGet(LogicalComparisonJoin &delim_ } } delim_condition.comparison = FlipComparisonExpression(join_comparison); + // join condition was a not equal and filtered out all NULLS. + // DELIM JOIN need to do that for not DELIM_GET side. Easiest way is to change the + // comparison expression type. See duckdb/duckdb#16803 + if (delim_join.join_type != JoinType::MARK) { + if (delim_condition.comparison == ExpressionType::COMPARE_DISTINCT_FROM) { + delim_condition.comparison = ExpressionType::COMPARE_NOTEQUAL; + } + if (delim_condition.comparison == ExpressionType::COMPARE_NOT_DISTINCT_FROM) { + delim_condition.comparison = ExpressionType::COMPARE_EQUAL; + } + } found = true; break; } diff --git a/src/duckdb/src/parser/parsed_data/copy_info.cpp b/src/duckdb/src/parser/parsed_data/copy_info.cpp index c46263a7f..9ba206579 100644 --- a/src/duckdb/src/parser/parsed_data/copy_info.cpp +++ b/src/duckdb/src/parser/parsed_data/copy_info.cpp @@ -12,6 +12,7 @@ unique_ptr CopyInfo::Copy() const { result->file_path = file_path; result->is_from = is_from; result->format = format; + result->is_format_auto_detected = is_format_auto_detected; result->options = options; if (select_statement) { result->select_statement = select_statement->Copy(); @@ -19,15 +20,18 @@ unique_ptr CopyInfo::Copy() const { return result; } -string CopyInfo::CopyOptionsToString(const string &format, const case_insensitive_map_t> &options) { - if (format.empty() && options.empty()) { +string CopyInfo::CopyOptionsToString(const string &format, bool is_format_auto_detected, + const case_insensitive_map_t> &options) { + // We only output the format if there is a format, and it was manually set. + const bool output_format = !format.empty() && !is_format_auto_detected; + if (!output_format && options.empty()) { return string(); } string result; result += " ("; vector stringified; - if (!format.empty()) { + if (!format.empty() && !is_format_auto_detected) { stringified.push_back(StringUtil::Format(" FORMAT %s", format)); } for (auto &opt : options) { @@ -81,7 +85,7 @@ string CopyInfo::ToString() const { result += TablePartToString(); result += " FROM"; result += StringUtil::Format(" %s", SQLString(file_path)); - result += CopyOptionsToString(format, options); + result += CopyOptionsToString(format, is_format_auto_detected, options); } else { if (select_statement) { // COPY (select-node) TO ... @@ -91,7 +95,7 @@ string CopyInfo::ToString() const { } result += " TO "; result += StringUtil::Format("%s", SQLString(file_path)); - result += CopyOptionsToString(format, options); + result += CopyOptionsToString(format, is_format_auto_detected, options); } result += ";"; return result; diff --git a/src/duckdb/src/parser/statement/export_statement.cpp b/src/duckdb/src/parser/statement/export_statement.cpp index 2b2e0c831..560c117ac 100644 --- a/src/duckdb/src/parser/statement/export_statement.cpp +++ b/src/duckdb/src/parser/statement/export_statement.cpp @@ -27,7 +27,7 @@ string ExportStatement::ToString() const { auto &options = info->options; auto &format = info->format; result += StringUtil::Format(" '%s'", path); - result += CopyInfo::CopyOptionsToString(format, options); + result += CopyInfo::CopyOptionsToString(format, info->is_format_auto_detected, options); result += ";"; return result; } diff --git a/src/duckdb/src/parser/transform/statement/transform_copy.cpp b/src/duckdb/src/parser/transform/statement/transform_copy.cpp index 7c199ac5b..7dd2a164f 100644 --- a/src/duckdb/src/parser/transform/statement/transform_copy.cpp +++ b/src/duckdb/src/parser/transform/statement/transform_copy.cpp @@ -68,6 +68,7 @@ void Transformer::TransformCopyOptions(CopyInfo &info, optional_ptrval.str); + info.is_format_auto_detected = false; continue; } @@ -77,6 +78,24 @@ void Transformer::TransformCopyOptions(CopyInfo &info, optional_ptr Transformer::TransformCopy(duckdb_libpgquery::PGCopyStmt &stmt) { auto result = make_uniq(); auto &info = *result->info; @@ -91,13 +110,7 @@ unique_ptr Transformer::TransformCopy(duckdb_libpgquery::PGCopySt info.file_path = stmt.filename; } - if (ReplacementScan::CanReplace(info.file_path, {"parquet"})) { - info.format = "parquet"; - } else if (ReplacementScan::CanReplace(info.file_path, {"json", "jsonl", "ndjson"})) { - info.format = "json"; - } else { - info.format = "csv"; - } + info.format = ExtractFormat(info.file_path); // get select_list if (stmt.attlist) { diff --git a/src/duckdb/src/planner/binder/statement/bind_copy.cpp b/src/duckdb/src/planner/binder/statement/bind_copy.cpp index 952ddb41e..2576c9867 100644 --- a/src/duckdb/src/planner/binder/statement/bind_copy.cpp +++ b/src/duckdb/src/planner/binder/statement/bind_copy.cpp @@ -22,16 +22,42 @@ #include +#include "duckdb/main/extension_entries.hpp" + namespace duckdb { static bool GetBooleanArg(ClientContext &context, const vector &arg) { return arg.empty() || arg[0].CastAs(context, LogicalType::BOOLEAN).GetValue(); } +void IsFormatExtensionKnown(const string &format) { + for (auto &file_postfixes : EXTENSION_FILE_POSTFIXES) { + if (format == file_postfixes.name + 1) { + // It's a match, we must throw + throw CatalogException( + "Copy Function with name \"%s\" is not in the catalog, but it exists in the %s extension.", format, + file_postfixes.extension); + } + } +} + BoundStatement Binder::BindCopyTo(CopyStatement &stmt, CopyToType copy_to_type) { + // Let's first bind our format + auto on_entry_do = + stmt.info->is_format_auto_detected ? OnEntryNotFound::RETURN_NULL : OnEntryNotFound::THROW_EXCEPTION; + CatalogEntryRetriever entry_retriever {context}; + auto &catalog = Catalog::GetSystemCatalog(context); + auto entry = catalog.GetEntry(entry_retriever, DEFAULT_SCHEMA, + {CatalogType::COPY_FUNCTION_ENTRY, stmt.info->format}, on_entry_do); + + if (!entry) { + IsFormatExtensionKnown(stmt.info->format); + // If we did not find an entry, we default to a CSV + entry = catalog.GetEntry(entry_retriever, DEFAULT_SCHEMA, {CatalogType::COPY_FUNCTION_ENTRY, "csv"}, + OnEntryNotFound::THROW_EXCEPTION); + } // lookup the format in the catalog - auto ©_function = - Catalog::GetEntry(context, INVALID_CATALOG, DEFAULT_SCHEMA, stmt.info->format); + auto ©_function = entry->Cast(); if (copy_function.function.plan) { // plan rewrite COPY TO return copy_function.function.plan(*this, stmt); @@ -326,7 +352,19 @@ BoundStatement Binder::BindCopyFrom(CopyStatement &stmt) { // lookup the format in the catalog auto &catalog = Catalog::GetSystemCatalog(context); - auto ©_function = catalog.GetEntry(context, DEFAULT_SCHEMA, stmt.info->format); + auto on_entry_do = + stmt.info->is_format_auto_detected ? OnEntryNotFound::RETURN_NULL : OnEntryNotFound::THROW_EXCEPTION; + CatalogEntryRetriever entry_retriever {context}; + auto entry = catalog.GetEntry(entry_retriever, DEFAULT_SCHEMA, + {CatalogType::COPY_FUNCTION_ENTRY, stmt.info->format}, on_entry_do); + if (!entry) { + IsFormatExtensionKnown(stmt.info->format); + // If we did not find an entry, we default to a CSV + entry = catalog.GetEntry(entry_retriever, DEFAULT_SCHEMA, {CatalogType::COPY_FUNCTION_ENTRY, "csv"}, + OnEntryNotFound::THROW_EXCEPTION); + } + // lookup the format in the catalog + auto ©_function = entry->Cast(); if (!copy_function.function.copy_from_bind) { throw NotImplementedException("COPY FROM is not supported for FORMAT \"%s\"", stmt.info->format); } diff --git a/src/duckdb/src/storage/magic_bytes.cpp b/src/duckdb/src/storage/magic_bytes.cpp index 602e7ffef..a4f32896f 100644 --- a/src/duckdb/src/storage/magic_bytes.cpp +++ b/src/duckdb/src/storage/magic_bytes.cpp @@ -26,7 +26,7 @@ DataFileType MagicBytes::CheckMagicBytes(FileSystem &fs, const string &path) { if (memcmp(buffer + MainHeader::MAGIC_BYTE_OFFSET, MainHeader::MAGIC_BYTES, MainHeader::MAGIC_BYTE_SIZE) == 0) { return DataFileType::DUCKDB_FILE; } - return DataFileType::FILE_DOES_NOT_EXIST; + return DataFileType::UNKNOWN_FILE; } } // namespace duckdb diff --git a/src/duckdb/src/storage/open_file_storage_extension.cpp b/src/duckdb/src/storage/open_file_storage_extension.cpp new file mode 100644 index 000000000..125f65b26 --- /dev/null +++ b/src/duckdb/src/storage/open_file_storage_extension.cpp @@ -0,0 +1,81 @@ +#include "duckdb/storage/storage_extension.hpp" +#include "duckdb/catalog/duck_catalog.hpp" +#include "duckdb/transaction/duck_transaction_manager.hpp" +#include "duckdb/catalog/default/default_views.hpp" +#include "duckdb/catalog/catalog_entry/duck_schema_entry.hpp" +#include "duckdb/catalog/catalog_entry/view_catalog_entry.hpp" + +namespace duckdb { + +class OpenFileDefaultGenerator : public DefaultGenerator { +public: + OpenFileDefaultGenerator(Catalog &catalog, SchemaCatalogEntry &schema, const case_insensitive_set_t &view_names_p, + string file_p) + : DefaultGenerator(catalog), schema(schema), file(std::move(file_p)) { + for (auto &view_name : view_names_p) { + view_names.push_back(view_name); + } + } + +public: + unique_ptr CreateDefaultEntry(ClientContext &context, const string &entry_name) override { + for (auto &entry : view_names) { + if (StringUtil::CIEquals(entry_name, entry)) { + auto result = make_uniq(); + result->schema = DEFAULT_SCHEMA; + result->view_name = entry; + result->sql = StringUtil::Format("SELECT * FROM %s", SQLString(file)); + auto view_info = CreateViewInfo::FromSelect(context, std::move(result)); + return make_uniq_base(catalog, schema, *view_info); + } + } + return nullptr; + } + + vector GetDefaultEntries() override { + return view_names; + } + +private: + SchemaCatalogEntry &schema; + vector view_names; + string file; +}; + +unique_ptr OpenFileStorageAttach(StorageExtensionInfo *storage_info, ClientContext &context, + AttachedDatabase &db, const string &name, AttachInfo &info, + AccessMode access_mode) { + auto file = info.path; + // open an in-memory database + info.path = ":memory:"; + auto catalog = make_uniq(db); + catalog->Initialize(false); + + case_insensitive_set_t view_names; + view_names.insert("file"); + view_names.insert(name); + + // set up the default view generator for "file" and the derived name of the file + auto system_transaction = CatalogTransaction::GetSystemTransaction(db.GetDatabase()); + auto &schema = catalog->GetSchema(system_transaction, DEFAULT_SCHEMA); + auto &duck_schema = schema.Cast(); + auto &catalog_set = duck_schema.GetCatalogSet(CatalogType::VIEW_ENTRY); + auto default_generator = make_uniq(*catalog, schema, view_names, std::move(file)); + catalog_set.SetDefaultGenerator(std::move(default_generator)); + + return std::move(catalog); +} + +unique_ptr OpenFileStorageTransactionManager(StorageExtensionInfo *storage_info, + AttachedDatabase &db, Catalog &catalog) { + return make_uniq(db); +} + +unique_ptr OpenFileStorageExtension::Create() { + auto result = make_uniq(); + result->attach = OpenFileStorageAttach; + result->create_transaction_manager = OpenFileStorageTransactionManager; + return result; +} + +} // namespace duckdb diff --git a/src/duckdb/src/storage/serialization/serialize_parse_info.cpp b/src/duckdb/src/storage/serialization/serialize_parse_info.cpp index 7a9d7b3f9..01bf5bfd5 100644 --- a/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +++ b/src/duckdb/src/storage/serialization/serialize_parse_info.cpp @@ -338,6 +338,7 @@ void CopyInfo::Serialize(Serializer &serializer) const { serializer.WritePropertyWithDefault(206, "file_path", file_path); serializer.WritePropertyWithDefault>>(207, "options", options); serializer.WritePropertyWithDefault>(208, "select_statement", select_statement); + serializer.WritePropertyWithDefault(209, "is_format_auto_detected", is_format_auto_detected); } unique_ptr CopyInfo::Deserialize(Deserializer &deserializer) { @@ -351,6 +352,7 @@ unique_ptr CopyInfo::Deserialize(Deserializer &deserializer) { deserializer.ReadPropertyWithDefault(206, "file_path", result->file_path); deserializer.ReadPropertyWithDefault>>(207, "options", result->options); deserializer.ReadPropertyWithDefault>(208, "select_statement", result->select_statement); + deserializer.ReadPropertyWithDefault(209, "is_format_auto_detected", result->is_format_auto_detected); return std::move(result); } diff --git a/src/duckdb/ub_src_storage.cpp b/src/duckdb/ub_src_storage.cpp index 75495b6c7..43ea1eb7a 100644 --- a/src/duckdb/ub_src_storage.cpp +++ b/src/duckdb/ub_src_storage.cpp @@ -32,6 +32,8 @@ #include "src/storage/write_ahead_log.cpp" +#include "src/storage/open_file_storage_extension.cpp" + #include "src/storage/optimistic_data_writer.cpp" #include "src/storage/partial_block_manager.cpp"