diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5eb26e06b..c60f42b1d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,12 +41,24 @@ jobs: name: AMD64 Ubuntu 24.04 runs-on: ubuntu-24.04 timeout-minutes: 30 + strategy: + fail-fast: false + env: + ICEBERG_TEST_S3_URI: s3://iceberg-test + AWS_ACCESS_KEY_ID: minio + AWS_SECRET_ACCESS_KEY: minio123 + AWS_DEFAULT_REGION: us-east-1 + AWS_ENDPOINT_URL: http://127.0.0.1:9000 + AWS_EC2_METADATA_DISABLED: "TRUE" steps: - name: Checkout iceberg-cpp uses: actions/checkout@v6 - name: Install dependencies shell: bash run: sudo apt-get update && sudo apt-get install -y libcurl4-openssl-dev + - name: Start MinIO + shell: bash + run: bash ci/scripts/start_minio.sh - name: Build Iceberg shell: bash env: @@ -63,9 +75,21 @@ jobs: name: AArch64 macOS 26 runs-on: macos-26 timeout-minutes: 30 + strategy: + fail-fast: false + env: + ICEBERG_TEST_S3_URI: s3://iceberg-test + AWS_ACCESS_KEY_ID: minio + AWS_SECRET_ACCESS_KEY: minio123 + AWS_DEFAULT_REGION: us-east-1 + AWS_ENDPOINT_URL: http://127.0.0.1:9000 + AWS_EC2_METADATA_DISABLED: "TRUE" steps: - name: Checkout iceberg-cpp uses: actions/checkout@v6 + - name: Start MinIO + shell: bash + run: bash ci/scripts/start_minio.sh - name: Build Iceberg shell: bash run: ci/scripts/build_iceberg.sh $(pwd) @@ -76,6 +100,15 @@ jobs: name: AMD64 Windows 2025 runs-on: windows-2025 timeout-minutes: 60 + strategy: + fail-fast: false + env: + ICEBERG_TEST_S3_URI: s3://iceberg-test + AWS_ACCESS_KEY_ID: minio + AWS_SECRET_ACCESS_KEY: minio123 + AWS_DEFAULT_REGION: us-east-1 + AWS_ENDPOINT_URL: http://127.0.0.1:9000 + AWS_EC2_METADATA_DISABLED: "TRUE" steps: - name: Checkout iceberg-cpp uses: actions/checkout@v6 @@ -85,6 +118,9 @@ jobs: vcpkg install zlib:x64-windows nlohmann-json:x64-windows nanoarrow:x64-windows roaring:x64-windows cpr:x64-windows - name: Setup sccache uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # 
v0.0.9 + - name: Start MinIO + shell: bash + run: bash ci/scripts/start_minio.sh - name: Build Iceberg shell: cmd env: diff --git a/CMakeLists.txt b/CMakeLists.txt index e7281fb11..8647a3c64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,6 +45,7 @@ option(ICEBERG_BUILD_TESTS "Build tests" ON) option(ICEBERG_BUILD_BUNDLE "Build the battery included library" ON) option(ICEBERG_BUILD_REST "Build rest catalog client" ON) option(ICEBERG_BUILD_REST_INTEGRATION_TESTS "Build rest catalog integration tests" OFF) +option(ICEBERG_S3 "Build with S3 support" ON) option(ICEBERG_ENABLE_ASAN "Enable Address Sanitizer" OFF) option(ICEBERG_ENABLE_UBSAN "Enable Undefined Behavior Sanitizer" OFF) diff --git a/ci/scripts/start_minio.sh b/ci/scripts/start_minio.sh new file mode 100644 index 000000000..219990d3f --- /dev/null +++ b/ci/scripts/start_minio.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
# Start a local MinIO server for the S3 integration tests and create the test
# bucket.
#
# Docker is preferred. On macOS and Windows the script falls back to a native
# MinIO binary when Docker is missing or cannot run the image. Every MINIO_*
# setting below can be overridden through the environment.

set -eux

MINIO_ROOT_USER="${MINIO_ROOT_USER:-minio}"
MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-minio123}"
MINIO_IMAGE="${MINIO_IMAGE:-minio/minio:latest}"
MINIO_CONTAINER_NAME="${MINIO_CONTAINER_NAME:-iceberg-minio}"
MINIO_PORT="${MINIO_PORT:-9000}"
MINIO_CONSOLE_PORT="${MINIO_CONSOLE_PORT:-9001}"
MINIO_BUCKET="${MINIO_BUCKET:-iceberg-test}"
MINIO_ENDPOINT="${MINIO_ENDPOINT:-http://127.0.0.1:${MINIO_PORT}}"

# Download URL ($1) to file ($2). -f makes curl fail on HTTP errors instead of
# saving the error page as if it were the requested binary.
download_file() {
  local url="$1"
  local dest="$2"
  curl -fsSL --retry 3 "${url}" -o "${dest}"
}

# Poll the MinIO readiness endpoint once per second, up to 30 times.
# Returns 0 as soon as the server answers, 1 if it never does.
wait_for_minio() {
  for _ in {1..30}; do
    if curl -fsS --max-time 5 "${MINIO_ENDPOINT}/minio/health/ready" >/dev/null; then
      return 0
    fi
    sleep 1
  done
  echo "MinIO did not become ready at ${MINIO_ENDPOINT}" >&2
  return 1
}

# Start MinIO in a Docker container. Returns 1 when Docker is unavailable or
# the container cannot be started, so callers can fall back to a native binary.
start_minio_docker() {
  if ! command -v docker >/dev/null 2>&1; then
    return 1
  fi

  # Remove a stale container left over from a previous run.
  if docker ps -a --format '{{.Names}}' | grep -q "^${MINIO_CONTAINER_NAME}\$"; then
    docker rm -f "${MINIO_CONTAINER_NAME}"
  fi

  # Inside the container the API listens on 9000 and the console on 9001; the
  # host-side ports are what MINIO_PORT / MINIO_CONSOLE_PORT select. The
  # console address must therefore match the container side of the mapping.
  # "|| return 1" matters when this function runs inside "if ! ...", where
  # errexit is suppressed: without it a failed "docker run" would still wait
  # for the full readiness timeout before falling back.
  docker run -d --name "${MINIO_CONTAINER_NAME}" \
    -p "${MINIO_PORT}:9000" -p "${MINIO_CONSOLE_PORT}:9001" \
    -e "MINIO_ROOT_USER=${MINIO_ROOT_USER}" \
    -e "MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}" \
    "${MINIO_IMAGE}" \
    server /data --console-address ":9001" || return 1

  wait_for_minio
}

# Start a Homebrew-installed MinIO server in the background (macOS fallback).
start_minio_macos() {
  if ! command -v brew >/dev/null 2>&1; then
    echo "brew is required to start MinIO on macOS without Docker" >&2
    return 1
  fi

  brew install minio
  mkdir -p /tmp/minio
  # --address keeps the API port in sync with the endpoint that is polled.
  MINIO_ROOT_USER="${MINIO_ROOT_USER}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD}" \
    minio server /tmp/minio --address ":${MINIO_PORT}" \
    --console-address ":${MINIO_CONSOLE_PORT}" &
  wait_for_minio
}

# Download the MinIO client for the current OS/architecture and export its
# location in MC_BIN.
download_mc() {
  local mc_dir="${RUNNER_TEMP:-/tmp}"
  local uname_out
  uname_out="$(uname -s)"
  local platform
  local binary="mc"

  case "${uname_out}" in
    Linux*)
      case "$(uname -m)" in
        aarch64|arm64) platform="linux-arm64" ;;
        *) platform="linux-amd64" ;;
      esac
      ;;
    Darwin*)
      if [ "$(uname -m)" = "arm64" ]; then
        platform="darwin-arm64"
      else
        platform="darwin-amd64"
      fi
      ;;
    MINGW*|MSYS*|CYGWIN*)
      platform="windows-amd64"
      binary="mc.exe"
      ;;
    *)
      echo "Unsupported OS for mc: ${uname_out}" >&2
      return 1
      ;;
  esac

  mkdir -p "${mc_dir}"
  MC_BIN="${mc_dir}/${binary}"
  download_file "https://dl.min.io/client/mc/release/${platform}/${binary}" "${MC_BIN}"
  chmod +x "${MC_BIN}"
}

# Register the server under the "local" alias and create the test bucket.
create_bucket() {
  download_mc

  local alias_ready=false
  for _ in {1..30}; do
    if "${MC_BIN}" alias set local "${MINIO_ENDPOINT}" "${MINIO_ROOT_USER}" "${MINIO_ROOT_PASSWORD}"; then
      alias_ready=true
      break
    fi
    sleep 1
  done
  if [ "${alias_ready}" != true ]; then
    echo "Could not configure mc alias for ${MINIO_ENDPOINT}" >&2
    return 1
  fi

  "${MC_BIN}" mb --ignore-existing "local/${MINIO_BUCKET}"
}

# Download and start a native MinIO server in the background (Windows fallback).
start_minio_windows() {
  local minio_dir="${RUNNER_TEMP:-/tmp}"
  local minio_bin="${minio_dir}/minio.exe"
  mkdir -p "${minio_dir}/minio-data"
  download_file "https://dl.min.io/server/minio/release/windows-amd64/minio.exe" "${minio_bin}"
  # --address keeps the API port in sync with the endpoint that is polled.
  MINIO_ROOT_USER="${MINIO_ROOT_USER}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD}" \
    "${minio_bin}" server "${minio_dir}/minio-data" --address ":${MINIO_PORT}" \
    --console-address ":${MINIO_CONSOLE_PORT}" &
  wait_for_minio
}

case "$(uname -s)" in
  Darwin*)
    if ! start_minio_docker; then
      start_minio_macos
    fi
    ;;
  MINGW*|MSYS*|CYGWIN*)
    if ! start_minio_docker; then
      start_minio_windows
    fi
    ;;
  Linux*)
    start_minio_docker
    ;;
  *)
    echo "Unsupported OS: $(uname -s)" >&2
    exit 1
    ;;
esac

create_bucket
+ list(APPEND ICEBERG_SYSTEM_DEPENDENCIES ZLIB) + if(ARROW_S3) + list(APPEND ICEBERG_SYSTEM_DEPENDENCIES OpenSSL CURL) + endif() else() set(ARROW_VENDORED FALSE) find_package(Arrow CONFIG REQUIRED) diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index b503a41ea..0af6bbce3 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -40,6 +40,7 @@ set(ICEBERG_SOURCES expression/rewrite_not.cc expression/strict_metrics_evaluator.cc expression/term.cc + file_io_registry.cc file_reader.cc file_writer.cc inheritable_metadata.cc @@ -176,6 +177,8 @@ add_subdirectory(util) if(ICEBERG_BUILD_BUNDLE) set(ICEBERG_BUNDLE_SOURCES arrow/arrow_fs_file_io.cc + arrow/arrow_s3_file_io.cc + arrow/file_io_register.cc arrow/metadata_column_util.cc avro/avro_data_util.cc avro/avro_direct_decoder.cc @@ -241,6 +244,15 @@ if(ICEBERG_BUILD_BUNDLE) OUTPUTS ICEBERG_BUNDLE_LIBRARIES) + if(ICEBERG_S3) + foreach(target iceberg_bundle_static iceberg_bundle_shared) + if(TARGET ${target}) + target_compile_definitions(${target} + PUBLIC "$") + endif() + endforeach() + endif() + add_subdirectory(arrow) add_subdirectory(avro) add_subdirectory(parquet) diff --git a/src/iceberg/arrow/arrow_file_io.h b/src/iceberg/arrow/arrow_file_io.h index 12a9b2303..514881b11 100644 --- a/src/iceberg/arrow/arrow_file_io.h +++ b/src/iceberg/arrow/arrow_file_io.h @@ -20,9 +20,12 @@ #pragma once #include +#include +#include #include "iceberg/file_io.h" #include "iceberg/iceberg_bundle_export.h" +#include "iceberg/result.h" namespace iceberg::arrow { @@ -30,4 +33,18 @@ ICEBERG_BUNDLE_EXPORT std::unique_ptr MakeMockFileIO(); ICEBERG_BUNDLE_EXPORT std::unique_ptr MakeLocalFileIO(); +/// \brief Create an S3 FileIO backed by Arrow's S3FileSystem. +/// +/// This function initializes the S3 subsystem if not already initialized (thread-safe). +/// The S3 initialization is done once per process using std::call_once. 
+/// +/// \param uri An S3 URI (must start with "s3://") used to validate the scheme. +/// \param properties Optional configuration properties for S3 access. See S3Properties +/// for available keys (credentials, region, endpoint, timeouts, etc.). +/// \return A FileIO instance for S3 operations, or an error if S3 is not supported +/// or the URI is invalid. +ICEBERG_BUNDLE_EXPORT Result> MakeS3FileIO( + const std::string& uri, + const std::unordered_map& properties = {}); + } // namespace iceberg::arrow diff --git a/src/iceberg/arrow/arrow_fs_file_io.cc b/src/iceberg/arrow/arrow_fs_file_io.cc index be62b79af..769fcfb13 100644 --- a/src/iceberg/arrow/arrow_fs_file_io.cc +++ b/src/iceberg/arrow/arrow_fs_file_io.cc @@ -25,13 +25,23 @@ #include "iceberg/arrow/arrow_file_io.h" #include "iceberg/arrow/arrow_fs_file_io_internal.h" #include "iceberg/arrow/arrow_status_internal.h" +#include "iceberg/util/macros.h" namespace iceberg::arrow { +Result ArrowFileSystemFileIO::ResolvePath(const std::string& file_location) { + if (file_location.find("://") != std::string::npos) { + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto path, arrow_fs_->PathFromUri(file_location)); + return path; + } + return file_location; +} + /// \brief Read the content of the file at the given location. 
Result ArrowFileSystemFileIO::ReadFile(const std::string& file_location, std::optional length) { - ::arrow::fs::FileInfo file_info(file_location); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + ::arrow::fs::FileInfo file_info(path); if (length.has_value()) { file_info.set_size(length.value()); } @@ -47,6 +57,10 @@ Result ArrowFileSystemFileIO::ReadFile(const std::string& file_loca ICEBERG_ARROW_ASSIGN_OR_RETURN( auto read_bytes, file->Read(read_length, reinterpret_cast(&content[offset]))); + if (read_bytes == 0) { + return IOError("Unexpected EOF reading {}: got {} of {} bytes", file_location, + offset, file_size); + } remain -= read_bytes; offset += read_bytes; } @@ -57,7 +71,8 @@ Result ArrowFileSystemFileIO::ReadFile(const std::string& file_loca /// \brief Write the given content to the file at the given location. Status ArrowFileSystemFileIO::WriteFile(const std::string& file_location, std::string_view content) { - ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, arrow_fs_->OpenOutputStream(file_location)); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, arrow_fs_->OpenOutputStream(path)); ICEBERG_ARROW_RETURN_NOT_OK(file->Write(content.data(), content.size())); ICEBERG_ARROW_RETURN_NOT_OK(file->Flush()); ICEBERG_ARROW_RETURN_NOT_OK(file->Close()); @@ -66,7 +81,8 @@ Status ArrowFileSystemFileIO::WriteFile(const std::string& file_location, /// \brief Delete a file at the given location. 
Status ArrowFileSystemFileIO::DeleteFile(const std::string& file_location) { - ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFile(file_location)); + ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location)); + ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFile(path)); return {}; } diff --git a/src/iceberg/arrow/arrow_fs_file_io_internal.h b/src/iceberg/arrow/arrow_fs_file_io_internal.h index f151c7a5b..92a991501 100644 --- a/src/iceberg/arrow/arrow_fs_file_io_internal.h +++ b/src/iceberg/arrow/arrow_fs_file_io_internal.h @@ -56,6 +56,9 @@ class ICEBERG_BUNDLE_EXPORT ArrowFileSystemFileIO : public FileIO { const std::shared_ptr<::arrow::fs::FileSystem>& fs() const { return arrow_fs_; } private: + /// \brief Resolve a file location to a filesystem path. + Result ResolvePath(const std::string& file_location); + std::shared_ptr<::arrow::fs::FileSystem> arrow_fs_; }; diff --git a/src/iceberg/arrow/arrow_s3_file_io.cc b/src/iceberg/arrow/arrow_s3_file_io.cc new file mode 100644 index 000000000..e610d15eb --- /dev/null +++ b/src/iceberg/arrow/arrow_s3_file_io.cc @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include + +#include +#ifdef ICEBERG_S3_ENABLED +# include +# define ICEBERG_ARROW_HAS_S3 1 +#else +# define ICEBERG_ARROW_HAS_S3 0 +#endif + +#include "iceberg/arrow/arrow_file_io.h" +#include "iceberg/arrow/arrow_fs_file_io_internal.h" +#include "iceberg/arrow/arrow_status_internal.h" +#include "iceberg/arrow/s3_properties.h" +#include "iceberg/util/macros.h" + +namespace iceberg::arrow { + +namespace { + +Status EnsureS3Initialized() { +#if ICEBERG_ARROW_HAS_S3 + static std::once_flag init_flag; + static ::arrow::Status init_status = ::arrow::Status::OK(); + std::call_once(init_flag, []() { + ::arrow::fs::S3GlobalOptions options; + init_status = ::arrow::fs::InitializeS3(options); + }); + if (!init_status.ok()) { + return std::unexpected(Error{.kind = ::iceberg::arrow::ToErrorKind(init_status), + .message = init_status.ToString()}); + } + return {}; +#else + return NotImplemented("Arrow S3 support is not enabled"); +#endif +} + +#if ICEBERG_ARROW_HAS_S3 +/// \brief Configure S3Options from a properties map. +/// +/// \param properties The configuration properties map. +/// \return Configured S3Options. +Result<::arrow::fs::S3Options> ConfigureS3Options( + const std::unordered_map& properties) { + ::arrow::fs::S3Options options; + + // Configure credentials + auto access_key_it = properties.find(S3Properties::kAccessKeyId); + auto secret_key_it = properties.find(S3Properties::kSecretAccessKey); + auto session_token_it = properties.find(S3Properties::kSessionToken); + + if (access_key_it != properties.end() && secret_key_it != properties.end()) { + if (session_token_it != properties.end()) { + options.ConfigureAccessKey(access_key_it->second, secret_key_it->second, + session_token_it->second); + } else { + options.ConfigureAccessKey(access_key_it->second, secret_key_it->second); + } + } else { + // Use default credential chain (environment, instance profile, etc.) 
+ options.ConfigureDefaultCredentials(); + } + + // Configure region + auto region_it = properties.find(S3Properties::kRegion); + if (region_it != properties.end()) { + options.region = region_it->second; + } + + // Configure endpoint (for MinIO, LocalStack, etc.) + auto endpoint_it = properties.find(S3Properties::kEndpoint); + if (endpoint_it != properties.end()) { + options.endpoint_override = endpoint_it->second; + } else { + // Fall back to AWS standard environment variables for endpoint override + const char* s3_endpoint_env = std::getenv("AWS_ENDPOINT_URL_S3"); + if (s3_endpoint_env != nullptr) { + options.endpoint_override = s3_endpoint_env; + } else { + const char* endpoint_env = std::getenv("AWS_ENDPOINT_URL"); + if (endpoint_env != nullptr) { + options.endpoint_override = endpoint_env; + } + } + } + + auto path_style_it = properties.find(S3Properties::kPathStyleAccess); + if (path_style_it != properties.end() && path_style_it->second == "true") { + options.force_virtual_addressing = false; + } + + // Configure SSL + auto ssl_it = properties.find(S3Properties::kSslEnabled); + if (ssl_it != properties.end() && ssl_it->second == "false") { + options.scheme = "http"; + } + + // Configure timeouts + auto connect_timeout_it = properties.find(S3Properties::kConnectTimeoutMs); + if (connect_timeout_it != properties.end()) { + try { + options.connect_timeout = std::stod(connect_timeout_it->second) / 1000.0; + } catch (const std::exception& e) { + return InvalidArgument("Invalid {}: '{}' ({})", S3Properties::kConnectTimeoutMs, + connect_timeout_it->second, e.what()); + } + } + + auto socket_timeout_it = properties.find(S3Properties::kSocketTimeoutMs); + if (socket_timeout_it != properties.end()) { + try { + options.request_timeout = std::stod(socket_timeout_it->second) / 1000.0; + } catch (const std::exception& e) { + return InvalidArgument("Invalid {}: '{}' ({})", S3Properties::kSocketTimeoutMs, + socket_timeout_it->second, e.what()); + } + } + + return options; 
+} +#endif + +} // namespace + +Result> MakeS3FileIO( + const std::string& uri, + const std::unordered_map& properties) { + if (!uri.starts_with("s3://")) { + return InvalidArgument("S3 URI must start with s3://"); + } +#if !ICEBERG_ARROW_HAS_S3 + return NotImplemented("Arrow S3 support is not enabled"); +#else + ICEBERG_RETURN_UNEXPECTED(EnsureS3Initialized()); + + // Configure S3 options from properties (uses default credentials if empty) + ICEBERG_ASSIGN_OR_RAISE(auto options, ConfigureS3Options(properties)); + ICEBERG_ARROW_ASSIGN_OR_RETURN(auto fs, ::arrow::fs::S3FileSystem::Make(options)); + + return std::make_unique(std::move(fs)); +#endif +} + +} // namespace iceberg::arrow diff --git a/src/iceberg/arrow/file_io_register.cc b/src/iceberg/arrow/file_io_register.cc new file mode 100644 index 000000000..8438a128c --- /dev/null +++ b/src/iceberg/arrow/file_io_register.cc @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "iceberg/arrow/file_io_register.h" + +#include + +#include "iceberg/arrow/arrow_file_io.h" +#include "iceberg/arrow/arrow_fs_file_io_internal.h" +#include "iceberg/file_io_registry.h" +#include "iceberg/util/macros.h" + +namespace iceberg::arrow { + +void RegisterFileIO() { + static std::once_flag flag; + std::call_once(flag, []() { + // Register Arrow local filesystem FileIO + FileIORegistry::Register( + FileIORegistry::kArrowLocalFileIO, + [](const std::string& /*warehouse*/, + const std::unordered_map& /*properties*/) + -> Result> { + return std::shared_ptr(MakeLocalFileIO()); + }); + + // Register Arrow S3 FileIO + FileIORegistry::Register( + FileIORegistry::kArrowS3FileIO, + [](const std::string& warehouse, + const std::unordered_map& properties) + -> Result> { + ICEBERG_ASSIGN_OR_RAISE(auto file_io, MakeS3FileIO(warehouse, properties)); + return std::shared_ptr(std::move(file_io)); + }); + }); +} + +} // namespace iceberg::arrow diff --git a/src/iceberg/arrow/file_io_register.h b/src/iceberg/arrow/file_io_register.h new file mode 100644 index 000000000..6f52c2f81 --- /dev/null +++ b/src/iceberg/arrow/file_io_register.h @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +/// \file iceberg/arrow/file_io_register.h +/// \brief Provide functions to register Arrow FileIO implementations. + +#include "iceberg/iceberg_bundle_export.h" + +namespace iceberg::arrow { + +/// \brief Register Arrow FileIO implementations (local and S3) into the +/// FileIORegistry. +/// +/// This function is idempotent and thread-safe. It registers: +/// - ArrowFileIO (local filesystem) +/// - ArrowS3FileIO (S3 filesystem) +/// +/// Must be called before using FileIORegistry::Load() with the built-in +/// implementation names (e.g., from RestCatalog::Make(config)). +ICEBERG_BUNDLE_EXPORT void RegisterFileIO(); + +} // namespace iceberg::arrow diff --git a/src/iceberg/arrow/s3_properties.h b/src/iceberg/arrow/s3_properties.h new file mode 100644 index 000000000..210a1ab3e --- /dev/null +++ b/src/iceberg/arrow/s3_properties.h @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +namespace iceberg::arrow { + +/// \brief S3 configuration property keys for ArrowS3FileIO. 
///
/// Each constant is the string key under which the corresponding S3 setting
/// is stored in a configuration properties map.
struct S3Properties {
  // Credentials
  /// Key for the AWS access key ID.
  static constexpr const char* kAccessKeyId{"s3.access-key-id"};
  /// Key for the AWS secret access key.
  static constexpr const char* kSecretAccessKey{"s3.secret-access-key"};
  /// Key for the AWS session token (temporary credentials).
  static constexpr const char* kSessionToken{"s3.session-token"};

  // Location and addressing
  /// Key for the AWS region.
  static constexpr const char* kRegion{"s3.region"};
  /// Key for a custom endpoint override (MinIO, LocalStack, etc.).
  static constexpr const char* kEndpoint{"s3.endpoint"};
  /// Key selecting path-style access (needed for MinIO).
  static constexpr const char* kPathStyleAccess{"s3.path-style-access"};
  /// Key selecting whether SSL is enabled.
  static constexpr const char* kSslEnabled{"s3.ssl.enabled"};

  // Timeouts
  /// Key for the connection timeout, in milliseconds.
  static constexpr const char* kConnectTimeoutMs{"s3.connect-timeout-ms"};
  /// Key for the socket timeout, in milliseconds.
  static constexpr const char* kSocketTimeoutMs{"s3.socket-timeout-ms"};
};
config.Get(RestCatalogProperties::kWarehouse); + if (warehouse.empty()) { + return InvalidArgument( + "Warehouse location is required when FileIO is not explicitly provided. " + "Set the 'warehouse' property to an S3 URI (s3://...) or local path."); + } + + // Check for user-specified io-impl property + auto io_impl = config.configs().find(FileIOProperties::kImpl); + std::string impl_name; + + if (io_impl != config.configs().end() && !io_impl->second.empty()) { + // User specified a custom io-impl + impl_name = io_impl->second; + } else { + // Use default based on warehouse URI scheme + if (warehouse.starts_with("s3://")) { + impl_name = FileIORegistry::kArrowS3FileIO; + } else { + impl_name = FileIORegistry::kArrowLocalFileIO; + } + } + + // Load FileIO from registry + ICEBERG_ASSIGN_OR_RAISE(auto file_io, + FileIORegistry::Load(impl_name, warehouse, config.configs())); + + // Call the main Make method with the created FileIO + return Make(config, std::move(file_io)); +} + RestCatalog::RestCatalog(RestCatalogProperties config, std::shared_ptr file_io, std::unique_ptr client, std::unique_ptr paths, diff --git a/src/iceberg/catalog/rest/rest_catalog.h b/src/iceberg/catalog/rest/rest_catalog.h index 38230a5e2..ce122b3b8 100644 --- a/src/iceberg/catalog/rest/rest_catalog.h +++ b/src/iceberg/catalog/rest/rest_catalog.h @@ -54,6 +54,31 @@ class ICEBERG_REST_EXPORT RestCatalog : public Catalog, static Result> Make(const RestCatalogProperties& config, std::shared_ptr file_io); + /// \brief Create a RestCatalog instance with auto-detected FileIO. + /// + /// This overload automatically creates an appropriate FileIO based on the "io-impl" + /// property or the warehouse location URI scheme. + /// + /// FileIO selection logic: + /// 1. If "io-impl" property is set, use the specified implementation from + /// FileIORegistry. + /// 2. 
Otherwise, auto-detect based on warehouse URI: + /// - "s3://" -> ArrowS3FileIO + /// - Local path -> ArrowLocalFileIO + /// + /// Users can register custom FileIO implementations via FileIORegistry::Register(): + /// \code + /// FileIORegistry::Register("com.mycompany.MyFileIO", + /// [](const std::string& warehouse, const auto& props) { + /// return std::make_shared(warehouse, props); + /// }); + /// \endcode + /// + /// \param config the configuration for the RestCatalog, including warehouse location + /// and optional "io-impl" property + /// \return a shared_ptr to RestCatalog instance, or an error if FileIO creation fails + static Result> Make(const RestCatalogProperties& config); + std::string_view name() const override; Result> ListNamespaces(const Namespace& ns) const override; diff --git a/src/iceberg/file_io_registry.cc b/src/iceberg/file_io_registry.cc new file mode 100644 index 000000000..ebc84820f --- /dev/null +++ b/src/iceberg/file_io_registry.cc @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "iceberg/file_io_registry.h" + +// FileIORegistry is header-only (all methods are inline/static). +// This translation unit ensures the header compiles cleanly. 
diff --git a/src/iceberg/file_io_registry.h b/src/iceberg/file_io_registry.h new file mode 100644 index 000000000..6500b0e93 --- /dev/null +++ b/src/iceberg/file_io_registry.h @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "iceberg/file_io.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" + +namespace iceberg { + +/// \brief Registry for FileIO implementations. +/// +/// Provides a mechanism to register and load FileIO implementations by name. +/// This allows the REST catalog (and others) to resolve FileIO implementations +/// at runtime based on configuration properties like "io-impl". +class ICEBERG_EXPORT FileIORegistry { + public: + /// Well-known implementation names + static constexpr const char* kArrowLocalFileIO = "org.apache.iceberg.arrow.ArrowFileIO"; + static constexpr const char* kArrowS3FileIO = "org.apache.iceberg.arrow.ArrowS3FileIO"; + + /// Factory function type for creating FileIO instances. + using Factory = std::function>( + const std::string& warehouse, + const std::unordered_map& properties)>; + + /// \brief Register a FileIO factory under the given name. 
+ /// + /// \param name The implementation name (e.g., "org.apache.iceberg.arrow.ArrowFileIO") + /// \param factory The factory function that creates the FileIO instance. + static void Register(const std::string& name, Factory factory) { + std::lock_guard lock(Mutex()); + Registry()[name] = std::move(factory); + } + + /// \brief Load a FileIO implementation by name. + /// + /// \param name The implementation name to look up. + /// \param warehouse The warehouse location URI. + /// \param properties Configuration properties to pass to the factory. + /// \return A shared_ptr to the FileIO instance, or an error if not found. + static Result> Load( + const std::string& name, const std::string& warehouse, + const std::unordered_map& properties) { + Factory factory; + { + std::lock_guard lock(Mutex()); + auto it = Registry().find(name); + if (it == Registry().end()) { + return std::unexpected( + {.kind = ErrorKind::kNotFound, + .message = "FileIO implementation not found: " + name}); + } + factory = it->second; + } + // Invoke factory outside the lock to avoid blocking other Register/Load + // calls and to prevent deadlocks if the factory calls back into the registry. + return factory(warehouse, properties); + } + + private: + static std::unordered_map& Registry() { + static std::unordered_map registry; + return registry; + } + + static std::mutex& Mutex() { + static std::mutex mutex; + return mutex; + } +}; + +/// \brief Property keys for FileIO configuration. 
+struct FileIOProperties { + /// Property key whose value is the name of the FileIO implementation to use (e.g., "org.apache.iceberg.arrow.ArrowFileIO") + static constexpr const char* kImpl = "io-impl"; +}; + +} // namespace iceberg diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index 2cf1065b0..b13f6c4c9 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -58,6 +58,7 @@ iceberg_sources = files( 'expression/rewrite_not.cc', 'expression/strict_metrics_evaluator.cc', 'expression/term.cc', + 'file_io_registry.cc', 'file_reader.cc', 'file_writer.cc', 'inheritable_metadata.cc', @@ -185,6 +186,7 @@ install_headers( 'exception.h', 'file_format.h', 'file_io.h', + 'file_io_registry.h', 'file_reader.h', 'file_writer.h', 'iceberg_export.h', diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 768e0507e..9e6188438 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -113,6 +113,7 @@ add_iceberg_test(util_test data_file_set_test.cc decimal_test.cc endian_test.cc + file_io_registry_test.cc formatter_test.cc location_util_test.cc string_util_test.cc @@ -137,6 +138,7 @@ if(ICEBERG_BUILD_BUNDLE) USE_BUNDLE SOURCES arrow_fs_file_io_test.cc + arrow_s3_file_io_test.cc arrow_test.cc gzip_decompress_test.cc metadata_io_test.cc diff --git a/src/iceberg/test/arrow_s3_file_io_test.cc b/src/iceberg/test/arrow_s3_file_io_test.cc new file mode 100644 index 000000000..f44730f04 --- /dev/null +++ b/src/iceberg/test/arrow_s3_file_io_test.cc @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+
+#ifdef ICEBERG_S3_ENABLED
+# include
+#endif
+
+#include "iceberg/arrow/arrow_file_io.h"
+#include "iceberg/arrow/s3_properties.h"
+#include "iceberg/test/matchers.h"
+
+#ifdef ICEBERG_S3_ENABLED
+namespace {
+
+/// \brief Global GTest environment whose only job is to shut down Arrow's S3 layer.
+///
+/// Initializing Arrow S3 leaves global state behind, and Arrow expects FinalizeS3()
+/// to be called before the process exits; if it is not, Arrow's static destructor
+/// notices and the process exits non-zero (a failure under sanitizers). GTest
+/// invokes Environment::TearDown() once every test has run and before static
+/// destructors execute, so the finalization is done there. A failed finalization
+/// is only reported on stderr.
+class ArrowS3TestEnvironment : public ::testing::Environment {
+ public:
+  void TearDown() override {
+    if (const auto finalized = ::arrow::fs::FinalizeS3(); !finalized.ok()) {
+      std::cerr << "Warning: FinalizeS3 failed: " << finalized.ToString() << std::endl;
+    }
+  }
+};
+
+// Register before main() runs. GTest takes ownership of the pointer.
+[[maybe_unused]] auto* const kS3Env =
+    ::testing::AddGlobalTestEnvironment(new ArrowS3TestEnvironment);
+
+}  // namespace
+#endif
+
+namespace iceberg::arrow {
+
+// A non-"s3://" URI must be rejected with an error message that mentions "s3://".
+TEST(ArrowS3FileIOTest, RejectsNonS3Uri) {
+  auto outcome = MakeS3FileIO("file:///tmp/not-s3");
+  EXPECT_THAT(outcome, IsError(ErrorKind::kInvalidArgument));
+  EXPECT_THAT(outcome, HasErrorMessage("s3://"));
+}
+
+// With S3 compiled in, MakeS3FileIO may fail but never with kNotImplemented; without
+// it, kNotImplemented is the required outcome.
+TEST(ArrowS3FileIOTest, RequiresS3SupportAtBuildTime) {
+#ifdef ICEBERG_S3_ENABLED
+  auto outcome = MakeS3FileIO("s3://bucket/path");
+  if (!outcome.has_value()) {
+    EXPECT_NE(outcome.error().kind, ErrorKind::kNotImplemented);
+  }
+#else
+  auto outcome = MakeS3FileIO("s3://warehouse/iceberg_example");
+  EXPECT_THAT(outcome, IsError(ErrorKind::kNotImplemented));
+#endif
+}
+
+// Writes, reads back and deletes one object under ICEBERG_TEST_S3_URI; skipped when
+// that variable is unset/empty or the build lacks S3 support.
+TEST(ArrowS3FileIOTest, ReadWriteFile) {
+  const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI");
+  if (base_uri == nullptr || *base_uri == '\0') {
+    GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test";
+  }
+
+  auto made = MakeS3FileIO(base_uri);
+  if (!made.has_value() && made.error().kind == ErrorKind::kNotImplemented) {
+    GTEST_SKIP() << "Arrow S3 support is not enabled";
+  }
+  if (!made.has_value()) {
+    FAIL() << "MakeS3FileIO failed: " << made.error().message;
+  }
+  auto file_io = std::move(made.value());
+
+  const std::string root(base_uri);
+  const std::string object_uri =
+      root + (root.ends_with('/') ? "" : "/") + "iceberg_s3_io_test.txt";
+
+  auto written = file_io->WriteFile(object_uri, "hello s3");
+  ASSERT_THAT(written, IsOk());
+  auto contents = file_io->ReadFile(object_uri, std::nullopt);
+  ASSERT_THAT(contents, IsOk());
+  EXPECT_THAT(contents, HasValue(::testing::Eq("hello s3")));
+  auto removed = file_io->DeleteFile(object_uri);
+  EXPECT_THAT(removed, IsOk());
+}
+
+// ============================================================================
+// Tests for MakeS3FileIO with properties
+// 
============================================================================
+
+// The properties overload must reject a non-"s3://" URI with kInvalidArgument and an
+// error message that mentions "s3://".
+TEST(ArrowS3FileIOTest, MakeS3FileIOWithPropertiesRejectsNonS3Uri) {
+  std::unordered_map<std::string, std::string> properties;
+  auto result = MakeS3FileIO("file:///tmp/not-s3", properties);
+  EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument));
+  EXPECT_THAT(result, HasErrorMessage("s3://"));
+}
+
+// An empty property map must still yield a non-null FileIO. Skipped when
+// ICEBERG_TEST_S3_URI is unset/empty or the build lacks S3 support.
+TEST(ArrowS3FileIOTest, MakeS3FileIOWithEmptyPropertiesFallsBack) {
+  const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI");
+  if (base_uri == nullptr || std::string(base_uri).empty()) {
+    GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test";
+  }
+
+  // Empty properties should fall back to URI-based resolution
+  std::unordered_map<std::string, std::string> properties;
+  auto io_res = MakeS3FileIO(base_uri, properties);
+  if (!io_res.has_value()) {
+    if (io_res.error().kind == ErrorKind::kNotImplemented) {
+      GTEST_SKIP() << "Arrow S3 support is not enabled";
+    }
+    FAIL() << "MakeS3FileIO failed: " << io_res.error().message;
+  }
+
+  EXPECT_NE(io_res.value(), nullptr);
+}
+
+// Round-trips one object through a FileIO built from explicit properties.
+//
+// Endpoint and region are read from ICEBERG_TEST_S3_ENDPOINT / AWS_REGION and fall
+// back to AWS_ENDPOINT_URL / AWS_DEFAULT_REGION, which are the names the CI workflow
+// exports; without the fallback the endpoint/region branches never run in CI.
+TEST(ArrowS3FileIOTest, MakeS3FileIOWithProperties) {
+  const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI");
+  if (base_uri == nullptr || std::string(base_uri).empty()) {
+    GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test";
+  }
+
+  const char* access_key = std::getenv("AWS_ACCESS_KEY_ID");
+  const char* secret_key = std::getenv("AWS_SECRET_ACCESS_KEY");
+  const char* endpoint = std::getenv("ICEBERG_TEST_S3_ENDPOINT");
+  if (endpoint == nullptr || *endpoint == '\0') {
+    endpoint = std::getenv("AWS_ENDPOINT_URL");
+  }
+  const char* region = std::getenv("AWS_REGION");
+  if (region == nullptr || *region == '\0') {
+    region = std::getenv("AWS_DEFAULT_REGION");
+  }
+
+  std::unordered_map<std::string, std::string> properties;
+
+  // Configure credentials if available
+  if (access_key != nullptr && secret_key != nullptr) {
+    properties[S3Properties::kAccessKeyId] = access_key;
+    properties[S3Properties::kSecretAccessKey] = secret_key;
+  }
+
+  // Configure endpoint if available (for MinIO, LocalStack, etc.)
+  if (endpoint != nullptr && *endpoint != '\0') {
+    properties[S3Properties::kEndpoint] = endpoint;
+  }
+
+  // Configure region if available
+  if (region != nullptr && *region != '\0') {
+    properties[S3Properties::kRegion] = region;
+  }
+
+  auto io_res = MakeS3FileIO(base_uri, properties);
+  if (!io_res.has_value()) {
+    if (io_res.error().kind == ErrorKind::kNotImplemented) {
+      GTEST_SKIP() << "Arrow S3 support is not enabled";
+    }
+    FAIL() << "MakeS3FileIO failed: " << io_res.error().message;
+  }
+
+  auto io = std::move(io_res.value());
+  std::string object_uri = base_uri;
+  if (!object_uri.ends_with('/')) {
+    object_uri += '/';
+  }
+  object_uri += "iceberg_s3_io_props_test.txt";
+
+  auto write_res = io->WriteFile(object_uri, "hello s3 with properties");
+  ASSERT_THAT(write_res, IsOk());
+
+  auto read_res = io->ReadFile(object_uri, std::nullopt);
+  ASSERT_THAT(read_res, IsOk());
+  EXPECT_THAT(read_res, HasValue(::testing::Eq("hello s3 with properties")));
+
+  auto del_res = io->DeleteFile(object_uri);
+  EXPECT_THAT(del_res, IsOk());
+}
+
+// Passes kSslEnabled = "false" to MakeS3FileIO. Apart from skipping when S3 support
+// is not built in, the test makes no assertion on the outcome.
+TEST(ArrowS3FileIOTest, MakeS3FileIOWithSslDisabled) {
+  const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI");
+  if (base_uri == nullptr || std::string(base_uri).empty()) {
+    GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test";
+  }
+
+  std::unordered_map<std::string, std::string> properties;
+  properties[S3Properties::kSslEnabled] = "false";
+
+  // Just test that the configuration is accepted
+  auto io_res = MakeS3FileIO(base_uri, properties);
+  if (!io_res.has_value()) {
+    if (io_res.error().kind == ErrorKind::kNotImplemented) {
+      GTEST_SKIP() << "Arrow S3 support is not enabled";
+    }
+    // Other errors are acceptable - just checking config parsing works
+  }
+}
+
+// Passes connect/socket timeout properties to MakeS3FileIO. Apart from skipping when
+// S3 support is not built in, the test makes no assertion on the outcome.
+TEST(ArrowS3FileIOTest, MakeS3FileIOWithTimeouts) {
+  const char* base_uri = std::getenv("ICEBERG_TEST_S3_URI");
+  if (base_uri == nullptr || std::string(base_uri).empty()) {
+    GTEST_SKIP() << "Set ICEBERG_TEST_S3_URI to enable S3 IO test";
+  }
+
+  std::unordered_map<std::string, std::string> properties;
+  properties[S3Properties::kConnectTimeoutMs] = "5000";
+  properties[S3Properties::kSocketTimeoutMs] = "10000";
+
+  auto io_res = MakeS3FileIO(base_uri, properties);
+  if (!io_res.has_value()) {
+    if (io_res.error().kind == ErrorKind::kNotImplemented) {
+      GTEST_SKIP() << "Arrow S3 support is not enabled";
+    }
+    // Other errors are acceptable - just checking config parsing works
+  }
+}
+
+}  // namespace iceberg::arrow
diff --git a/src/iceberg/test/file_io_registry_test.cc b/src/iceberg/test/file_io_registry_test.cc
new file mode 100644
index 000000000..d927488a6
--- /dev/null
+++ b/src/iceberg/test/file_io_registry_test.cc
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "iceberg/file_io_registry.h"
+
+#include
+#include
+
+#include "iceberg/test/matchers.h"
+
+namespace iceberg {
+
+namespace {
+
+/// A minimal FileIO implementation for testing.
+class MockFileIO : public FileIO {
+ public:
+  /// Always succeeds and returns the fixed string "mock", whatever is requested.
+  Result<std::string> ReadFile(const std::string& /*file_location*/,
+                               std::optional<size_t> /*length*/) override {
+    return std::string("mock");
+  }
+
+  /// Accepts and discards the content; always succeeds.
+  Status WriteFile(const std::string& /*file_location*/,
+                   std::string_view /*content*/) override {
+    return {};
+  }
+
+  /// No-op delete; always succeeds.
+  Status DeleteFile(const std::string& /*file_location*/) override { return {}; }
+};
+
+}  // namespace
+
+// A registered factory can be loaded by name and yields a usable FileIO.
+TEST(FileIoRegistryTest, RegisterAndLoad) {
+  const std::string impl_name = "com.test.MockFileIO";
+  FileIORegistry::Register(
+      impl_name,
+      [](const std::string& /*warehouse*/,
+         const std::unordered_map<std::string, std::string>& /*properties*/)
+          -> Result<std::shared_ptr<FileIO>> { return std::make_shared<MockFileIO>(); });
+
+  auto result = FileIORegistry::Load(impl_name, "/test/warehouse", {});
+  ASSERT_THAT(result, IsOk());
+  // Fatal on purpose: the pointer is dereferenced right below.
+  ASSERT_NE(result.value(), nullptr);
+
+  // Verify the loaded FileIO works
+  auto read_result = result.value()->ReadFile("any_file", std::nullopt);
+  ASSERT_THAT(read_result, IsOk());
+  EXPECT_EQ(read_result.value(), "mock");
+}
+
+// Loading a name nobody registered reports kNotFound.
+TEST(FileIoRegistryTest, LoadNonExistentReturnsError) {
+  auto result = FileIORegistry::Load("com.nonexistent.FileIO", "/test/warehouse", {});
+  EXPECT_THAT(result, IsError(ErrorKind::kNotFound));
+  EXPECT_THAT(result, HasErrorMessage("FileIO implementation not found"));
+}
+
+// Registering twice under one name must leave the second factory in effect.
+TEST(FileIoRegistryTest, OverrideExistingRegistration) {
+  const std::string impl_name = "com.test.OverrideFileIO";
+
+  // The first factory yields a null FileIO, so the non-null check at the end can
+  // only pass if Load() invoked the factory registered afterwards.
+  FileIORegistry::Register(
+      impl_name,
+      [](const std::string& /*warehouse*/,
+         const std::unordered_map<std::string, std::string>& /*properties*/)
+          -> Result<std::shared_ptr<FileIO>> { return std::shared_ptr<FileIO>{}; });
+
+  // Override with a different factory
+  FileIORegistry::Register(
+      impl_name,
+      [](const std::string& /*warehouse*/,
+         const std::unordered_map<std::string, std::string>& /*properties*/)
+          -> Result<std::shared_ptr<FileIO>> { return std::make_shared<MockFileIO>(); });
+
+  // The override must have replaced the original
+  auto result = FileIORegistry::Load(impl_name, "/test/warehouse", {});
+  ASSERT_THAT(result, IsOk());
+  EXPECT_NE(result.value(), nullptr);
+}
+
+// Load() must hand its warehouse and properties arguments to the factory unchanged.
+TEST(FileIoRegistryTest, FactoryReceivesWarehouseAndProperties) {
+  const std::string impl_name = "com.test.PropCheckFileIO";
+
+  // The factory stays in the static registry after this test returns, so it must
+  // not capture locals by reference; it shares ownership of its record instead.
+  struct Captured {
+    std::string warehouse;
+    std::unordered_map<std::string, std::string> properties;
+  };
+  auto captured = std::make_shared<Captured>();
+
+  FileIORegistry::Register(
+      impl_name,
+      [captured](const std::string& warehouse,
+                 const std::unordered_map<std::string, std::string>& properties)
+          -> Result<std::shared_ptr<FileIO>> {
+        captured->warehouse = warehouse;
+        captured->properties = properties;
+        return std::make_shared<MockFileIO>();
+      });
+
+  std::unordered_map<std::string, std::string> props = {{"key1", "val1"},
+                                                        {"key2", "val2"}};
+  auto result = FileIORegistry::Load(impl_name, "s3://my-bucket/warehouse", props);
+  ASSERT_THAT(result, IsOk());
+  EXPECT_EQ(captured->warehouse, "s3://my-bucket/warehouse");
+  EXPECT_EQ(captured->properties.at("key1"), "val1");
+  EXPECT_EQ(captured->properties.at("key2"), "val2");
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/test/rest_catalog_integration_test.cc b/src/iceberg/test/rest_catalog_integration_test.cc
index b364ffd36..efde1dc15 100644
--- a/src/iceberg/test/rest_catalog_integration_test.cc
+++ b/src/iceberg/test/rest_catalog_integration_test.cc
@@ -39,6 +39,7 @@
 #include "iceberg/catalog/rest/http_client.h"
 #include "iceberg/catalog/rest/json_serde_internal.h"
 #include "iceberg/catalog/rest/rest_catalog.h"
+#include "iceberg/file_io_registry.h"
 #include "iceberg/partition_spec.h"
 #include "iceberg/result.h"
 #include "iceberg/schema.h"
@@ -476,4 +477,81 @@ TEST_F(RestCatalogIntegrationTest, LoadTableWithSnapshotModeRefs) {
   EXPECT_FALSE(loaded->metadata()->schemas.empty());
 }
 
+// ============================================================================
+// Tests for RestCatalog::Make(config) with auto-detected FileIO
+// ============================================================================
+
+TEST_F(RestCatalogIntegrationTest, MakeWithoutWarehouseReturnsError) {
+  auto config = RestCatalogProperties::default_properties();
+  config.Set(RestCatalogProperties::kUri, CatalogUri())
+      .Set(RestCatalogProperties::kName, std::string(kCatalogName));
+  // Note: warehouse is NOT set
+
+  auto result = RestCatalog::Make(config);
+
+  EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument));
+  EXPECT_THAT(result, HasErrorMessage("Warehouse location is required"));
+}
+
+TEST_F(RestCatalogIntegrationTest, MakeWithUnregisteredIoImplReturnsError) {
+  auto config = RestCatalogProperties::default_properties();
+  config.Set(RestCatalogProperties::kUri, CatalogUri())
+      .Set(RestCatalogProperties::kName, std::string(kCatalogName))
+      .Set(RestCatalogProperties::kWarehouse, std::string("/local/warehouse"));
+  config.mutable_configs()[FileIOProperties::kImpl] = "com.nonexistent.FileIO";
+
+  auto result = RestCatalog::Make(config);
+
+  // Should fail because the io-impl is not registered
+  EXPECT_THAT(result, IsError(ErrorKind::kNotFound));
+  EXPECT_THAT(result, HasErrorMessage("FileIO implementation not found"));
+}
+
+TEST_F(RestCatalogIntegrationTest, MakeWithAutoDetectedLocalFileIO) {
+  // NOTE(review): this replaces whatever factory is registered under
+  // kArrowLocalFileIO and never restores it (FileIORegistry has no unregister), so
+  // later tests in the same process also see this factory.
+  FileIORegistry::Register(
+      FileIORegistry::kArrowLocalFileIO,
+      [](const std::string& /*warehouse*/,
+         const std::unordered_map& /*properties*/)
+          -> Result> {
+        return std::make_shared();
+      });
+
+  auto config = RestCatalogProperties::default_properties();
+  config.Set(RestCatalogProperties::kUri, CatalogUri())
+      .Set(RestCatalogProperties::kName, std::string(kCatalogName))
+      .Set(RestCatalogProperties::kWarehouse, std::string("/local/warehouse"));
+
+  auto catalog_result = RestCatalog::Make(config);
+  ASSERT_THAT(catalog_result, IsOk());
+
+  auto& catalog = catalog_result.value();
+  EXPECT_EQ(catalog->name(), kCatalogName);
+}
+
+TEST_F(RestCatalogIntegrationTest, MakeWithCustomIoImpl) {
+  const std::string custom_impl = "com.mycompany.CustomFileIO";
+  FileIORegistry::Register(
+      custom_impl,
+      [](const std::string& /*warehouse*/,
+         const std::unordered_map& /*properties*/)
+          -> Result> {
+        return std::make_shared();
+      });
+
+  auto config = RestCatalogProperties::default_properties();
+  config.Set(RestCatalogProperties::kUri, CatalogUri())
+      .Set(RestCatalogProperties::kName, std::string(kCatalogName))
+      .Set(RestCatalogProperties::kWarehouse, std::string("/any/warehouse"));
+  config.mutable_configs()[FileIOProperties::kImpl] = custom_impl;
+
+  auto catalog_result = RestCatalog::Make(config);
+  ASSERT_THAT(catalog_result, IsOk());
+
+  auto& catalog = catalog_result.value();
+  EXPECT_EQ(catalog->name(), kCatalogName);
+}
+
 }  // namespace iceberg::rest