Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,24 @@ jobs:
name: AMD64 Ubuntu 24.04
runs-on: ubuntu-24.04
timeout-minutes: 30
strategy:
fail-fast: false
env:
ICEBERG_TEST_S3_URI: s3://iceberg-test
AWS_ACCESS_KEY_ID: minio
AWS_SECRET_ACCESS_KEY: minio123
AWS_DEFAULT_REGION: us-east-1
AWS_ENDPOINT_URL: http://127.0.0.1:9000
AWS_EC2_METADATA_DISABLED: "TRUE"
steps:
- name: Checkout iceberg-cpp
uses: actions/checkout@v6
- name: Install dependencies
shell: bash
run: sudo apt-get update && sudo apt-get install -y libcurl4-openssl-dev
- name: Start MinIO
shell: bash
run: bash ci/scripts/start_minio.sh
- name: Build Iceberg
shell: bash
env:
Expand All @@ -63,9 +75,21 @@ jobs:
name: AArch64 macOS 26
runs-on: macos-26
timeout-minutes: 30
strategy:
fail-fast: false
env:
ICEBERG_TEST_S3_URI: s3://iceberg-test
AWS_ACCESS_KEY_ID: minio
AWS_SECRET_ACCESS_KEY: minio123
AWS_DEFAULT_REGION: us-east-1
AWS_ENDPOINT_URL: http://127.0.0.1:9000
AWS_EC2_METADATA_DISABLED: "TRUE"
steps:
- name: Checkout iceberg-cpp
uses: actions/checkout@v6
- name: Start MinIO
shell: bash
run: bash ci/scripts/start_minio.sh
- name: Build Iceberg
shell: bash
run: ci/scripts/build_iceberg.sh $(pwd)
Expand All @@ -76,6 +100,15 @@ jobs:
name: AMD64 Windows 2025
runs-on: windows-2025
timeout-minutes: 60
strategy:
fail-fast: false
env:
ICEBERG_TEST_S3_URI: s3://iceberg-test
AWS_ACCESS_KEY_ID: minio
AWS_SECRET_ACCESS_KEY: minio123
AWS_DEFAULT_REGION: us-east-1
AWS_ENDPOINT_URL: http://127.0.0.1:9000
AWS_EC2_METADATA_DISABLED: "TRUE"
steps:
- name: Checkout iceberg-cpp
uses: actions/checkout@v6
Expand All @@ -85,6 +118,9 @@ jobs:
vcpkg install zlib:x64-windows nlohmann-json:x64-windows nanoarrow:x64-windows roaring:x64-windows cpr:x64-windows
- name: Setup sccache
uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # v0.0.9
- name: Start MinIO
shell: bash
run: bash ci/scripts/start_minio.sh
- name: Build Iceberg
shell: cmd
env:
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ option(ICEBERG_BUILD_TESTS "Build tests" ON)
option(ICEBERG_BUILD_BUNDLE "Build the battery included library" ON)
option(ICEBERG_BUILD_REST "Build rest catalog client" ON)
option(ICEBERG_BUILD_REST_INTEGRATION_TESTS "Build rest catalog integration tests" OFF)
option(ICEBERG_S3 "Build with S3 support" ON)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is worth noting that ICEBERG_S3 should be disabled if ICEBERG_BUILD_BUNDLE is OFF.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we disable it by default?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, it appears that the entire project will prioritize development around the REST catalog, which primarily interacts with S3. Are we certain we want to disable ICEBERG_S3 by default?

option(ICEBERG_ENABLE_ASAN "Enable Address Sanitizer" OFF)
option(ICEBERG_ENABLE_UBSAN "Enable Undefined Behavior Sanitizer" OFF)

Expand Down
148 changes: 148 additions & 0 deletions ci/scripts/start_minio.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# -e: abort on the first failing command; -u: treat unset variables as errors;
# -x: echo each command before running it (useful in CI logs).
set -eux

# Every setting below can be overridden from the environment; the value after
# ":-" is the fallback used when the variable is unset or empty.

# Credentials passed to the MinIO server and reused by mc when creating the bucket.
MINIO_ROOT_USER="${MINIO_ROOT_USER:-minio}"
MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD:-minio123}"
# Container image and container name used by the Docker code path.
MINIO_IMAGE="${MINIO_IMAGE:-minio/minio:latest}"
MINIO_CONTAINER_NAME="${MINIO_CONTAINER_NAME:-iceberg-minio}"
# Host ports for the S3 API and the web console.
MINIO_PORT="${MINIO_PORT:-9000}"
MINIO_CONSOLE_PORT="${MINIO_CONSOLE_PORT:-9001}"
# Bucket created once the server is reachable.
MINIO_BUCKET="${MINIO_BUCKET:-iceberg-test}"
# URL used for the readiness probe and for the mc alias; derived from MINIO_PORT by default.
MINIO_ENDPOINT="${MINIO_ENDPOINT:-http://127.0.0.1:${MINIO_PORT}}"

# Poll MinIO's readiness endpoint until it answers or the attempts run out.
#
# Arguments:
#   $1 - optional number of attempts, made one second apart (default: 30)
# Returns:
#   0 once ${MINIO_ENDPOINT}/minio/health/ready succeeds, 1 otherwise.
wait_for_minio() {
  local attempts="${1:-30}"
  local attempt
  for ((attempt = 1; attempt <= attempts; attempt++)); do
    # --max-time keeps one stalled request from blocking the whole wait loop.
    if curl -fsS --max-time 5 "${MINIO_ENDPOINT}/minio/health/ready" >/dev/null; then
      return 0
    fi
    sleep 1
  done
  echo "MinIO at ${MINIO_ENDPOINT} did not become ready after ${attempts} attempts" >&2
  return 1
}

# Start MinIO in a Docker container and wait until it reports ready.
#
# Returns non-zero when docker is not installed, the container cannot be
# started, or the server never becomes ready, so that callers can fall back
# to a native binary.
start_minio_docker() {
  if ! command -v docker >/dev/null 2>&1; then
    return 1
  fi

  # Remove a leftover container with the same name from a previous run.
  # -F/-x match the name literally and as a whole line.
  if docker ps -a --format '{{.Names}}' | grep -Fxq "${MINIO_CONTAINER_NAME}"; then
    docker rm -f "${MINIO_CONTAINER_NAME}" || return 1
  fi

  # Inside the container the API listens on 9000 and the console on 9001,
  # which are the targets of the -p mappings; only the host side is
  # configurable. The explicit "|| return 1" matters when this function is
  # called from an `if !` condition, where `set -e` is suspended.
  docker run -d --name "${MINIO_CONTAINER_NAME}" \
    -p "${MINIO_PORT}:9000" -p "${MINIO_CONSOLE_PORT}:9001" \
    -e "MINIO_ROOT_USER=${MINIO_ROOT_USER}" \
    -e "MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}" \
    "${MINIO_IMAGE}" \
    server /data --console-address ":9001" || return 1

  if ! wait_for_minio; then
    # Surface the server output, then free the published ports so a native
    # fallback can bind them.
    docker logs "${MINIO_CONTAINER_NAME}" >&2 || true
    docker rm -f "${MINIO_CONTAINER_NAME}" >/dev/null 2>&1 || true
    return 1
  fi
}

# Start a native MinIO server on macOS in the background and wait for it.
#
# Uses a `minio` binary already on PATH when there is one; otherwise installs
# it with Homebrew. Returns 1 when neither minio nor brew is available, or
# when the server does not become ready.
start_minio_macos() {
  if ! command -v minio >/dev/null 2>&1; then
    if ! command -v brew >/dev/null 2>&1; then
      echo "brew is required to start MinIO on macOS without Docker" >&2
      return 1
    fi
    brew install minio
  fi

  # The credentials are passed only to the server process; it keeps running
  # in the background after this script continues.
  MINIO_ROOT_USER="${MINIO_ROOT_USER}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD}" \
    minio server /tmp/minio --console-address ":${MINIO_CONSOLE_PORT}" &
  wait_for_minio
}

# Download the MinIO client (mc) for the current platform.
#
# The binary is saved under ${RUNNER_TEMP:-/tmp} and its path is stored in the
# global MC_BIN for later use. Returns 1 on an unsupported operating system.
download_mc() {
  local uname_out
  uname_out="$(uname -s)"

  local arch
  arch="$(uname -m)"

  local mc_dir
  mc_dir="${RUNNER_TEMP:-/tmp}"
  mkdir -p "${mc_dir}"

  # Work out the release directory and file name first so the download itself
  # is written only once.
  local platform
  local binary="mc"
  case "${uname_out}" in
    Linux*)
      case "${arch}" in
        aarch64|arm64) platform="linux-arm64" ;;
        *) platform="linux-amd64" ;;
      esac
      ;;
    Darwin*)
      if [ "${arch}" = "arm64" ]; then
        platform="darwin-arm64"
      else
        platform="darwin-amd64"
      fi
      ;;
    MINGW*|MSYS*|CYGWIN*)
      platform="windows-amd64"
      binary="mc.exe"
      ;;
    *)
      echo "Unsupported OS for mc: ${uname_out}" >&2
      return 1
      ;;
  esac

  MC_BIN="${mc_dir}/${binary}"
  # -f makes curl fail on an HTTP error instead of saving the error page as
  # the "binary".
  curl -fsSL "https://dl.min.io/client/mc/release/${platform}/${binary}" -o "${MC_BIN}"
  chmod +x "${MC_BIN}"
}

# Create ${MINIO_BUCKET} on the running MinIO server using the mc client.
#
# Downloads mc, registers the server under the alias "local" (retrying once a
# second, up to 30 times) and then creates the bucket if it does not exist.
# Returns 1 with a message when the alias can never be configured.
create_bucket() {
  download_mc

  local attempt
  local alias_ready=0
  for attempt in {1..30}; do
    if "${MC_BIN}" alias set local "${MINIO_ENDPOINT}" "${MINIO_ROOT_USER}" "${MINIO_ROOT_PASSWORD}"; then
      alias_ready=1
      break
    fi
    sleep 1
  done

  # Stop here with a clear message rather than letting `mc mb` fail against
  # an alias that was never configured.
  if [ "${alias_ready}" -ne 1 ]; then
    echo "Failed to configure mc alias for ${MINIO_ENDPOINT}" >&2
    return 1
  fi

  "${MC_BIN}" mb --ignore-existing "local/${MINIO_BUCKET}"
}

# Download the Windows MinIO server binary, start it in the background and
# wait until it reports ready.
#
# The binary and its data directory both live under ${RUNNER_TEMP:-/tmp}.
start_minio_windows() {
  local minio_dir="${RUNNER_TEMP:-/tmp}"
  local minio_bin="${minio_dir}/minio.exe"
  mkdir -p "${minio_dir}"
  # -f makes curl fail on an HTTP error instead of saving the error page as
  # minio.exe.
  curl -fsSL "https://dl.min.io/server/minio/release/windows-amd64/minio.exe" -o "${minio_bin}"
  MINIO_ROOT_USER="${MINIO_ROOT_USER}" MINIO_ROOT_PASSWORD="${MINIO_ROOT_PASSWORD}" \
    "${minio_bin}" server "${minio_dir}/minio-data" --console-address ":${MINIO_CONSOLE_PORT}" &
  wait_for_minio
}

# Choose how to start MinIO for the host operating system, then create the
# test bucket. Linux relies on Docker alone; macOS and Windows try Docker
# first and fall back to a native server binary.
host_os="$(uname -s)"
case "${host_os}" in
  Linux*)
    start_minio_docker
    ;;
  Darwin*)
    start_minio_docker || start_minio_macos
    ;;
  MINGW*|MSYS*|CYGWIN*)
    start_minio_docker || start_minio_windows
    ;;
  *)
    echo "Unsupported OS: ${host_os}" >&2
    exit 1
    ;;
esac

create_bucket
8 changes: 8 additions & 0 deletions cmake_modules/IcebergThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ function(resolve_arrow_dependency)
# Work around undefined symbol: arrow::ipc::ReadSchema(arrow::io::InputStream*, arrow::ipc::DictionaryMemo*)
set(ARROW_IPC ON)
set(ARROW_FILESYSTEM ON)
set(ARROW_S3 ${ICEBERG_S3})
set(ARROW_JSON ON)
set(ARROW_PARQUET ON)
set(ARROW_SIMD_LEVEL "NONE")
Expand Down Expand Up @@ -164,6 +165,13 @@ function(resolve_arrow_dependency)
install(FILES ${arrow_bundled_dependencies_location}
DESTINATION ${ICEBERG_INSTALL_LIBDIR})
endif()

# Arrow's exported static target interface may reference system libraries
# (e.g. OpenSSL, CURL, ZLIB) that consumers need to find.
list(APPEND ICEBERG_SYSTEM_DEPENDENCIES ZLIB)
if(ARROW_S3)
list(APPEND ICEBERG_SYSTEM_DEPENDENCIES OpenSSL CURL)
endif()
else()
set(ARROW_VENDORED FALSE)
find_package(Arrow CONFIG REQUIRED)
Expand Down
12 changes: 12 additions & 0 deletions src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ set(ICEBERG_SOURCES
expression/rewrite_not.cc
expression/strict_metrics_evaluator.cc
expression/term.cc
file_io_registry.cc
file_reader.cc
file_writer.cc
inheritable_metadata.cc
Expand Down Expand Up @@ -176,6 +177,8 @@ add_subdirectory(util)
if(ICEBERG_BUILD_BUNDLE)
set(ICEBERG_BUNDLE_SOURCES
arrow/arrow_fs_file_io.cc
arrow/arrow_s3_file_io.cc
arrow/file_io_register.cc
arrow/metadata_column_util.cc
avro/avro_data_util.cc
avro/avro_direct_decoder.cc
Expand Down Expand Up @@ -241,6 +244,15 @@ if(ICEBERG_BUILD_BUNDLE)
OUTPUTS
ICEBERG_BUNDLE_LIBRARIES)

if(ICEBERG_S3)
foreach(target iceberg_bundle_static iceberg_bundle_shared)
if(TARGET ${target})
target_compile_definitions(${target}
PUBLIC "$<BUILD_INTERFACE:ICEBERG_S3_ENABLED=1>")
endif()
endforeach()
endif()

add_subdirectory(arrow)
add_subdirectory(avro)
add_subdirectory(parquet)
Expand Down
17 changes: 17 additions & 0 deletions src/iceberg/arrow/arrow_file_io.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,31 @@
#pragma once

#include <memory>
#include <string>
#include <unordered_map>

#include "iceberg/file_io.h"
#include "iceberg/iceberg_bundle_export.h"
#include "iceberg/result.h"

namespace iceberg::arrow {

ICEBERG_BUNDLE_EXPORT std::unique_ptr<FileIO> MakeMockFileIO();

ICEBERG_BUNDLE_EXPORT std::unique_ptr<FileIO> MakeLocalFileIO();

/// \brief Create an S3 FileIO backed by Arrow's S3FileSystem.
///
/// This function initializes the S3 subsystem if not already initialized (thread-safe).
/// The S3 initialization is done once per process using std::call_once.
///
/// \param uri An S3 URI (must start with "s3://") used to validate the scheme.
/// \param properties Optional configuration properties for S3 access. See S3Properties
/// for available keys (credentials, region, endpoint, timeouts, etc.).
/// \return A FileIO instance for S3 operations, or an error if S3 is not supported
/// or the URI is invalid.
ICEBERG_BUNDLE_EXPORT Result<std::unique_ptr<FileIO>> MakeS3FileIO(
const std::string& uri,
const std::unordered_map<std::string, std::string>& properties = {});

} // namespace iceberg::arrow
22 changes: 19 additions & 3 deletions src/iceberg/arrow/arrow_fs_file_io.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,23 @@
#include "iceberg/arrow/arrow_file_io.h"
#include "iceberg/arrow/arrow_fs_file_io_internal.h"
#include "iceberg/arrow/arrow_status_internal.h"
#include "iceberg/util/macros.h"

namespace iceberg::arrow {

/// \brief Map a file location onto a path for the underlying Arrow filesystem.
///
/// A location containing the "://" scheme separator is converted with
/// arrow_fs_->PathFromUri(); any other location is returned unchanged.
Result<std::string> ArrowFileSystemFileIO::ResolvePath(const std::string& file_location) {
  const bool has_scheme = file_location.find("://") != std::string::npos;
  if (!has_scheme) {
    return file_location;
  }
  ICEBERG_ARROW_ASSIGN_OR_RETURN(auto path, arrow_fs_->PathFromUri(file_location));
  return path;
}

/// \brief Read the content of the file at the given location.
Result<std::string> ArrowFileSystemFileIO::ReadFile(const std::string& file_location,
std::optional<size_t> length) {
::arrow::fs::FileInfo file_info(file_location);
ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location));
::arrow::fs::FileInfo file_info(path);
if (length.has_value()) {
file_info.set_size(length.value());
}
Expand All @@ -47,6 +57,10 @@ Result<std::string> ArrowFileSystemFileIO::ReadFile(const std::string& file_loca
ICEBERG_ARROW_ASSIGN_OR_RETURN(
auto read_bytes,
file->Read(read_length, reinterpret_cast<uint8_t*>(&content[offset])));
if (read_bytes == 0) {
return IOError("Unexpected EOF reading {}: got {} of {} bytes", file_location,
offset, file_size);
}
remain -= read_bytes;
offset += read_bytes;
}
Expand All @@ -57,7 +71,8 @@ Result<std::string> ArrowFileSystemFileIO::ReadFile(const std::string& file_loca
/// \brief Write the given content to the file at the given location.
Status ArrowFileSystemFileIO::WriteFile(const std::string& file_location,
std::string_view content) {
ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, arrow_fs_->OpenOutputStream(file_location));
ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location));
ICEBERG_ARROW_ASSIGN_OR_RETURN(auto file, arrow_fs_->OpenOutputStream(path));
ICEBERG_ARROW_RETURN_NOT_OK(file->Write(content.data(), content.size()));
ICEBERG_ARROW_RETURN_NOT_OK(file->Flush());
ICEBERG_ARROW_RETURN_NOT_OK(file->Close());
Expand All @@ -66,7 +81,8 @@ Status ArrowFileSystemFileIO::WriteFile(const std::string& file_location,

/// \brief Delete a file at the given location.
///
/// The location is first passed through ResolvePath() and the resulting path
/// is handed to arrow_fs_->DeleteFile(). The delete is issued exactly once,
/// and only with the resolved path.
Status ArrowFileSystemFileIO::DeleteFile(const std::string& file_location) {
  ICEBERG_ASSIGN_OR_RAISE(auto path, ResolvePath(file_location));
  ICEBERG_ARROW_RETURN_NOT_OK(arrow_fs_->DeleteFile(path));
  return {};
}

Expand Down
3 changes: 3 additions & 0 deletions src/iceberg/arrow/arrow_fs_file_io_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ class ICEBERG_BUNDLE_EXPORT ArrowFileSystemFileIO : public FileIO {
const std::shared_ptr<::arrow::fs::FileSystem>& fs() const { return arrow_fs_; }

private:
/// \brief Resolve a file location to a filesystem path.
Result<std::string> ResolvePath(const std::string& file_location);

std::shared_ptr<::arrow::fs::FileSystem> arrow_fs_;
};

Expand Down
Loading
Loading