-
-
Notifications
You must be signed in to change notification settings - Fork 95
Open
Description
In small dataset scenarios, chDB's query performance lags behind both Pandas and DuckDB. For example, in a 1000-row Pandas DataFrame with simple query operations, chDB takes over 1ms on an Apple M4 processor. For reference, DuckDB completes the same tasks in 0.6ms, and Pandas only takes 0.1ms.
Likely, a significant portion of the 1ms overhead in chDB comes from ClickHouse's query plan generation and the Python-to-C++ invocation cost.
#!/usr/bin/env python3
import time
import pandas as pd
import numpy as np
import chdb
import duckdb
def generate_test_data(n_rows: int) -> pd.DataFrame:
"""Generate test DataFrame with various column types."""
np.random.seed(42)
df = pd.DataFrame(
{
'id': range(n_rows),
'int_col': np.random.randint(0, 1000, n_rows),
'float_col': np.random.uniform(0, 1000, n_rows),
'str_col': np.random.choice(['A', 'B', 'C', 'D', 'E'], n_rows),
'category': np.random.choice(
['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10'], n_rows
),
'bool_col': np.random.choice([True, False], n_rows),
'date_col': pd.date_range('2020-01-01', periods=n_rows, freq='s')[:n_rows],
}
)
return df
def pandas_filter_single(df: pd.DataFrame) -> pd.DataFrame:
return df[df['int_col'] > 10000]
def chdb_filter_single(conn, df: pd.DataFrame) -> pd.DataFrame:
return conn.query("SELECT * FROM Python(df) WHERE int_col > 10000", 'DataFrame')
def duckdb_filter_single(conn, df: pd.DataFrame) -> pd.DataFrame:
return conn.execute("SELECT * FROM df WHERE int_col > 10000").df()
def main():
n_rows = 1000
df = generate_test_data(n_rows)
print(f"Data rows: {n_rows}")
# Create connections once
chdb_conn = chdb.connect()
duckdb_conn = duckdb.connect()
# Benchmark pandas
pandas_times = []
for _ in range(3):
start = time.perf_counter()
pandas_filter_single(df)
end = time.perf_counter()
pandas_times.append((end - start) * 1000)
print(f"pandas: {pandas_times} -> min: {min(pandas_times):.4f} ms") # min: 0.1076 ms
# Benchmark chdb
chdb_times = []
for _ in range(3):
start = time.perf_counter()
chdb_filter_single(chdb_conn, df)
end = time.perf_counter()
chdb_times.append((end - start) * 1000)
print(f"chdb: {chdb_times} -> min: {min(chdb_times):.4f} ms") # min: 1.1853 ms
# Benchmark duckdb
duckdb_times = []
for _ in range(3):
start = time.perf_counter()
duckdb_filter_single(duckdb_conn, df)
end = time.perf_counter()
duckdb_times.append((end - start) * 1000)
print(f"duckdb: {duckdb_times} -> min: {min(duckdb_times):.4f} ms") # min: 0.6502 ms
if __name__ == "__main__":
main()Metadata
Metadata
Assignees
Labels
No labels