Skip to content

Slow query response for small datasets in chDB #442

@wudidapaopao

Description

@wudidapaopao

In small dataset scenarios, chDB's query performance lags behind both Pandas and DuckDB. For example, in a 1000-row Pandas DataFrame with simple query operations, chDB takes over 1ms on an Apple M4 processor. For reference, DuckDB completes the same tasks in 0.6ms, and Pandas only takes 0.1ms.

Likely, a significant portion of the 1ms overhead in chDB comes from ClickHouse's query plan generation and the Python-to-C++ invocation cost.

#!/usr/bin/env python3
import time
import pandas as pd
import numpy as np
import chdb
import duckdb

def generate_test_data(n_rows: int) -> pd.DataFrame:
    """Generate test DataFrame with various column types."""
    np.random.seed(42)

    df = pd.DataFrame(
        {
            'id': range(n_rows),
            'int_col': np.random.randint(0, 1000, n_rows),
            'float_col': np.random.uniform(0, 1000, n_rows),
            'str_col': np.random.choice(['A', 'B', 'C', 'D', 'E'], n_rows),
            'category': np.random.choice(
                ['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10'], n_rows
            ),
            'bool_col': np.random.choice([True, False], n_rows),
            'date_col': pd.date_range('2020-01-01', periods=n_rows, freq='s')[:n_rows],
        }
    )

    return df

def pandas_filter_single(df: pd.DataFrame) -> pd.DataFrame:
    return df[df['int_col'] > 10000]

def chdb_filter_single(conn, df: pd.DataFrame) -> pd.DataFrame:
    return conn.query("SELECT * FROM Python(df) WHERE int_col > 10000", 'DataFrame')

def duckdb_filter_single(conn, df: pd.DataFrame) -> pd.DataFrame:
    return conn.execute("SELECT * FROM df WHERE int_col > 10000").df()

def main():
    n_rows = 1000
    df = generate_test_data(n_rows)
    print(f"Data rows: {n_rows}")

    # Create connections once
    chdb_conn = chdb.connect()
    duckdb_conn = duckdb.connect()

    # Benchmark pandas
    pandas_times = []
    for _ in range(3):
        start = time.perf_counter()
        pandas_filter_single(df)
        end = time.perf_counter()
        pandas_times.append((end - start) * 1000)
    print(f"pandas: {pandas_times} -> min: {min(pandas_times):.4f} ms") # min: 0.1076 ms

    # Benchmark chdb
    chdb_times = []
    for _ in range(3):
        start = time.perf_counter()
        chdb_filter_single(chdb_conn, df)
        end = time.perf_counter()
        chdb_times.append((end - start) * 1000)
    print(f"chdb:   {chdb_times} -> min: {min(chdb_times):.4f} ms") # min: 1.1853 ms

    # Benchmark duckdb
    duckdb_times = []
    for _ in range(3):
        start = time.perf_counter()
        duckdb_filter_single(duckdb_conn, df)
        end = time.perf_counter()
        duckdb_times.append((end - start) * 1000)
    print(f"duckdb: {duckdb_times} -> min: {min(duckdb_times):.4f} ms") # min: 0.6502 ms

if __name__ == "__main__":
    main()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions