Skip to content

Behavior inconsistent with Pandas' data type handling #455

@auxten

Description

@auxten

When reading parquet files with datetime columns, chdb returns timezone-aware
datetimes with system timezone applied, while pandas returns timezone-naive datetimes.

This causes:

  1. Value shift (e.g., 00:00:00 becomes 08:00:00+08:00 in Asia/Singapore timezone)
  2. Type mismatch (datetime64[ns] vs datetime64[ns, Asia/Singapore])

Expected behavior: chdb should return the same datetime values as pandas.

#!/usr/bin/env python3
"""

Minimal reproduction of chdb datetime timezone issue.

When reading parquet files with datetime columns, chdb returns timezone-aware
datetimes with system timezone applied, while pandas returns timezone-naive datetimes.

This causes:
1. Value shift (e.g., 00:00:00 becomes 08:00:00+08:00 in Asia/Singapore timezone)
2. Type mismatch (datetime64[ns] vs datetime64[ns, Asia/Singapore])

Expected behavior: chdb should return the same datetime values as pandas.
"""

import chdb
import pandas as pd
import tempfile
import os

print(f"chdb version: {chdb.__version__}")
print(f"pandas version: {pd.__version__}")

# Create test data with datetime column
df = pd.DataFrame({
    'id': [1, 2, 3],
    'dt': pd.to_datetime(['2020-01-01 00:00:00', '2020-01-01 00:00:01', '2020-01-01 00:00:02'])
})

print("\n=== Original DataFrame ===")
print(df)
print(f"dt dtype: {df['dt'].dtype}")

# Save to parquet
with tempfile.NamedTemporaryFile(suffix='.parquet', delete=False) as f:
    parquet_path = f.name
    df.to_parquet(parquet_path)

print(f"\nParquet file: {parquet_path}")

# Read with pandas
print("\n=== pandas.read_parquet() ===")
pd_result = pd.read_parquet(parquet_path)
print(pd_result)
print(f"dt dtype: {pd_result['dt'].dtype}")
print(f"dt values: {pd_result['dt'].tolist()}")

# Read with chdb
print("\n=== chdb.query() ===")
sql = f"SELECT * FROM file('{parquet_path}', 'Parquet')"
chdb_result = chdb.query(sql, output_format='DataFrame')
print(chdb_result)
print(f"dt dtype: {chdb_result['dt'].dtype}")
print(f"dt values: {chdb_result['dt'].tolist()}")

# Compare
print("\n=== Comparison ===")
print(f"Values equal: {(pd_result['dt'] == chdb_result['dt']).all()}")
print(f"Dtypes equal: {pd_result['dt'].dtype == chdb_result['dt'].dtype}")

# Show the actual difference
print("\n=== Value Difference ===")
print(f"pandas[0]: {pd_result['dt'].iloc[0]} (type: {type(pd_result['dt'].iloc[0])})")
print(f"chdb[0]:   {chdb_result['dt'].iloc[0]} (type: {type(chdb_result['dt'].iloc[0])})")

# Workaround: use session_timezone='UTC'
print("\n=== Workaround: session_timezone='UTC' ===")
sql_utc = f"SELECT * FROM file('{parquet_path}', 'Parquet') SETTINGS session_timezone='UTC'"
chdb_utc = chdb.query(sql_utc, output_format='DataFrame')
print(f"dt dtype: {chdb_utc['dt'].dtype}")
print(f"dt values: {chdb_utc['dt'].tolist()}")
print(f"Values match after tz_localize(None): {(pd_result['dt'] == chdb_utc['dt'].dt.tz_localize(None)).all()}")

# Cleanup
os.unlink(parquet_path)

print("\n=== Summary ===")
print("Issue: chdb applies system timezone to datetime values from parquet files,")
print("       causing value shift and type mismatch compared to pandas.")
print("Expected: chdb should return timezone-naive datetimes matching pandas behavior.")

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions