-
-
Notifications
You must be signed in to change notification settings - Fork 95
Closed
Description
When reading parquet files with datetime columns, chdb returns timezone-aware
datetimes with system timezone applied, while pandas returns timezone-naive datetimes.
This causes:
- Value shift (e.g., 00:00:00 becomes 08:00:00+08:00 in Asia/Singapore timezone)
- Type mismatch (datetime64[ns] vs datetime64[ns, Asia/Singapore])
Expected behavior: chdb should return the same datetime values as pandas.
#!/usr/bin/env python3
"""
Minimal reproduction of chdb datetime timezone issue.
When reading parquet files with datetime columns, chdb returns timezone-aware
datetimes with system timezone applied, while pandas returns timezone-naive datetimes.
This causes:
1. Value shift (e.g., 00:00:00 becomes 08:00:00+08:00 in Asia/Singapore timezone)
2. Type mismatch (datetime64[ns] vs datetime64[ns, Asia/Singapore])
Expected behavior: chdb should return the same datetime values as pandas.
"""
import chdb
import pandas as pd
import tempfile
import os
print(f"chdb version: {chdb.__version__}")
print(f"pandas version: {pd.__version__}")
# Create test data with datetime column
df = pd.DataFrame({
'id': [1, 2, 3],
'dt': pd.to_datetime(['2020-01-01 00:00:00', '2020-01-01 00:00:01', '2020-01-01 00:00:02'])
})
print("\n=== Original DataFrame ===")
print(df)
print(f"dt dtype: {df['dt'].dtype}")
# Save to parquet
with tempfile.NamedTemporaryFile(suffix='.parquet', delete=False) as f:
parquet_path = f.name
df.to_parquet(parquet_path)
print(f"\nParquet file: {parquet_path}")
# Read with pandas
print("\n=== pandas.read_parquet() ===")
pd_result = pd.read_parquet(parquet_path)
print(pd_result)
print(f"dt dtype: {pd_result['dt'].dtype}")
print(f"dt values: {pd_result['dt'].tolist()}")
# Read with chdb
print("\n=== chdb.query() ===")
sql = f"SELECT * FROM file('{parquet_path}', 'Parquet')"
chdb_result = chdb.query(sql, output_format='DataFrame')
print(chdb_result)
print(f"dt dtype: {chdb_result['dt'].dtype}")
print(f"dt values: {chdb_result['dt'].tolist()}")
# Compare
print("\n=== Comparison ===")
print(f"Values equal: {(pd_result['dt'] == chdb_result['dt']).all()}")
print(f"Dtypes equal: {pd_result['dt'].dtype == chdb_result['dt'].dtype}")
# Show the actual difference
print("\n=== Value Difference ===")
print(f"pandas[0]: {pd_result['dt'].iloc[0]} (type: {type(pd_result['dt'].iloc[0])})")
print(f"chdb[0]: {chdb_result['dt'].iloc[0]} (type: {type(chdb_result['dt'].iloc[0])})")
# Workaround: use session_timezone='UTC'
print("\n=== Workaround: session_timezone='UTC' ===")
sql_utc = f"SELECT * FROM file('{parquet_path}', 'Parquet') SETTINGS session_timezone='UTC'"
chdb_utc = chdb.query(sql_utc, output_format='DataFrame')
print(f"dt dtype: {chdb_utc['dt'].dtype}")
print(f"dt values: {chdb_utc['dt'].tolist()}")
print(f"Values match after tz_localize(None): {(pd_result['dt'] == chdb_utc['dt'].dt.tz_localize(None)).all()}")
# Cleanup
os.unlink(parquet_path)
print("\n=== Summary ===")
print("Issue: chdb applies system timezone to datetime values from parquet files,")
print(" causing value shift and type mismatch compared to pandas.")
print("Expected: chdb should return timezone-naive datetimes matching pandas behavior.")Metadata
Metadata
Assignees
Labels
No labels