diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 81d788b1a230..b613db4b6ad1 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -65,6 +65,24 @@ pub trait ExprSchemable { -> Result<(DataType, bool)>; } +/// Derives the output field for a cast expression from the source field. +/// For `TryCast`, `force_nullable` is `true` since a failed cast returns NULL. +fn cast_output_field( + source_field: &FieldRef, + target_type: &DataType, + force_nullable: bool, +) -> Arc { + let mut f = source_field + .as_ref() + .clone() + .with_data_type(target_type.clone()) + .with_metadata(source_field.metadata().clone()); + if force_nullable { + f = f.with_nullable(true); + } + Arc::new(f) +} + impl ExprSchemable for Expr { /// Returns the [arrow::datatypes::DataType] of the expression /// based on [ExprSchema] @@ -553,33 +571,25 @@ impl ExprSchemable for Expr { func.return_field_from_args(args) } // _ => Ok((self.get_type(schema)?, self.nullable(schema)?)), - Expr::Cast(Cast { expr, field }) => expr - .to_field(schema) - .map(|(_table_ref, destination_field)| { - // This propagates the nullability of the input rather than - // force the nullability of the destination field. This is - // usually the desired behaviour (i.e., specifying a cast - // destination type usually does not force a user to pick - // nullability, and assuming `true` would prevent the non-nullability - // of the parent expression to make the result eligible for - // optimizations that only apply to non-nullable values). - destination_field - .as_ref() - .clone() - .with_data_type(field.data_type().clone()) - .with_metadata(destination_field.metadata().clone()) + Expr::Cast(Cast { expr, field }) => { + expr.to_field(schema).map(|(_table_ref, src)| { + cast_output_field(&src, field.data_type(), false) }) - .map(Arc::new), + } Expr::Placeholder(Placeholder { id: _, field: Some(field), }) => Ok(Arc::clone(field).renamed(&schema_name)), + Expr::TryCast(TryCast { expr, field }) => { + expr.to_field(schema).map(|(_table_ref, src)| { + cast_output_field(&src, field.data_type(), true) + }) + } Expr::Like(_) | Expr::SimilarTo(_) | Expr::Not(_) | Expr::Between(_) | Expr::Case(_) - | Expr::TryCast(_) | Expr::InList(_) | Expr::InSubquery(_) | Expr::SetComparison(_) diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index f3836b23ec32..11ed4fc632e2 100644 --- a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -220,6 +220,34 @@ FROM table_with_metadata; 2020-09-08 2020-09-08 +# Regression test: CAST should preserve source field metadata +query DT +SELECT + CAST(ts AS DATE) as casted, + arrow_metadata(CAST(ts AS DATE), 'metadata_key') +FROM table_with_metadata; +---- +2020-09-08 ts non-nullable field +2020-09-08 ts non-nullable field +2020-09-08 ts non-nullable field + +# Regression test: CAST preserves metadata on integer column +query IT +SELECT + CAST(id AS BIGINT) as casted, + arrow_metadata(CAST(id AS BIGINT), 'metadata_key') +FROM table_with_metadata; +---- +1 the id field +NULL the id field +3 the id field + +# Regression test: CAST with single-argument arrow_metadata (returns full map) +query ? +select arrow_metadata(CAST(id AS BIGINT)) from table_with_metadata limit 1; +---- +{metadata_key: the id field} + # Regression test: distinct with cast query D SELECT DISTINCT (ts::DATE) AS dist @@ -347,5 +375,44 @@ select arrow_metadata(id) from table_with_metadata limit 1; ---- {metadata_key: the id field} +# Regression test: TRY_CAST should preserve source field metadata +query DT +SELECT + TRY_CAST(ts AS DATE) as try_casted, + arrow_metadata(TRY_CAST(ts AS DATE), 'metadata_key') +FROM table_with_metadata; +---- +2020-09-08 ts non-nullable field +2020-09-08 ts non-nullable field +2020-09-08 ts non-nullable field + +# Regression test: TRY_CAST preserves metadata on integer column +query IT +SELECT + TRY_CAST(id AS BIGINT) as try_casted, + arrow_metadata(TRY_CAST(id AS BIGINT), 'metadata_key') +FROM table_with_metadata; +---- +1 the id field +NULL the id field +3 the id field + +# Regression test: TRY_CAST preserves metadata even when cast fails (returns NULL) +query IT +SELECT + TRY_CAST(name AS INT) as try_casted, + arrow_metadata(TRY_CAST(name AS INT), 'metadata_key') +FROM table_with_metadata; +---- +NULL the name field +NULL the name field +NULL the name field + +# Regression test: TRY_CAST with single-argument arrow_metadata (returns full map) +query ? +select arrow_metadata(TRY_CAST(id AS BIGINT)) from table_with_metadata limit 1; +---- +{metadata_key: the id field} + statement ok drop table table_with_metadata;