Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 27 additions & 17 deletions datafusion/expr/src/expr_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,24 @@ pub trait ExprSchemable {
-> Result<(DataType, bool)>;
}

/// Derives the output field for a cast expression from the source field.
/// For `TryCast`, `force_nullable` is `true` since a failed cast returns NULL.
fn cast_output_field(
source_field: &FieldRef,
target_type: &DataType,
force_nullable: bool,
) -> Arc<Field> {
let mut f = source_field
.as_ref()
.clone()
.with_data_type(target_type.clone())
.with_metadata(source_field.metadata().clone());
if force_nullable {
f = f.with_nullable(true);
}
Arc::new(f)
}

impl ExprSchemable for Expr {
/// Returns the [arrow::datatypes::DataType] of the expression
/// based on [ExprSchema]
Expand Down Expand Up @@ -553,33 +571,25 @@ impl ExprSchemable for Expr {
func.return_field_from_args(args)
}
// _ => Ok((self.get_type(schema)?, self.nullable(schema)?)),
Expr::Cast(Cast { expr, field }) => expr
.to_field(schema)
.map(|(_table_ref, destination_field)| {
// This propagates the nullability of the input rather than
// force the nullability of the destination field. This is
// usually the desired behaviour (i.e., specifying a cast
// destination type usually does not force a user to pick
// nullability, and assuming `true` would prevent the non-nullability
// of the parent expression to make the result eligible for
// optimizations that only apply to non-nullable values).
destination_field
.as_ref()
.clone()
.with_data_type(field.data_type().clone())
.with_metadata(destination_field.metadata().clone())
Expr::Cast(Cast { expr, field }) => {
expr.to_field(schema).map(|(_table_ref, src)| {
cast_output_field(&src, field.data_type(), false)
})
.map(Arc::new),
}
Expr::Placeholder(Placeholder {
id: _,
field: Some(field),
}) => Ok(Arc::clone(field).renamed(&schema_name)),
Expr::TryCast(TryCast { expr, field }) => {
expr.to_field(schema).map(|(_table_ref, src)| {
cast_output_field(&src, field.data_type(), true)
})
}
Expr::Like(_)
| Expr::SimilarTo(_)
| Expr::Not(_)
| Expr::Between(_)
| Expr::Case(_)
| Expr::TryCast(_)
| Expr::InList(_)
| Expr::InSubquery(_)
| Expr::SetComparison(_)
Expand Down
67 changes: 67 additions & 0 deletions datafusion/sqllogictest/test_files/metadata.slt
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,34 @@ FROM table_with_metadata;
2020-09-08
2020-09-08

# Regression test: CAST should preserve source field metadata
query DT
SELECT
CAST(ts AS DATE) as casted,
arrow_metadata(CAST(ts AS DATE), 'metadata_key')
FROM table_with_metadata;
----
2020-09-08 ts non-nullable field
2020-09-08 ts non-nullable field
2020-09-08 ts non-nullable field

# Regression test: CAST preserves metadata on integer column
query IT
SELECT
CAST(id AS BIGINT) as casted,
arrow_metadata(CAST(id AS BIGINT), 'metadata_key')
FROM table_with_metadata;
----
1 the id field
NULL the id field
3 the id field

# Regression test: CAST with single-argument arrow_metadata (returns full map)
query ?
select arrow_metadata(CAST(id AS BIGINT)) from table_with_metadata limit 1;
----
{metadata_key: the id field}

# Regression test: distinct with cast
query D
SELECT DISTINCT (ts::DATE) AS dist
Expand Down Expand Up @@ -347,5 +375,44 @@ select arrow_metadata(id) from table_with_metadata limit 1;
----
{metadata_key: the id field}

# Regression test: TRY_CAST should preserve source field metadata
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new keyed arrow_metadata(..., 'metadata_key') cases look great and cover the regression nicely 👍

As a small follow up, it might be worth adding one case for arrow_metadata(TRY_CAST(...)) using the single argument form too. Since that path returns the full map and this file already exercises both forms for plain columns, it could help round out the coverage.

query DT
SELECT
TRY_CAST(ts AS DATE) as try_casted,
arrow_metadata(TRY_CAST(ts AS DATE), 'metadata_key')
FROM table_with_metadata;
----
2020-09-08 ts non-nullable field
2020-09-08 ts non-nullable field
2020-09-08 ts non-nullable field

# Regression test: TRY_CAST preserves metadata on integer column
query IT
SELECT
TRY_CAST(id AS BIGINT) as try_casted,
arrow_metadata(TRY_CAST(id AS BIGINT), 'metadata_key')
FROM table_with_metadata;
----
1 the id field
NULL the id field
3 the id field

# Regression test: TRY_CAST preserves metadata even when cast fails (returns NULL)
query IT
SELECT
TRY_CAST(name AS INT) as try_casted,
arrow_metadata(TRY_CAST(name AS INT), 'metadata_key')
FROM table_with_metadata;
----
NULL the name field
NULL the name field
NULL the name field

# Regression test: TRY_CAST with single-argument arrow_metadata (returns full map)
query ?
select arrow_metadata(TRY_CAST(id AS BIGINT)) from table_with_metadata limit 1;
----
{metadata_key: the id field}

statement ok
drop table table_with_metadata;
Loading