Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ protected boolean supportsBatchReads() {
return false;
}

/**
 * Hook for subclasses to declare whether they support all types in the given schema. The default
 * implementation accepts every schema. When a subclass returns false, the parameterized test for
 * that schema is skipped via an assumption ({@code assumeThat(...).isTrue()}) rather than failing.
 *
 * @param schema the schema the parameterized test case will write and read
 * @return whether the engine under test handles every type in {@code schema}
 */
protected boolean supports(Schema schema) {
  return true;
}

Comment on lines +80 to +87
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We put all types in a single schema, and if any type is not supported by the engine, the entire test case will be skipped, which means the validation of other types will also be skipped. Is this granularity too coarse?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree.
We should have a very clear understanding what is supported and what is not supported.
If an engine does not support a type, then it should be handled and highlighted in the engine test.
We generally want to find all of the features which are not supported by all of the FileFormats, and make sure they become supported. So as a first rule, we should try to implement the feature in the formats, and as a second rule we can create an exception in the main test in the MISSING_FEATURES.

private static final FileFormat[] FILE_FORMATS =
new FileFormat[] {FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC};

Expand Down Expand Up @@ -131,6 +139,7 @@ void after() {
void testDataWriterEngineWriteGenericRead(FileFormat fileFormat, DataGenerator dataGenerator)
throws IOException {
Schema schema = dataGenerator.schema();
assumeThat(supports(schema)).isTrue();
FileWriterBuilder<DataWriter<T>, Object> writerBuilder =
FormatModelRegistry.dataWriteBuilder(fileFormat, engineType(), encryptedFile);

Expand Down Expand Up @@ -163,6 +172,7 @@ void testDataWriterEngineWriteGenericRead(FileFormat fileFormat, DataGenerator d
void testDataWriterEngineWriteWithoutEngineSchema(
FileFormat fileFormat, DataGenerator dataGenerator) throws IOException {
Schema schema = dataGenerator.schema();
assumeThat(supports(schema)).isTrue();
FileWriterBuilder<DataWriter<T>, Object> writerBuilder =
FormatModelRegistry.dataWriteBuilder(fileFormat, engineType(), encryptedFile);

Expand Down Expand Up @@ -190,6 +200,7 @@ void testDataWriterEngineWriteWithoutEngineSchema(
void testDataWriterGenericWriteEngineRead(FileFormat fileFormat, DataGenerator dataGenerator)
throws IOException {
Schema schema = dataGenerator.schema();
assumeThat(supports(schema)).isTrue();

List<Record> genericRecords = dataGenerator.generateRecords();
writeGenericRecords(fileFormat, schema, genericRecords);
Expand All @@ -213,6 +224,7 @@ void testDataWriterGenericWriteEngineRead(FileFormat fileFormat, DataGenerator d
void testEqualityDeleteWriterEngineWriteGenericRead(
FileFormat fileFormat, DataGenerator dataGenerator) throws IOException {
Schema schema = dataGenerator.schema();
assumeThat(supports(schema)).isTrue();
FileWriterBuilder<EqualityDeleteWriter<T>, Object> writerBuilder =
FormatModelRegistry.equalityDeleteWriteBuilder(fileFormat, engineType(), encryptedFile);

Expand Down Expand Up @@ -250,6 +262,7 @@ void testEqualityDeleteWriterEngineWriteGenericRead(
void testEqualityDeleteWriterEngineWriteWithoutEngineSchema(
FileFormat fileFormat, DataGenerator dataGenerator) throws IOException {
Schema schema = dataGenerator.schema();
assumeThat(supports(schema)).isTrue();
FileWriterBuilder<EqualityDeleteWriter<T>, Object> writerBuilder =
FormatModelRegistry.equalityDeleteWriteBuilder(fileFormat, engineType(), encryptedFile);

Expand Down Expand Up @@ -283,6 +296,7 @@ void testEqualityDeleteWriterEngineWriteWithoutEngineSchema(
void testEqualityDeleteWriterGenericWriteEngineRead(
FileFormat fileFormat, DataGenerator dataGenerator) throws IOException {
Schema schema = dataGenerator.schema();
assumeThat(supports(schema)).isTrue();
FileWriterBuilder<EqualityDeleteWriter<Record>, Object> writerBuilder =
FormatModelRegistry.equalityDeleteWriteBuilder(fileFormat, Record.class, encryptedFile);

Expand Down
44 changes: 43 additions & 1 deletion data/src/test/java/org/apache/iceberg/data/DataGenerators.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,52 @@
*/
class DataGenerators {

static final DataGenerator[] ALL = new DataGenerator[] {new StructOfPrimitive()};
static final DataGenerator[] ALL =
new DataGenerator[] {new StructOfPrimitive(), new DefaultSchema(), new AllTypes()};
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If AllTypes already covers all types, then I think both DefaultSchema and StructOfPrimitive are no longer needed. WDYT?


private DataGenerators() {}

static class AllTypes implements DataGenerator {
private final Schema schema =
new Schema(
Types.NestedField.required(1, "boolean_col", Types.BooleanType.get()),
Types.NestedField.required(2, "int_col", Types.IntegerType.get()),
Types.NestedField.required(3, "long_col", Types.LongType.get()),
Types.NestedField.required(4, "float_col", Types.FloatType.get()),
Types.NestedField.required(5, "double_col", Types.DoubleType.get()),
Types.NestedField.required(6, "decimal_col", Types.DecimalType.of(9, 2)),
Types.NestedField.required(7, "date_col", Types.DateType.get()),
Types.NestedField.required(8, "time_col", Types.TimeType.get()),
Types.NestedField.required(9, "timestamp_col", Types.TimestampType.withoutZone()),
Types.NestedField.required(10, "timestamp_tz_col", Types.TimestampType.withZone()),
Types.NestedField.required(11, "string_col", Types.StringType.get()),
Types.NestedField.required(12, "uuid_col", Types.UUIDType.get()),
Types.NestedField.required(13, "fixed_col", Types.FixedType.ofLength(16)),
Types.NestedField.required(14, "binary_col", Types.BinaryType.get()),
Types.NestedField.required(
15, "list_col", Types.ListType.ofRequired(16, Types.StringType.get())),
Types.NestedField.required(
17,
"map_col",
Types.MapType.ofRequired(18, 19, Types.StringType.get(), Types.IntegerType.get())),
Types.NestedField.required(
20,
"struct_col",
Types.StructType.of(
Types.NestedField.required(21, "nested_int", Types.IntegerType.get()),
Types.NestedField.required(22, "nested_string", Types.StringType.get()))));
Comment on lines +60 to +65
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we also add the variant type?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also about other v3 table type? TIMESTAMP_NANO,UNKNOWN and Geometry

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should create a different schema and test for them if they are currently only supported by Parquet


@Override
public Schema schema() {
  // Exposes the fixed schema covering one column per supported Iceberg type.
  return schema;
}

@Override
public String toString() {
  // NOTE(review): presumably used as the display name for parameterized test cases — confirm.
  return "AllTypes";
}
}

static class StructOfPrimitive implements DataGenerator {
private final Schema schema =
new Schema(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.data;

import java.util.List;
import org.apache.iceberg.Schema;

/**
 * Format-model TCK run for Iceberg's generic in-memory model, where the engine row type is
 * Iceberg's own {@link Record}. All conversions are identity operations because generic records
 * are already in the Iceberg data model.
 */
public class TestGenericFormatModel extends BaseFormatModelTests<Record> {

  @Override
  protected Class<Record> engineType() {
    return Record.class;
  }

  @Override
  protected Object engineSchema(Schema schema) {
    return null; // Not needed for Generic Record
  }

  @Override
  protected Record convertToEngine(Record record, Schema schema) {
    return record; // Generic Record is already in engine format
  }

  @Override
  protected void assertEquals(Schema schema, List<Record> expected, List<Record> actual) {
    // Delegates to the shared struct-wise comparison used across data tests.
    DataTestHelpers.assertEquals(schema.asStruct(), expected, actual);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@

public class TestFlinkFormatModel extends BaseFormatModelTests<RowData> {

@Override
protected boolean supports(Schema schema) {
// Flink fails on Time and UUID for some formats in TCK currently
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it is just some types, could we create a PR to fix them?
Then we would not need to create an extra filter for them

return schema.columns().stream()
.noneMatch(
c ->
c.type().typeId() == org.apache.iceberg.types.Type.TypeID.TIME
|| c.type().typeId() == org.apache.iceberg.types.Type.TypeID.UUID);
}

@Override
protected Class<RowData> engineType() {
return RowData.class;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@

public class TestFlinkFormatModel extends BaseFormatModelTests<RowData> {

@Override
protected boolean supports(Schema schema) {
  // Flink currently fails on TIME and UUID types for some formats in the TCK. Walk the full
  // type tree rather than only the top-level columns, so TIME or UUID nested inside structs,
  // lists, or maps is detected as well.
  return schema.columns().stream().noneMatch(column -> containsUnsupportedType(column.type()));
}

/** Returns true if the given type, or any type nested within it, is TIME or UUID. */
private static boolean containsUnsupportedType(org.apache.iceberg.types.Type type) {
  org.apache.iceberg.types.Type.TypeID typeId = type.typeId();
  if (typeId == org.apache.iceberg.types.Type.TypeID.TIME
      || typeId == org.apache.iceberg.types.Type.TypeID.UUID) {
    return true;
  }

  if (type.isNestedType()) {
    return type.asNestedType().fields().stream()
        .anyMatch(field -> containsUnsupportedType(field.type()));
  }

  return false;
}

@Override
protected Class<RowData> engineType() {
return RowData.class;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@

public class TestFlinkFormatModel extends BaseFormatModelTests<RowData> {

@Override
protected boolean supports(Schema schema) {
  // Flink currently fails on TIME and UUID types for some formats in the TCK. Walk the full
  // type tree rather than only the top-level columns, so TIME or UUID nested inside structs,
  // lists, or maps is detected as well.
  return schema.columns().stream().noneMatch(column -> containsUnsupportedType(column.type()));
}

/** Returns true if the given type, or any type nested within it, is TIME or UUID. */
private static boolean containsUnsupportedType(org.apache.iceberg.types.Type type) {
  org.apache.iceberg.types.Type.TypeID typeId = type.typeId();
  if (typeId == org.apache.iceberg.types.Type.TypeID.TIME
      || typeId == org.apache.iceberg.types.Type.TypeID.UUID) {
    return true;
  }

  if (type.isNestedType()) {
    return type.asNestedType().fields().stream()
        .anyMatch(field -> containsUnsupportedType(field.type()));
  }

  return false;
}

@Override
protected Class<RowData> engineType() {
return RowData.class;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@

public class TestSparkFormatModel extends BaseFormatModelTests<InternalRow> {

@Override
protected boolean supports(Schema schema) {
// Spark does not support Time types
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we just create a method for this instead of using supports(Schema)?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Like: supportsTime() abstract method in BaseFormatModelTests

return schema.columns().stream()
.noneMatch(c -> c.type().typeId() == org.apache.iceberg.types.Type.TypeID.TIME);
}

@Override
protected Class<InternalRow> engineType() {
return InternalRow.class;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@

public class TestSparkFormatModel extends BaseFormatModelTests<InternalRow> {

@Override
protected boolean supports(Schema schema) {
  // Spark does not support the TIME type. Walk the full type tree rather than only the
  // top-level columns, so TIME nested inside structs, lists, or maps is detected as well.
  return schema.columns().stream().noneMatch(column -> containsTime(column.type()));
}

/** Returns true if the given type, or any type nested within it, is TIME. */
private static boolean containsTime(org.apache.iceberg.types.Type type) {
  if (type.typeId() == org.apache.iceberg.types.Type.TypeID.TIME) {
    return true;
  }

  if (type.isNestedType()) {
    return type.asNestedType().fields().stream().anyMatch(field -> containsTime(field.type()));
  }

  return false;
}

@Override
protected Class<InternalRow> engineType() {
return InternalRow.class;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@

public class TestSparkFormatModel extends BaseFormatModelTests<InternalRow> {

@Override
protected boolean supports(Schema schema) {
  // Spark does not support the TIME type. Walk the full type tree rather than only the
  // top-level columns, so TIME nested inside structs, lists, or maps is detected as well.
  return schema.columns().stream().noneMatch(column -> containsTime(column.type()));
}

/** Returns true if the given type, or any type nested within it, is TIME. */
private static boolean containsTime(org.apache.iceberg.types.Type type) {
  if (type.typeId() == org.apache.iceberg.types.Type.TypeID.TIME) {
    return true;
  }

  if (type.isNestedType()) {
    return type.asNestedType().fields().stream().anyMatch(field -> containsTime(field.type()));
  }

  return false;
}

@Override
protected Class<InternalRow> engineType() {
return InternalRow.class;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@

public class TestSparkFormatModel extends BaseFormatModelTests<InternalRow> {

@Override
protected boolean supports(Schema schema) {
  // Spark does not support the TIME type. Walk the full type tree rather than only the
  // top-level columns, so TIME nested inside structs, lists, or maps is detected as well.
  return schema.columns().stream().noneMatch(column -> containsTime(column.type()));
}

/** Returns true if the given type, or any type nested within it, is TIME. */
private static boolean containsTime(org.apache.iceberg.types.Type type) {
  if (type.typeId() == org.apache.iceberg.types.Type.TypeID.TIME) {
    return true;
  }

  if (type.isNestedType()) {
    return type.asNestedType().fields().stream().anyMatch(field -> containsTime(field.type()));
  }

  return false;
}

@Override
protected Class<InternalRow> engineType() {
return InternalRow.class;
Expand Down
Loading