[SPARK-55596][SQL] DSV2 Enhanced Partition Stats Filtering #54459
Open
szehon-ho wants to merge 1 commit into apache:master from szehon-ho:partition_filter
+590 −12
66 changes: 66 additions & 0 deletions
...t/src/main/java/org/apache/spark/sql/connector/expressions/filter/PartitionPredicate.java
```java
package org.apache.spark.sql.connector.expressions.filter;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.expressions.Expression;

/**
 * Represents a partition filter expression (an expression targeting only the schema of
 * {@link org.apache.spark.sql.connector.catalog.Table#partitioning()}).
 * <p>
 * This can be used to evaluate individual partition keys against this partition expression
 * via {@link #accept(InternalRow)}.
 * </p>
 * @since 4.2.0
 */
public abstract class PartitionPredicate extends Predicate {

  /**
   * Default predicate name for partition predicates.
   */
  public static final String NAME = "PartitionPredicate";

  public PartitionPredicate(String name, Expression[] children) {
    super(name, children);
  }

  /**
   * Evaluates this predicate against a single partition's keys.
   *
   * @param partitionKey keys of a single partition, representing the values of the partition
   *                     corresponding to
   *                     {@link org.apache.spark.sql.connector.catalog.Table#partitioning()}
   * @return true if the partition matches this partition expression.
   */
  public abstract boolean accept(InternalRow partitionKey);

  /**
   * Returns the ordinal position(s) of the partition transform(s) in
   * {@link org.apache.spark.sql.connector.catalog.Table#partitioning()} that are
   * referenced by this partition filter expression.
   *
   * <p><b>Example:</b> Suppose {@code Table.partitioning()} returns three partition
   * transforms: {@code [years(ts), months(ts), bucket(32, id)]} with ordinals 0, 1, 2.
   * <ul>
   *   <li>A filter expression {@code years(ts) = 2026} returns {@code [0]}.</li>
   *   <li>A filter expression {@code years(ts) = 2026 and months(ts) = 01}
   *   returns {@code [0, 1]}.</li>
   *   <li>A filter expression {@code bucket(32, id) = 1} returns {@code [2]}.</li>
   * </ul>
   * <p>
   * Data sources can use this to evaluate PartitionPredicates pushed down by
   * {@link org.apache.spark.sql.connector.read.SupportsRuntimeV2Filtering#filter(Predicate[])}
   * to determine whether the PartitionPredicate can be satisfied completely,
   * or whether it must be returned to Spark for post-scan filtering.
   * <p>
   * For example, data sources supporting partition spec evolution
   * should return PartitionPredicates that reference later-added partition
   * transforms (for which data in the table is incompletely partitioned)
   * to Spark for post-scan filtering. Initially-added partition transforms
   * (for which data in the table is completely partitioned) do not need to be returned
   * for post-scan filtering.
   *
   * @return array of 0-based ordinals for the transform(s) in
   * {@link org.apache.spark.sql.connector.catalog.Table#partitioning()} referenced by this
   * PartitionPredicate's partition filter expression.
   */
  public abstract int[] referencedPartitionColumnOrdinals();
}
```
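To make the contract concrete, here is a minimal sketch of a connector-side implementation covering the `years(ts)` example from the javadoc above. The class name and fields are hypothetical, and it assumes the `years(ts)` partition key is materialized as an int at a known ordinal of the partition key row:

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.expressions.Expression
import org.apache.spark.sql.connector.expressions.filter.PartitionPredicate

// Hypothetical predicate for years(ts) = <year>, where the years(ts) key sits
// at `ordinal` in both Table.partitioning() and the partition key row.
class YearsEqualPredicate(ordinal: Int, year: Int)
  extends PartitionPredicate(PartitionPredicate.NAME, Array.empty[Expression]) {

  // A partition matches when its years(ts) key is non-null and equals `year`.
  override def accept(partitionKey: InternalRow): Boolean =
    !partitionKey.isNullAt(ordinal) && partitionKey.getInt(ordinal) == year

  // Exactly one transform in Table.partitioning() is referenced.
  override def referencedPartitionColumnOrdinals(): Array[Int] = Array(ordinal)
}
```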
86 changes: 86 additions & 0 deletions
...alyst/src/main/scala/org/apache/spark/sql/internal/connector/PartitionPredicateImpl.scala
```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.internal.connector

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.connector.expressions.filter.PartitionPredicate

/**
 * An implementation of [[PartitionPredicate]] that wraps a Catalyst Expression representing a
 * partition filter.
 * <p>
 * Supporting data sources receive these via
 * [[org.apache.spark.sql.connector.read.SupportsPushDownV2Filters#pushPredicates pushPredicates]]
 * and may use them for partition filtering.
 */
class PartitionPredicateImpl(
    private val catalystExpression: Expression,
    private val partitionSchema: Seq[AttributeReference])
  extends PartitionPredicate(
    PartitionPredicate.NAME,
    org.apache.spark.sql.connector.expressions.Expression.EMPTY_EXPRESSION) with Logging {

  /** The wrapped partition filter Catalyst Expression. */
  def expression: Expression = catalystExpression

  override def toString(): String =
    s"PartitionPredicate(${catalystExpression.sql})"

  override def accept(partitionValues: InternalRow): Boolean = {
    // Defensive checks: if the predicate cannot be evaluated, conservatively
    // include the partition (return true) rather than silently dropping data.
    if (partitionSchema.isEmpty) {
      logWarning(s"Cannot evaluate partition predicate ${catalystExpression.sql}: " +
        s"partition schema is empty, including partition")
      return true
    }
    if (partitionValues.numFields != partitionSchema.length) {
      logWarning(s"Cannot evaluate partition predicate ${catalystExpression.sql}: " +
        s"partition value field count (${partitionValues.numFields}) does not match schema " +
        s"(${partitionSchema.length}), including partition")
      return true
    }
    val refNames = catalystExpression.references.map(_.name).toSet
    val partitionNames = partitionSchema.map(_.name).toSet
    if (!refNames.subsetOf(partitionNames)) {
      logWarning(s"Cannot evaluate partition predicate ${catalystExpression.sql}: " +
        s"expression references ${refNames.mkString(", ")} not all in partition columns " +
        s"${partitionNames.mkString(", ")}, including partition")
      return true
    }

    // Evaluate the Catalyst partition filter expression against the partition
    // values, binding each attribute to its ordinal in the partition schema.
    try {
      val boundExpr = catalystExpression.transform {
        case a: AttributeReference =>
          val index = partitionSchema.indexWhere(_.name == a.name)
          BoundReference(index, partitionSchema(index).dataType, nullable = true)
      }
      val boundPredicate = Predicate.createInterpreted(boundExpr)
      boundPredicate.eval(partitionValues)
    } catch {
      case e: Exception =>
        logWarning(s"Failed to evaluate partition predicate ${catalystExpression.sql}, " +
          s"including partition", e)
        true
    }
  }

  override def referencedPartitionColumnOrdinals(): Array[Int] = Array.empty[Int]
}
```
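A minimal usage sketch (values assumed for illustration, not taken from the PR's tests): wrap the Catalyst filter `p = 2026` over a single-column partition schema and evaluate it against individual partition key rows.

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Literal}
import org.apache.spark.sql.internal.connector.PartitionPredicateImpl
import org.apache.spark.sql.types.IntegerType

// Partition schema with one attribute `p`, and the filter p = 2026.
val p = AttributeReference("p", IntegerType)()
val predicate = new PartitionPredicateImpl(EqualTo(p, Literal(2026)), Seq(p))

predicate.accept(InternalRow(2026))  // true: the partition matches
predicate.accept(InternalRow(2025))  // false: the partition can be pruned
```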
128 changes: 128 additions & 0 deletions
...t/scala/org/apache/spark/sql/connector/catalog/InMemoryEnhancedPartitionFilterTable.scala
```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.connector.catalog

import java.util

import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper
import org.apache.spark.sql.connector.expressions.Transform
import org.apache.spark.sql.connector.expressions.filter.PartitionPredicate
import org.apache.spark.sql.connector.expressions.filter.Predicate
import org.apache.spark.sql.connector.read.{InputPartition, Scan, ScanBuilder, SupportsPushDownRequiredColumns, SupportsPushDownV2Filters}
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.util.ArrayImplicits._

/**
 * In-memory table whose scan builder implements enhanced partition filtering using
 * PartitionPredicates pushed in a second pass.
 */
class InMemoryEnhancedPartitionFilterTable(
    name: String,
    columns: Array[Column],
    partitioning: Array[Transform],
    properties: util.Map[String, String])
  extends InMemoryTable(name, columns, partitioning, properties) {

  override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = {
    new InMemoryEnhancedPartitionFilterScanBuilder(schema())
  }

  override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = {
    InMemoryBaseTable.maybeSimulateFailedTableWrite(new CaseInsensitiveStringMap(properties))
    InMemoryBaseTable.maybeSimulateFailedTableWrite(info.options)
    new InMemoryWriterBuilderWithOverWrite(info)
  }

  class InMemoryEnhancedPartitionFilterScanBuilder(tableSchema: StructType)
    extends ScanBuilder
    with SupportsPushDownV2Filters
    with SupportsPushDownRequiredColumns {

    private var readSchema: StructType = tableSchema
    private var partitionPredicates: Array[PartitionPredicate] = Array.empty
    private var firstPassPushedPredicates: Array[Predicate] = Array.empty
    private var _pushedPredicates: Array[Predicate] = Array.empty

    override def supportsEnhancedPartitionFiltering(): Boolean = true

    override def pushPredicates(predicates: Array[Predicate]): Array[Predicate] = {
      val partitionOnly = predicates.filter(_.isInstanceOf[PartitionPredicate])
      if (partitionOnly.nonEmpty) {
        // Second call: partition-only predicates (e.g. UDF(partition_col) = value)
        partitionPredicates = partitionOnly.map(_.asInstanceOf[PartitionPredicate])
        _pushedPredicates = firstPassPushedPredicates ++
          partitionPredicates.map(p => p: Predicate)
        Array.empty
      } else {
        // First call: push partition-only predicates we can evaluate to prune InputPartitions
        val partNames = InMemoryEnhancedPartitionFilterTable.this.partCols
          .flatMap(_.toSeq).toSet
        def referencesOnlyPartitionCols(p: Predicate): Boolean =
          p.references().forall(ref =>
            partNames.contains(ref.fieldNames().mkString(".")))
        val partitionOnlyFirstPass = predicates.filter(referencesOnlyPartitionCols)
        firstPassPushedPredicates = partitionOnlyFirstPass.filter(p =>
          InMemoryTableWithV2Filter.supportsPredicates(Array(p)))
        _pushedPredicates = firstPassPushedPredicates
        predicates.filterNot(firstPassPushedPredicates.contains)
      }
    }

    override def pushedPredicates(): Array[Predicate] = _pushedPredicates

    override def pruneColumns(requiredSchema: StructType): Unit = {
      readSchema = requiredSchema
    }

    override def build(): Scan = {
      val allPartitions = data.map(_.asInstanceOf[InputPartition]).toImmutableArraySeq
      val filteredByFirstPass = if (firstPassPushedPredicates.isEmpty) {
        allPartitions
      } else {
        val partNames =
          InMemoryEnhancedPartitionFilterTable.this.partCols.map(_.toSeq.quoted)
            .toImmutableArraySeq
        val allKeys = allPartitions.map(_.asInstanceOf[BufferedRows].key)
        val matchingKeys = InMemoryTableWithV2Filter.filtersToKeys(
          allKeys, partNames, firstPassPushedPredicates).toSet
        allPartitions.filter(p =>
          matchingKeys.contains(p.asInstanceOf[BufferedRows].key))
      }
      val filtered = if (partitionPredicates.isEmpty) {
        filteredByFirstPass
      } else {
        filteredByFirstPass.filter { p =>
          val partRow = p.asInstanceOf[BufferedRows].partitionKey()
          partitionPredicates.forall(_.accept(partRow))
        }
      }
      InMemoryEnhancedPartitionFilterBatchScan(filtered, readSchema, tableSchema)
    }
  }

  case class InMemoryEnhancedPartitionFilterBatchScan(
      _data: Seq[InputPartition],
      readSchema: StructType,
      tableSchema: StructType)
    extends BatchScanBaseClass(_data, readSchema, tableSchema)
}
```
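The two-pass flow this scan builder expects can be sketched as follows. The driver-side sequence is illustrative, not the actual planner code: `firstPass` and `secondPass` stand in for the predicate sets Spark derives, and it assumes `supportsEnhancedPartitionFiltering()` is declared on `SupportsPushDownV2Filters`, as the override above suggests.

```scala
import org.apache.spark.sql.connector.expressions.filter.{PartitionPredicate, Predicate}
import org.apache.spark.sql.connector.read.{Scan, SupportsPushDownV2Filters}

def planScan(
    builder: SupportsPushDownV2Filters,
    firstPass: Array[Predicate],
    secondPass: Array[PartitionPredicate]): Scan = {
  // Pass 1: ordinary translated predicates. The builder keeps partition-only
  // predicates it can evaluate; what it returns becomes post-scan Filter nodes.
  val postScanFilters = builder.pushPredicates(firstPass)
  // Pass 2: only when the builder opts in. Filters that could not be translated
  // in pass 1 but reference only partition columns (e.g. UDF(partition_col) = value)
  // arrive wrapped as PartitionPredicates.
  if (builder.supportsEnhancedPartitionFiltering() && secondPass.nonEmpty) {
    builder.pushPredicates(secondPass.map(p => p: Predicate))
  }
  builder.build() // prunes InputPartitions using predicates from both passes
}
```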
49 changes: 49 additions & 0 deletions
.../org/apache/spark/sql/connector/catalog/InMemoryTableEnhancedPartitionFilterCatalog.scala
```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.connector.catalog

import java.util

import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException
import org.apache.spark.sql.connector.expressions.Transform

class InMemoryTableEnhancedPartitionFilterCatalog extends InMemoryTableCatalog {
  import CatalogV2Implicits._

  override def createTable(
      ident: Identifier,
      columns: Array[Column],
      partitions: Array[Transform],
      properties: util.Map[String, String]): Table = {
    if (tables.containsKey(ident)) {
      throw new TableAlreadyExistsException(ident.asMultipartIdentifier)
    }

    InMemoryTableCatalog.maybeSimulateFailedTableCreation(properties)

    val tableName = s"$name.${ident.quoted}"
    val table = new InMemoryEnhancedPartitionFilterTable(tableName, columns, partitions, properties)
    tables.put(ident, table)
    namespaces.putIfAbsent(ident.namespace.toList, Map())
    table
  }

  override def createTable(ident: Identifier, tableInfo: TableInfo): Table = {
    createTable(ident, tableInfo.columns(), tableInfo.partitions(), tableInfo.properties())
  }
}
```
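A usage sketch for the catalog (the catalog name, namespace, and session setup are assumed; these classes live in Spark's catalyst test sources, so this is illustrative rather than a packaged API): register the catalog, create a partitioned table, and run a query whose filter can be pushed through the partition-filtering path.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[2]")
  .config("spark.sql.catalog.testcat",
    "org.apache.spark.sql.connector.catalog.InMemoryTableEnhancedPartitionFilterCatalog")
  .getOrCreate()

spark.sql("CREATE TABLE testcat.ns.t (id BIGINT, p INT) PARTITIONED BY (p)")
spark.sql("INSERT INTO testcat.ns.t VALUES (1, 1), (2, 2)")
// `p = 1` references only the partition column, so the scan builder can keep it
// in the first pushPredicates pass and prune InputPartitions before build().
spark.sql("SELECT * FROM testcat.ns.t WHERE p = 1").show()
```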
Review comment: License header