Unify the execute API with an execution strategy

ChunxuTang · ChunxuTang · commit 7dbfee5a15bb · 2025-12-04T15:45:30.000-08:00
diff --git a/python/src/graph.rs b/python/src/graph.rs
@@ -22,7 +22,8 @@ use arrow::ffi_stream::ArrowArrayStreamReader;
 use arrow_array::{RecordBatch, RecordBatchReader};
 use arrow_schema::Schema;
 use lance_graph::{
-    CypherQuery as RustCypherQuery, GraphConfig as RustGraphConfig, GraphError as RustGraphError,
+    ExecutionStrategy as RustExecutionStrategy, CypherQuery as RustCypherQuery,
+    GraphConfig as RustGraphConfig, GraphError as RustGraphError,
 };
 use pyo3::{
     exceptions::{PyNotImplementedError, PyRuntimeError, PyValueError},
@@ -34,6 +35,28 @@ use serde_json::Value as JsonValue;
 
 use crate::RT;
 
+/// Execution strategy for Cypher queries
+#[pyclass(name = "ExecutionStrategy", module = "lance.graph")]
+#[derive(Clone, Copy)]
+pub enum ExecutionStrategy {
+    /// Use DataFusion query planner (default, full feature support)
+    DataFusion,
+    /// Use simple single-table executor (legacy, limited features)
+    Simple,
+    /// Use Lance native executor (not yet implemented)
+    LanceNative,
+}
+
+impl From<ExecutionStrategy> for RustExecutionStrategy {
+    fn from(strategy: ExecutionStrategy) -> Self {
+        match strategy {
+            ExecutionStrategy::DataFusion => RustExecutionStrategy::DataFusion,
+            ExecutionStrategy::Simple => RustExecutionStrategy::Simple,
+            ExecutionStrategy::LanceNative => RustExecutionStrategy::LanceNative,
+        }
+    }
+}
+
 /// Convert GraphError to PyErr
 fn graph_error_to_pyerr(err: RustGraphError) -> PyErr {
     match &err {
@@ -267,6 +290,8 @@ impl CypherQuery {
     /// ----------
     /// datasets : dict
     ///     Dictionary mapping table names to Lance datasets
+    /// strategy : ExecutionStrategy, optional
+    ///     Execution strategy to use (defaults to DataFusion)
     ///
     /// Returns
     /// -------
@@ -277,16 +302,34 @@ impl CypherQuery {
     /// ------
     /// RuntimeError
     ///     If query execution fails
-    fn execute(&self, py: Python, datasets: &Bound<'_, PyDict>) -> PyResult<PyObject> {
-        // Convert datasets to Arrow batches while holding the GIL - same as before
+    ///
+    /// Examples
+    /// --------
+    /// >>> # Default strategy (DataFusion)
+    /// >>> result = query.execute(datasets)
+    ///
+    /// >>> # Explicit strategy
+    /// >>> from lance.graph import ExecutionStrategy
+    /// >>> result = query.execute(datasets, strategy=ExecutionStrategy.Simple)
+    #[pyo3(signature = (datasets, strategy=None))]
+    fn execute(
+        &self,
+        py: Python,
+        datasets: &Bound<'_, PyDict>,
+        strategy: Option<ExecutionStrategy>,
+    ) -> PyResult<PyObject> {
+        // Convert datasets to Arrow batches while holding the GIL
         let arrow_datasets = python_datasets_to_batches(datasets)?;
 
+        // Convert Python strategy to Rust strategy
+        let rust_strategy = strategy.map(|s| s.into());
+
         // Clone the inner query for use in the async block
         let inner_query = self.inner.clone();
 
         // Use RT.block_on with Some(py) like the scanner to_pyarrow method
         let result_batch = RT
-            .block_on(Some(py), inner_query.execute(arrow_datasets))?
+            .block_on(Some(py), inner_query.execute(arrow_datasets, rust_strategy))?
             .map_err(graph_error_to_pyerr)?;
 
         record_batch_to_python_table(py, &result_batch)
@@ -562,6 +605,7 @@ fn record_batch_to_python_table(
 pub fn register_graph_module(py: Python, parent_module: &Bound<'_, PyModule>) -> PyResult<()> {
     let graph_module = PyModule::new(py, "graph")?;
 
+    graph_module.add_class::<ExecutionStrategy>()?;
     graph_module.add_class::<GraphConfig>()?;
     graph_module.add_class::<GraphConfigBuilder>()?;
     graph_module.add_class::<CypherQuery>()?;
diff --git a/rust/lance-graph/benches/graph_execution.rs b/rust/lance-graph/benches/graph_execution.rs
@@ -22,7 +22,7 @@ use arrow_schema::{DataType, Field, Schema as ArrowSchema};
 use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
 use futures::TryStreamExt;
 use lance::dataset::{Dataset, WriteMode, WriteParams};
-use lance_graph::{CypherQuery, GraphConfig};
+use lance_graph::{CypherQuery, ExecutionStrategy, GraphConfig};
 use tempfile::TempDir;
 
 fn create_people_batch() -> RecordBatch {
@@ -71,7 +71,11 @@ fn execute_cypher_query(
     q: &CypherQuery,
     datasets: HashMap<String, RecordBatch>,
 ) -> RecordBatch {
-    rt.block_on(async move { q.execute(datasets).await.unwrap() })
+    rt.block_on(async move {
+        q.execute(datasets, Some(ExecutionStrategy::Simple))
+            .await
+            .unwrap()
+    })
 }
 
 fn make_people_batch(n: usize) -> RecordBatch {
diff --git a/rust/lance-graph/src/lib.rs b/rust/lance-graph/src/lib.rs
@@ -53,4 +53,4 @@ pub const MAX_VARIABLE_LENGTH_HOPS: u32 = 20;
 
 pub use config::{GraphConfig, NodeMapping, RelationshipMapping};
 pub use error::{GraphError, Result};
-pub use query::CypherQuery;
+pub use query::{CypherQuery, ExecutionStrategy};
diff --git a/rust/lance-graph/src/query.rs b/rust/lance-graph/src/query.rs
@@ -17,6 +17,23 @@ mod clauses;
 mod expr;
 mod simple_executor;
 
+/// Execution strategy for Cypher queries
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ExecutionStrategy {
+    /// Use DataFusion query planner (default, full feature support)
+    DataFusion,
+    /// Use simple single-table executor (legacy, limited features)
+    Simple,
+    /// Use Lance native executor (not yet implemented)
+    LanceNative,
+}
+
+impl Default for ExecutionStrategy {
+    fn default() -> Self {
+        Self::DataFusion
+    }
+}
+
 /// A Cypher query that can be executed against Lance datasets
 #[derive(Debug, Clone)]
 pub struct CypherQuery {
@@ -92,6 +109,68 @@ impl CypherQuery {
         })
     }
 
+    /// Execute the query against provided in-memory datasets
+    ///
+    /// This method uses the DataFusion planner by default for comprehensive query support
+    /// including joins, aggregations, and complex patterns. You can optionally specify
+    /// a different execution strategy.
+    ///
+    /// # Arguments
+    /// * `datasets` - HashMap of table name to RecordBatch (nodes and relationships)
+    /// * `strategy` - Optional execution strategy (defaults to DataFusion)
+    ///
+    /// # Returns
+    /// A single RecordBatch containing the query results
+    ///
+    /// # Errors
+    /// Fails on parse/plan/execution errors, or with `UnsupportedFeature` for `LanceNative`
+    ///
+    /// # Example
+    /// ```ignore
+    /// use std::collections::HashMap;
+    /// use arrow::record_batch::RecordBatch;
+    /// use lance_graph::query::{CypherQuery, ExecutionStrategy};
+    ///
+    /// // Create in-memory datasets
+    /// let mut datasets = HashMap::new();
+    /// datasets.insert("Person".to_string(), person_batch);
+    /// datasets.insert("KNOWS".to_string(), knows_batch);
+    ///
+    /// // Parse and execute query
+    /// let query = CypherQuery::parse("MATCH (p:Person)-[:KNOWS]->(f) RETURN p.name, f.name")?
+    ///     .with_config(config);
+    /// // Use the default DataFusion strategy
+    /// let result = query.execute(datasets.clone(), None).await?;
+    /// // Use the Simple strategy explicitly
+    /// let result = query.execute(datasets, Some(ExecutionStrategy::Simple)).await?;
+    /// ```
+    pub async fn execute(
+        &self,
+        datasets: HashMap<String, arrow::record_batch::RecordBatch>,
+        strategy: Option<ExecutionStrategy>,
+    ) -> Result<arrow::record_batch::RecordBatch> {
+        let strategy = strategy.unwrap_or_default();
+        match strategy {
+            ExecutionStrategy::DataFusion => self.execute_datafusion(datasets).await,
+            ExecutionStrategy::Simple => self.execute_simple(datasets).await,
+            ExecutionStrategy::LanceNative => Err(GraphError::UnsupportedFeature {
+                feature: "Lance native execution strategy is not yet implemented".to_string(),
+                location: snafu::Location::new(file!(), line!(), column!()),
+            }),
+        }
+    }
+
+    /// Explain the query execution plan using the DataFusion planner
+    ///
+    /// This method provides a high-level overview of the query execution plan
+    /// using the DataFusion planner, which is useful for debugging and optimization.
+    pub async fn explain(
+        &self,
+        datasets: HashMap<String, arrow::record_batch::RecordBatch>,
+    ) -> Result<String> {
+        self.explain_datafusion(datasets).await
+    }
+
     /// Execute using the DataFusion planner with in-memory datasets
     ///
     /// # Overview
@@ -601,30 +680,6 @@ impl CypherQuery {
         Ok(output)
     }
 
-    /// Execute the query against provided in-memory datasets using the DataFusion planner
-    ///
-    /// This is the primary execution method that uses the full DataFusion-based planner
-    /// for comprehensive query support including joins, aggregations, and complex patterns.
-    ///
-    /// For legacy single-table queries, use `execute_simple()` instead.
-    pub async fn execute(
-        &self,
-        datasets: HashMap<String, arrow::record_batch::RecordBatch>,
-    ) -> Result<arrow::record_batch::RecordBatch> {
-        self.execute_datafusion(datasets).await
-    }
-
-    /// Explain the query execution plan using the DataFusion planner
-    ///
-    /// This method provides a high-level overview of the query execution plan
-    /// using the DataFusion planner, which is useful for debugging and optimization.
-    pub async fn explain(
-        &self,
-        datasets: HashMap<String, arrow::record_batch::RecordBatch>,
-    ) -> Result<String> {
-        self.explain_datafusion(datasets).await
-    }
-
     /// Execute simple single-table queries (legacy implementation)
     ///
     /// This method supports basic projection/filter/limit workflows on a single table.