lance-format
diff --git a/Cargo.lock b/Cargo.lock
Lines changed: 8 additions & 0 deletions
diff --git a/Cargo.toml b/Cargo.toml
Lines changed: 1 addition & 0 deletions
diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock
Lines changed: 7 additions & 0 deletions
diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs
Lines changed: 2 additions & 1 deletion
diff --git a/python/Cargo.lock b/python/Cargo.lock
Lines changed: 7 additions & 0 deletions
diff --git a/python/python/benchmarks/test_search.py b/python/python/benchmarks/test_search.py
Lines changed: 59 additions & 0 deletions
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
Lines changed: 5 additions & 3 deletions
diff --git a/python/python/tests/test_geo.py b/python/python/tests/test_geo.py
Lines changed: 49 additions & 0 deletions
diff --git a/python/src/dataset.rs b/python/src/dataset.rs
Lines changed: 5 additions & 0 deletions
diff --git a/rust/lance-geo/Cargo.toml b/rust/lance-geo/Cargo.toml
Lines changed: 3 additions & 0 deletions
@@ -135,6 +135,7 @@ futures = "0.3"
 geoarrow-array = "0.6"
 geoarrow-schema = "0.6"
 geodatafusion = "0.1.1"
+geo-traits = "0.3.0"
 geo-types = "0.7.16"
 http = "1.1.0"
 humantime = "2.2.0"
 
@@ -806,7 +806,8 @@ fn inner_create_index(
         | IndexType::Inverted
         | IndexType::NGram
         | IndexType::ZoneMap
-        | IndexType::BloomFilter => {
+        | IndexType::BloomFilter
+        | IndexType::RTree => {
             // For scalar indices, create a scalar IndexParams
             let (index_type_str, params_opt) = get_scalar_index_params(env, params_jobj)?;
             let scalar_params = lance_index::scalar::ScalarIndexParams {
 
@@ -505,3 +505,62 @@ def test_late_materialization(test_dataset, benchmark, use_index):
         filter=f"{column} = 0",
         batch_size=32,
     )
+
+
+@pytest.fixture(scope="module")
+def test_geo_dataset(tmpdir_factory):
+    """Module-scoped Lance dataset holding one million random 2D points.
+
+    Coordinates come from a fixed-seed generator so that every benchmark
+    run scans identical data.
+    """
+    from geoarrow.rust.core import point, points
+
+    num_rows = 1_000_000
+    rng = np.random.default_rng(42)
+    coords = [rng.standard_normal(num_rows), rng.standard_normal(num_rows)]
+
+    # The single column is named "points" and typed via geoarrow's point("xy").
+    schema = pa.schema([pa.field(point("xy")).with_name("points")])
+    table = pa.Table.from_arrays([points(coords)], schema=schema)
+
+    uri = str(tmpdir_factory.mktemp("test_geo_dataset"))
+    lance.write_dataset(table, uri)
+    return lance.dataset(uri)
+
+
+@pytest.mark.benchmark(group="geo")
+@pytest.mark.parametrize(
+    "use_index",
+    (False, True),
+    ids=["no_index", "with_index"],
+)
+def test_geo_rtree(test_geo_dataset, benchmark, use_index):
+    """Benchmark a filtered scan of the points dataset.
+
+    Builds an RTREE scalar index on "points" first when ``use_index`` is true.
+    """
+    if use_index:
+        test_geo_dataset.create_scalar_index(
+            column="points",
+            index_type="RTREE",
+            replace=True,
+        )
+
+    # The exterior ring visits the square's corners in order; listing them as
+    # "0 0, 2 0, 0 2, 2 2" would describe a self-intersecting (invalid) bowtie.
+    # NOTE(review): ST_Contains(a, b) conventionally tests "a contains b" --
+    # confirm that points-contain-polygon is the intended predicate.
+    query = dict(
+        columns=["points"],
+        filter="""
+            St_Contains(points,
+            ST_GeomFromText('POLYGON (( 0 0, 2 0, 2 2, 0 2, 0 0 ))'))
+        """,
+        batch_size=32,
+        use_scalar_index=use_index,
+    )
+    # Print the query plan for this configuration before timing the scan.
+    print(test_geo_dataset.scanner(**query).explain_plan(True))
+
+    benchmark(test_geo_dataset.to_table, **query)
@@ -2343,6 +2343,7 @@ def create_scalar_index(
             Literal["NGRAM"],
             Literal["ZONEMAP"],
             Literal["BLOOMFILTER"],
+            Literal["RTREE"],
             IndexConfig,
         ],
         name: Optional[str] = None,
@@ -2428,8 +2429,8 @@ def create_scalar_index(
             or string column.
         index_type : str
             The type of the index.  One of ``"BTREE"``, ``"BITMAP"``,
-            ``"LABEL_LIST"``, ``"NGRAM"``, ``"ZONEMAP"``, ``"INVERTED"``, or
-            ``"BLOOMFILTER"``.
+            ``"LABEL_LIST"``, ``"NGRAM"``, ``"ZONEMAP"``, ``"INVERTED"``,
+            ``"BLOOMFILTER"``, or ``"RTREE"``.
         name : str, optional
             The index name. If not provided, it will be generated from the
             column name.
@@ -2550,11 +2551,12 @@ def create_scalar_index(
                 "LABEL_LIST",
                 "INVERTED",
                 "BLOOMFILTER",
+                "RTREE",
             ]:
                 raise NotImplementedError(
                     (
                         'Only "BTREE", "BITMAP", "NGRAM", "ZONEMAP", "LABEL_LIST", '
-                        '"INVERTED", or "BLOOMFILTER" are supported for '
+                        '"INVERTED", "BLOOMFILTER" or "RTREE" are supported for '
                         f"scalar columns.  Received {index_type}",
                     )
                 )
 
@@ -104,3 +104,52 @@ def test_geo_sql(tmp_path: Path):
     assert np.allclose(
         np.array(result["dist"]), np.array([2.5495097567963922]), atol=1e-8
     )
+
+
+def test_rtree_index(tmp_path: Path):
+    """An RTREE index must not change the rows returned by St_Intersects."""
+    # 10,000 two-point linestrings; seeded so that a failure can be reproduced.
+    num_lines = 10000
+    rng = np.random.default_rng(0)
+    line_offsets = np.arange(num_lines + 1, dtype=np.int32) * 2
+    linestrings_2d = linestrings(
+        [
+            rng.standard_normal(num_lines * 2) * 100,
+            rng.standard_normal(num_lines * 2) * 100,
+        ],
+        line_offsets,
+    )
+    assert len(linestrings_2d) == num_lines
+
+    schema = pa.schema(
+        [
+            pa.field("id", pa.int64()),
+            pa.field(linestring("xy")).with_name("linestring"),
+        ]
+    )
+    ids = np.arange(num_lines, dtype=np.int64)
+    table = pa.Table.from_arrays([ids, linestrings_2d], schema=schema)
+    ds = lance.write_dataset(table, str(tmp_path / "test_rtree_index.lance"))
+
+    def query(ds: lance.LanceDataset, has_index=False):
+        """Run the query after checking its plan for ScalarIndexQuery."""
+        sql = """
+              SELECT `id`, linestring
+              FROM dataset
+              WHERE
+              St_Intersects(linestring, ST_GeomFromText('LINESTRING ( 2 0, 0 2 )'))
+              """
+
+        batches = ds.sql("EXPLAIN ANALYZE " + sql).build().to_batch_records()
+        explain = pa.Table.from_batches(batches).to_pandas().to_string()
+        # The plan must mention ScalarIndexQuery exactly when has_index is set.
+        assert ("ScalarIndexQuery" in explain) == has_index
+
+        batches = ds.sql(sql).build().to_batch_records()
+        return pa.Table.from_batches(batches)
+
+    table_without_index = query(ds)
+    ds.create_scalar_index("linestring", "RTREE")
+    table_with_index = query(ds, has_index=True)
+
+    assert table_with_index == table_without_index
@@ -1820,6 +1820,7 @@ impl Dataset {
             "ZONEMAP" => IndexType::ZoneMap,
             "BLOOMFILTER" => IndexType::BloomFilter,
             "LABEL_LIST" => IndexType::LabelList,
+            "RTREE" => IndexType::RTree,
             "INVERTED" => IndexType::Inverted,
             "IVF_FLAT" | "IVF_PQ" | "IVF_SQ" | "IVF_RQ" | "IVF_HNSW_FLAT" | "IVF_HNSW_PQ"
             | "IVF_HNSW_SQ" => IndexType::Vector,
@@ -1856,6 +1857,10 @@ impl Dataset {
                 index_type: "bloomfilter".to_string(),
                 params: None,
             }),
+            "RTREE" => Box::new(ScalarIndexParams {
+                index_type: "rtree".to_string(),
+                params: None,
+            }),
             "SCALAR" => {
                 let Some(kwargs) = kwargs else {
                     return Err(PyValueError::new_err(
 
@@ -16,7 +16,10 @@ datafusion.workspace = true
 geoarrow-array.workspace = true
 geoarrow-schema.workspace = true
 geodatafusion.workspace = true
+geo-traits.workspace = true
 geo-types.workspace = true
+lance-core.workspace = true
+serde.workspace = true
 
 [lints]
 workspace = true