Skip to content

Commit e9e4628

Browse files
committed
feat: support GEO RTree index
Change-Id: Ic057263d3ff4e9dc7d5874e0b92687b551cfb836
1 parent c58c08b commit e9e4628

File tree

23 files changed

+2530
-23
lines changed

23 files changed

+2530
-23
lines changed

Cargo.lock

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ futures = "0.3"
135135
geoarrow-array = "0.6"
136136
geoarrow-schema = "0.6"
137137
geodatafusion = "0.1.1"
138+
geo-traits = "0.3.0"
138139
geo-types = "0.7.16"
139140
http = "1.1.0"
140141
humantime = "2.2.0"

java/lance-jni/Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

java/lance-jni/src/blocking_dataset.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -806,7 +806,8 @@ fn inner_create_index(
806806
| IndexType::Inverted
807807
| IndexType::NGram
808808
| IndexType::ZoneMap
809-
| IndexType::BloomFilter => {
809+
| IndexType::BloomFilter
810+
| IndexType::RTree => {
810811
// For scalar indices, create a scalar IndexParams
811812
let (index_type_str, params_opt) = get_scalar_index_params(env, params_jobj)?;
812813
let scalar_params = lance_index::scalar::ScalarIndexParams {

python/Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

python/python/benchmarks/test_search.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,3 +505,62 @@ def test_late_materialization(test_dataset, benchmark, use_index):
505505
filter=f"{column} = 0",
506506
batch_size=32,
507507
)
508+
509+
510+
@pytest.fixture(scope="module")
511+
def test_geo_dataset(tmpdir_factory):
512+
from geoarrow.rust.core import (
513+
point,
514+
points,
515+
)
516+
517+
num_rows = 1_000_000
518+
points_2d = points([np.random.randn(num_rows), np.random.randn(num_rows)])
519+
520+
schema = pa.schema(
521+
[
522+
pa.field(point("xy")).with_name("points"),
523+
]
524+
)
525+
table = pa.Table.from_arrays([points_2d], schema=schema)
526+
uri = str(tmpdir_factory.mktemp("test_geo_dataset"))
527+
lance.write_dataset(table, uri)
528+
ds = lance.dataset(uri)
529+
return ds
530+
531+
532+
@pytest.mark.benchmark(group="geo")
533+
@pytest.mark.parametrize(
534+
"use_index",
535+
(False, True),
536+
ids=["no_index", "with_index"],
537+
)
538+
def test_geo_rtree(test_geo_dataset, benchmark, use_index):
539+
if use_index:
540+
test_geo_dataset.create_scalar_index(
541+
column="points",
542+
index_type="RTREE",
543+
replace=True,
544+
)
545+
546+
print(
547+
test_geo_dataset.scanner(
548+
columns=["points"],
549+
filter="""
550+
St_Contains(points,
551+
ST_GeomFromText('POLYGON (( 0 0, 2 0, 0 2, 2 2, 0 0 ))'))
552+
""",
553+
batch_size=32,
554+
use_scalar_index=use_index,
555+
).explain_plan(True)
556+
)
557+
benchmark(
558+
test_geo_dataset.to_table,
559+
columns=["points"],
560+
filter="""
561+
St_Contains(points,
562+
ST_GeomFromText('POLYGON (( 0 0, 2 0, 0 2, 2 2, 0 0 ))'))
563+
""",
564+
batch_size=32,
565+
use_scalar_index=use_index,
566+
)

python/python/lance/dataset.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2343,6 +2343,7 @@ def create_scalar_index(
23432343
Literal["NGRAM"],
23442344
Literal["ZONEMAP"],
23452345
Literal["BLOOMFILTER"],
2346+
Literal["RTREE"],
23462347
IndexConfig,
23472348
],
23482349
name: Optional[str] = None,
@@ -2428,8 +2429,8 @@ def create_scalar_index(
24282429
or string column.
24292430
index_type : str
24302431
The type of the index. One of ``"BTREE"``, ``"BITMAP"``,
2431-
``"LABEL_LIST"``, ``"NGRAM"``, ``"ZONEMAP"``, ``"INVERTED"``, or
2432-
``"BLOOMFILTER"``.
2432+
``"LABEL_LIST"``, ``"NGRAM"``, ``"ZONEMAP"``, ``"INVERTED"``,
2433+
``"BLOOMFILTER"``, or ``"RTREE"``.
24332434
name : str, optional
24342435
The index name. If not provided, it will be generated from the
24352436
column name.
@@ -2550,11 +2551,12 @@ def create_scalar_index(
25502551
"LABEL_LIST",
25512552
"INVERTED",
25522553
"BLOOMFILTER",
2554+
"RTREE",
25532555
]:
25542556
raise NotImplementedError(
25552557
(
25562558
'Only "BTREE", "BITMAP", "NGRAM", "ZONEMAP", "LABEL_LIST", '
2557-
'"INVERTED", or "BLOOMFILTER" are supported for '
2559+
'"INVERTED", "BLOOMFILTER" or "RTREE" are supported for '
25582560
f"scalar columns. Received {index_type}",
25592561
)
25602562
)

python/python/tests/test_geo.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,52 @@ def test_geo_sql(tmp_path: Path):
104104
assert np.allclose(
105105
np.array(result["dist"]), np.array([2.5495097567963922]), atol=1e-8
106106
)
107+
108+
109+
def test_rtree_index(tmp_path: Path):
110+
# LineStrings
111+
num_lines = 10000
112+
line_offsets = np.arange(num_lines + 1, dtype=np.int32) * 2
113+
linestrings_2d = linestrings(
114+
[np.random.randn(num_lines * 2) * 100, np.random.randn(num_lines * 2) * 100],
115+
line_offsets,
116+
)
117+
assert len(linestrings_2d) == num_lines
118+
119+
schema = pa.schema(
120+
[
121+
pa.field("id", pa.int64()),
122+
pa.field(linestring("xy")).with_name("linestring"),
123+
]
124+
)
125+
table = pa.Table.from_arrays(
126+
[np.arange(num_lines, dtype=np.int64), linestrings_2d], schema=schema
127+
)
128+
ds = lance.write_dataset(table, str(tmp_path / "test_rtree_index.lance"))
129+
130+
def query(ds: lance.LanceDataset, has_index=False):
131+
sql = """
132+
SELECT `id`, linestring
133+
FROM dataset
134+
WHERE
135+
St_Intersects(linestring, ST_GeomFromText('LINESTRING ( 2 0, 0 2 )'))
136+
"""
137+
138+
batches = ds.sql("EXPLAIN ANALYZE " + sql).build().to_batch_records()
139+
explain = pa.Table.from_batches(batches).to_pandas().to_string()
140+
141+
if has_index:
142+
assert "ScalarIndexQuery" in explain
143+
else:
144+
assert "ScalarIndexQuery" not in explain
145+
146+
batches = ds.sql(sql).build().to_batch_records()
147+
return pa.Table.from_batches(batches)
148+
149+
table_without_index = query(ds)
150+
151+
ds.create_scalar_index("linestring", "RTREE")
152+
153+
table_with_index = query(ds, has_index=True)
154+
155+
assert table_with_index == table_without_index

python/src/dataset.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1820,6 +1820,7 @@ impl Dataset {
18201820
"ZONEMAP" => IndexType::ZoneMap,
18211821
"BLOOMFILTER" => IndexType::BloomFilter,
18221822
"LABEL_LIST" => IndexType::LabelList,
1823+
"RTREE" => IndexType::RTree,
18231824
"INVERTED" => IndexType::Inverted,
18241825
"IVF_FLAT" | "IVF_PQ" | "IVF_SQ" | "IVF_RQ" | "IVF_HNSW_FLAT" | "IVF_HNSW_PQ"
18251826
| "IVF_HNSW_SQ" => IndexType::Vector,
@@ -1856,6 +1857,10 @@ impl Dataset {
18561857
index_type: "bloomfilter".to_string(),
18571858
params: None,
18581859
}),
1860+
"RTREE" => Box::new(ScalarIndexParams {
1861+
index_type: "rtree".to_string(),
1862+
params: None,
1863+
}),
18591864
"SCALAR" => {
18601865
let Some(kwargs) = kwargs else {
18611866
return Err(PyValueError::new_err(

rust/lance-geo/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@ datafusion.workspace = true
1616
geoarrow-array.workspace = true
1717
geoarrow-schema.workspace = true
1818
geodatafusion.workspace = true
19+
geo-traits.workspace = true
1920
geo-types.workspace = true
21+
lance-core.workspace = true
22+
serde.workspace = true
2023

2124
[lints]
2225
workspace = true

0 commit comments

Comments
 (0)