Skip to content

Commit b67322d

Browse files
committed
feat: support GEO RTree index
Change-Id: Ic057263d3ff4e9dc7d5874e0b92687b551cfb836
1 parent b478d3b commit b67322d

File tree

25 files changed

+2535
-102
lines changed

25 files changed

+2535
-102
lines changed

Cargo.lock

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ futures = "0.3"
135135
geoarrow-array = "0.6"
136136
geoarrow-schema = "0.6"
137137
geodatafusion = "0.1.1"
138+
geo-traits = "0.3.0"
138139
geo-types = "0.7.16"
139140
http = "1.1.0"
140141
humantime = "2.2.0"

java/lance-jni/Cargo.lock

Lines changed: 9 additions & 21 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

java/lance-jni/src/blocking_dataset.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -790,7 +790,8 @@ fn inner_create_index(
790790
| IndexType::Inverted
791791
| IndexType::NGram
792792
| IndexType::ZoneMap
793-
| IndexType::BloomFilter => {
793+
| IndexType::BloomFilter
794+
| IndexType::RTree => {
794795
// For scalar indices, create a scalar IndexParams
795796
let (index_type_str, params_opt) = get_scalar_index_params(env, params_jobj)?;
796797
let scalar_params = lance_index::scalar::ScalarIndexParams {

protos/index.proto

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,4 +188,6 @@ message JsonIndexDetails {
188188
string path = 1;
189189
google.protobuf.Any target_details = 2;
190190
}
191-
message BloomFilterIndexDetails {}
191+
message BloomFilterIndexDetails {}
192+
193+
// Index details message for the RTree index type; it currently declares no fields.
message RTreeIndexDetails {}

python/Cargo.lock

Lines changed: 9 additions & 21 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

python/python/benchmarks/test_search.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,3 +505,64 @@ def test_late_materialization(test_dataset, benchmark, use_index):
505505
filter=f"{column} = 0",
506506
batch_size=32,
507507
)
508+
509+
510+
@pytest.fixture(scope="module")
def test_geo_dataset(tmpdir_factory):
    """Module-scoped Lance dataset with a single ``points`` geometry column.

    Writes one million random 2D points (standard-normal samples scaled by
    100 on each axis) to a fresh temporary directory and returns the dataset
    opened from that location.
    """
    from geoarrow.rust.core import point, points

    num_rows = 1_000_000
    # Draw x before y so the sampling order matches the single-expression form.
    xs = np.random.randn(num_rows) * 100
    ys = np.random.randn(num_rows) * 100
    geometry = points([xs, ys])

    # The geoarrow point type supplies the field; only its name is overridden.
    points_field = pa.field(point("xy")).with_name("points")
    table = pa.Table.from_arrays([geometry], schema=pa.schema([points_field]))

    uri = str(tmpdir_factory.mktemp("test_geo_dataset"))
    lance.write_dataset(table, uri)
    return lance.dataset(uri)
532+
533+
534+
@pytest.mark.benchmark(group="geo")
@pytest.mark.parametrize(
    "use_index",
    (False, True),
    ids=["no_index", "with_index"],
)
def test_geo_rtree(test_geo_dataset, benchmark, use_index):
    """Benchmark a spatial filter on ``points`` with and without an RTREE index.

    When ``use_index`` is true an RTREE scalar index is (re)created on the
    ``points`` column first. The query plan is printed once, then ``to_table``
    is benchmarked with the same filter and scan options.
    """
    if use_index:
        test_geo_dataset.create_scalar_index(
            column="points",
            index_type="RTREE",
            replace=True,
        )

    # The ring must be simple (non-self-intersecting) to be a valid polygon.
    # The previous vertex order (0 0, 2 0, 0 2, 2 2, 0 0) traced a bowtie whose
    # edges cross at (1, 1); this square has the same bounding box.
    # NOTE(review): St_Contains(A, B) conventionally tests "A contains B", so
    # with the point column as the first argument this predicate may match no
    # rows — confirm whether St_Within / swapped arguments was intended.
    geo_filter = """
        St_Contains(points,
        ST_GeomFromText('POLYGON (( 0 0, 2 0, 2 2, 0 2, 0 0 ))'))
    """
    # Shared by the explain call and the benchmarked call so they cannot drift.
    query_kwargs = dict(
        columns=["points"],
        filter=geo_filter,
        batch_size=32,
        use_scalar_index=use_index,
    )

    print(test_geo_dataset.scanner(**query_kwargs).explain_plan(True))
    benchmark(test_geo_dataset.to_table, **query_kwargs)

python/python/lance/dataset.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2341,6 +2341,7 @@ def create_scalar_index(
23412341
Literal["NGRAM"],
23422342
Literal["ZONEMAP"],
23432343
Literal["BLOOMFILTER"],
2344+
Literal["RTREE"],
23442345
IndexConfig,
23452346
],
23462347
name: Optional[str] = None,
@@ -2426,8 +2427,8 @@ def create_scalar_index(
24262427
or string column.
24272428
index_type : str
24282429
The type of the index. One of ``"BTREE"``, ``"BITMAP"``,
2429-
``"LABEL_LIST"``, ``"NGRAM"``, ``"ZONEMAP"``, ``"INVERTED"``, or
2430-
``"BLOOMFILTER"``.
2430+
``"LABEL_LIST"``, ``"NGRAM"``, ``"ZONEMAP"``, ``"INVERTED"``,
2431+
``"BLOOMFILTER"``, or ``"RTREE"``.
24312432
name : str, optional
24322433
The index name. If not provided, it will be generated from the
24332434
column name.
@@ -2544,11 +2545,12 @@ def create_scalar_index(
25442545
"LABEL_LIST",
25452546
"INVERTED",
25462547
"BLOOMFILTER",
2548+
"RTREE",
25472549
]:
25482550
raise NotImplementedError(
25492551
(
25502552
'Only "BTREE", "BITMAP", "NGRAM", "ZONEMAP", "LABEL_LIST", '
2551-
'"INVERTED", or "BLOOMFILTER" are supported for '
2553+
'"INVERTED", "BLOOMFILTER" or "RTREE" are supported for '
25522554
f"scalar columns. Received {index_type}",
25532555
)
25542556
)

python/python/tests/test_geo.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,52 @@ def test_geo_sql(tmp_path: Path):
104104
assert np.allclose(
105105
np.array(result["dist"]), np.array([2.5495097567963922]), atol=1e-8
106106
)
107+
108+
109+
def test_rtree_index(tmp_path: Path):
    """An RTREE index on a linestring column must not change query results.

    Runs the same ``St_Intersects`` SQL query before and after creating the
    index, checks via ``EXPLAIN ANALYZE`` that ``ScalarIndexQuery`` appears in
    the plan only once the index exists, and asserts both runs return the
    same rows.
    """
    # LineStrings: every line has exactly two vertices, hence offsets 0, 2, 4...
    num_lines = 10000
    # Seeded generator so a failing run can be reproduced with identical data.
    rng = np.random.default_rng(42)
    line_offsets = np.arange(num_lines + 1, dtype=np.int32) * 2
    linestrings_2d = linestrings(
        [
            rng.standard_normal(num_lines * 2) * 100,
            rng.standard_normal(num_lines * 2) * 100,
        ],
        line_offsets,
    )
    assert len(linestrings_2d) == num_lines

    schema = pa.schema(
        [
            pa.field("id", pa.int64()),
            pa.field(linestring("xy")).with_name("linestring"),
        ]
    )
    table = pa.Table.from_arrays(
        [np.arange(num_lines, dtype=np.int64), linestrings_2d], schema=schema
    )
    ds = lance.write_dataset(table, str(tmp_path / "test_rtree_index.lance"))

    def query(ds: lance.LanceDataset, has_index=False):
        """Run the intersection query, checking the plan against ``has_index``."""
        sql = """
        SELECT `id`, linestring
        FROM dataset
        WHERE
        St_Intersects(linestring, ST_GeomFromText('LINESTRING ( 2 0, 0 2 )'))
        """

        batches = ds.sql("EXPLAIN ANALYZE " + sql).build().to_batch_records()
        explain = pa.Table.from_batches(batches).to_pandas().to_string()

        if has_index:
            assert "ScalarIndexQuery" in explain
        else:
            assert "ScalarIndexQuery" not in explain

        batches = ds.sql(sql).build().to_batch_records()
        return pa.Table.from_batches(batches)

    table_without_index = query(ds)

    ds.create_scalar_index("linestring", "RTREE")

    table_with_index = query(ds, has_index=True)

    # The SQL has no ORDER BY, so row order is not part of the contract;
    # sort on the unique id before comparing to keep the check order-agnostic.
    assert table_with_index.sort_by("id") == table_without_index.sort_by("id")

python/src/dataset.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1776,6 +1776,7 @@ impl Dataset {
17761776
"ZONEMAP" => IndexType::ZoneMap,
17771777
"BLOOMFILTER" => IndexType::BloomFilter,
17781778
"LABEL_LIST" => IndexType::LabelList,
1779+
"RTREE" => IndexType::RTree,
17791780
"INVERTED" | "FTS" => IndexType::Inverted,
17801781
"IVF_FLAT" | "IVF_PQ" | "IVF_SQ" | "IVF_RQ" | "IVF_HNSW_FLAT" | "IVF_HNSW_PQ"
17811782
| "IVF_HNSW_SQ" => IndexType::Vector,
@@ -1812,6 +1813,10 @@ impl Dataset {
18121813
index_type: "bloomfilter".to_string(),
18131814
params: None,
18141815
}),
1816+
"RTREE" => Box::new(ScalarIndexParams {
1817+
index_type: "rtree".to_string(),
1818+
params: None,
1819+
}),
18151820
"SCALAR" => {
18161821
let Some(kwargs) = kwargs else {
18171822
return Err(PyValueError::new_err(

0 commit comments

Comments
 (0)