Skip to content

Commit 65c6e78

Browse files
committed
feat: support GEO RTree index
Change-Id: Ic057263d3ff4e9dc7d5874e0b92687b551cfb836
1 parent 543eb86 commit 65c6e78

File tree

23 files changed

+2523
-8
lines changed

23 files changed

+2523
-8
lines changed

Cargo.lock

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ futures = "0.3"
136136
geoarrow-array = "0.6"
137137
geoarrow-schema = "0.6"
138138
geodatafusion = "0.1.1"
139+
geo-traits = "0.3.0"
139140
geo-types = "0.7.16"
140141
http = "1.1.0"
141142
humantime = "2.2.0"

java/lance-jni/Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

java/lance-jni/src/blocking_dataset.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -806,7 +806,8 @@ fn inner_create_index(
806806
| IndexType::Inverted
807807
| IndexType::NGram
808808
| IndexType::ZoneMap
809-
| IndexType::BloomFilter => {
809+
| IndexType::BloomFilter
810+
| IndexType::RTree => {
810811
// For scalar indices, create a scalar IndexParams
811812
let (index_type_str, params_opt) = get_scalar_index_params(env, params_jobj)?;
812813
let scalar_params = lance_index::scalar::ScalarIndexParams {

python/Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

python/python/benchmarks/test_search.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,3 +505,64 @@ def test_late_materialization(test_dataset, benchmark, use_index):
505505
filter=f"{column} = 0",
506506
batch_size=32,
507507
)
508+
509+
510+
@pytest.fixture(scope="module")
511+
def test_geo_dataset(tmpdir_factory):
512+
from geoarrow.rust.core import (
513+
point,
514+
points,
515+
)
516+
517+
num_rows = 1_000_000
518+
points_2d = points(
519+
[np.random.randn(num_rows) * 100, np.random.randn(num_rows) * 100]
520+
)
521+
522+
schema = pa.schema(
523+
[
524+
pa.field(point("xy")).with_name("points"),
525+
]
526+
)
527+
table = pa.Table.from_arrays([points_2d], schema=schema)
528+
uri = str(tmpdir_factory.mktemp("test_geo_dataset"))
529+
lance.write_dataset(table, uri)
530+
ds = lance.dataset(uri)
531+
return ds
532+
533+
534+
@pytest.mark.benchmark(group="geo")
535+
@pytest.mark.parametrize(
536+
"use_index",
537+
(False, True),
538+
ids=["no_index", "with_index"],
539+
)
540+
def test_geo_rtree(test_geo_dataset, benchmark, use_index):
541+
if use_index:
542+
test_geo_dataset.create_scalar_index(
543+
column="points",
544+
index_type="RTREE",
545+
replace=True,
546+
)
547+
548+
print(
549+
test_geo_dataset.scanner(
550+
columns=["points"],
551+
filter="""
552+
St_Contains(points,
553+
ST_GeomFromText('POLYGON (( 0 0, 2 0, 0 2, 2 2, 0 0 ))'))
554+
""",
555+
batch_size=32,
556+
use_scalar_index=use_index,
557+
).explain_plan(True)
558+
)
559+
benchmark(
560+
test_geo_dataset.to_table,
561+
columns=["points"],
562+
filter="""
563+
St_Contains(points,
564+
ST_GeomFromText('POLYGON (( 0 0, 2 0, 0 2, 2 2, 0 0 ))'))
565+
""",
566+
batch_size=32,
567+
use_scalar_index=use_index,
568+
)

python/python/lance/dataset.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2331,6 +2331,7 @@ def create_scalar_index(
23312331
Literal["NGRAM"],
23322332
Literal["ZONEMAP"],
23332333
Literal["BLOOMFILTER"],
2334+
Literal["RTREE"],
23342335
IndexConfig,
23352336
],
23362337
name: Optional[str] = None,
@@ -2416,8 +2417,8 @@ def create_scalar_index(
24162417
or string column.
24172418
index_type : str
24182419
The type of the index. One of ``"BTREE"``, ``"BITMAP"``,
2419-
``"LABEL_LIST"``, ``"NGRAM"``, ``"ZONEMAP"``, ``"INVERTED"``, or
2420-
``"BLOOMFILTER"``.
2420+
``"LABEL_LIST"``, ``"NGRAM"``, ``"ZONEMAP"``, ``"INVERTED"``,
2421+
``"BLOOMFILTER"``, or ``"RTREE"``.
24212422
name : str, optional
24222423
The index name. If not provided, it will be generated from the
24232424
column name.
@@ -2538,11 +2539,12 @@ def create_scalar_index(
25382539
"LABEL_LIST",
25392540
"INVERTED",
25402541
"BLOOMFILTER",
2542+
"RTREE",
25412543
]:
25422544
raise NotImplementedError(
25432545
(
25442546
'Only "BTREE", "BITMAP", "NGRAM", "ZONEMAP", "LABEL_LIST", '
2545-
'"INVERTED", or "BLOOMFILTER" are supported for '
2547+
'"INVERTED", "BLOOMFILTER" or "RTREE" are supported for '
25462548
f"scalar columns. Received {index_type}",
25472549
)
25482550
)

python/python/tests/test_geo.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,52 @@ def test_geo_sql(tmp_path: Path):
104104
assert np.allclose(
105105
np.array(result["dist"]), np.array([2.5495097567963922]), atol=1e-8
106106
)
107+
108+
109+
def test_rtree_index(tmp_path: Path):
110+
# LineStrings
111+
num_lines = 10000
112+
line_offsets = np.arange(num_lines + 1, dtype=np.int32) * 2
113+
linestrings_2d = linestrings(
114+
[np.random.randn(num_lines * 2) * 100, np.random.randn(num_lines * 2) * 100],
115+
line_offsets,
116+
)
117+
assert len(linestrings_2d) == num_lines
118+
119+
schema = pa.schema(
120+
[
121+
pa.field("id", pa.int64()),
122+
pa.field(linestring("xy")).with_name("linestring"),
123+
]
124+
)
125+
table = pa.Table.from_arrays(
126+
[np.arange(num_lines, dtype=np.int64), linestrings_2d], schema=schema
127+
)
128+
ds = lance.write_dataset(table, str(tmp_path / "test_rtree_index.lance"))
129+
130+
def query(ds: lance.LanceDataset, has_index=False):
131+
sql = """
132+
SELECT `id`, linestring
133+
FROM dataset
134+
WHERE
135+
St_Intersects(linestring, ST_GeomFromText('LINESTRING ( 2 0, 0 2 )'))
136+
"""
137+
138+
batches = ds.sql("EXPLAIN ANALYZE " + sql).build().to_batch_records()
139+
explain = pa.Table.from_batches(batches).to_pandas().to_string()
140+
141+
if has_index:
142+
assert "ScalarIndexQuery" in explain
143+
else:
144+
assert "ScalarIndexQuery" not in explain
145+
146+
batches = ds.sql(sql).build().to_batch_records()
147+
return pa.Table.from_batches(batches)
148+
149+
table_without_index = query(ds)
150+
151+
ds.create_scalar_index("linestring", "RTREE")
152+
153+
table_with_index = query(ds, has_index=True)
154+
155+
assert table_with_index == table_without_index

python/src/dataset.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1781,6 +1781,7 @@ impl Dataset {
17811781
"ZONEMAP" => IndexType::ZoneMap,
17821782
"BLOOMFILTER" => IndexType::BloomFilter,
17831783
"LABEL_LIST" => IndexType::LabelList,
1784+
"RTREE" => IndexType::RTree,
17841785
"INVERTED" => IndexType::Inverted,
17851786
"IVF_FLAT" | "IVF_PQ" | "IVF_SQ" | "IVF_RQ" | "IVF_HNSW_FLAT" | "IVF_HNSW_PQ"
17861787
| "IVF_HNSW_SQ" => IndexType::Vector,
@@ -1817,6 +1818,10 @@ impl Dataset {
18171818
index_type: "bloomfilter".to_string(),
18181819
params: None,
18191820
}),
1821+
"RTREE" => Box::new(ScalarIndexParams {
1822+
index_type: "rtree".to_string(),
1823+
params: None,
1824+
}),
18201825
"SCALAR" => {
18211826
let Some(kwargs) = kwargs else {
18221827
return Err(PyValueError::new_err(

rust/lance-geo/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@ datafusion.workspace = true
1616
geoarrow-array.workspace = true
1717
geoarrow-schema.workspace = true
1818
geodatafusion.workspace = true
19+
geo-traits.workspace = true
1920
geo-types.workspace = true
21+
lance-core.workspace = true
22+
serde.workspace = true
2023

2124
[lints]
2225
workspace = true

0 commit comments

Comments
 (0)