Skip to content

Commit 26d842a

Browse files
committed
feat: Add CLI tools for ORC file inspection and manipulation
1 parent 56f1682 commit 26d842a

File tree

8 files changed

+754
-3
lines changed

8 files changed

+754
-3
lines changed

Cargo.toml

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ tokio = { version = "1.28", optional = true, features = [
6565
# cli
6666
anyhow = { version = "1.0", optional = true }
6767
clap = { version = "4.5.4", features = ["derive"], optional = true }
68+
serde = { version = "1.0", features = ["derive"], optional = true }
69+
serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true }
6870

6971
# opendal
7072
opendal = { version = "0.53", optional = true, default-features = false }
@@ -76,13 +78,12 @@ criterion = { version = "0.5", default-features = false, features = ["async_toki
7678
opendal = { version = "0.53", default-features = false, features = ["services-memory"] }
7779
pretty_assertions = "1.3.0"
7880
proptest = "1.0.0"
79-
serde_json = { version = "1.0", default-features = false, features = ["std"] }
8081

8182
[features]
8283
default = ["async"]
8384

8485
async = ["async-trait", "futures", "futures-util", "tokio"]
85-
cli = ["anyhow", "clap"]
86+
cli = ["anyhow", "clap", "serde", "serde_json"]
8687
# Enable opendal support.
8788
opendal = ["dep:opendal"]
8889

@@ -107,3 +108,23 @@ required-features = ["cli"]
107108
[[bin]]
108109
name = "orc-stats"
109110
required-features = ["cli"]
111+
112+
[[bin]]
113+
name = "orc-read"
114+
required-features = ["cli"]
115+
116+
[[bin]]
117+
name = "orc-schema"
118+
required-features = ["cli"]
119+
120+
[[bin]]
121+
name = "orc-rowcount"
122+
required-features = ["cli"]
123+
124+
[[bin]]
125+
name = "orc-index"
126+
required-features = ["cli"]
127+
128+
[[bin]]
129+
name = "orc-layout"
130+
required-features = ["cli"]

src/bin/orc-index.rs

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Inspect row indexes for a specific ORC column.
19+
//!
20+
//! Row indexes carry per-row-group statistics and positions; this tool surfaces
21+
//! them for debugging predicate pushdown and verifying writer-produced indexes.
22+
23+
use std::{fs::File, path::PathBuf};
24+
25+
use anyhow::{anyhow, Context, Result};
26+
use clap::Parser;
27+
use orc_rust::reader::metadata::read_metadata;
28+
use orc_rust::schema::{DataType, RootDataType};
29+
use orc_rust::statistics::{ColumnStatistics, TypeStatistics};
30+
use orc_rust::stripe::Stripe;
31+
32+
// Command-line arguments for this tool, derived via clap's `Parser`.
// NOTE(review): the `///` comments on the fields and the `about` string below
// are presumably picked up by clap as user-facing help text -- confirm before
// rewording them, since that would change the tool's `--help` output.
#[derive(Debug, Parser)]
33+
#[command(
34+
author,
35+
version,
36+
about = "Print row group index information for an ORC column"
37+
)]
38+
struct Args {
39+
/// Path to the ORC file
40+
file: PathBuf,
41+
/// Column name to inspect (top-level columns only)
42+
column: String,
43+
}
44+
45+
/// Look up a top-level column of `root` by exact name.
///
/// Returns the column index reported by the column's data type, a reference
/// to that data type, and the column name as stored in the schema. Yields
/// `None` when no child of `root` is called `name`; the first match wins.
fn find_column<'a>(root: &'a RootDataType, name: &str) -> Option<(usize, &'a DataType, &'a str)> {
    for column in root.children().iter() {
        if column.name() == name {
            let data_type = column.data_type();
            return Some((data_type.column_index(), data_type, column.name()));
        }
    }
    None
}
51+
52+
fn fmt_stats(stats: &ColumnStatistics) -> String {
53+
let mut parts = vec![format!("values={}", stats.number_of_values())];
54+
if stats.has_null() {
55+
parts.push("has_nulls=true".to_string());
56+
}
57+
if let Some(ts) = stats.type_statistics() {
58+
match ts {
59+
TypeStatistics::Integer { min, max, .. } => {
60+
parts.push(format!("min={min}"));
61+
parts.push(format!("max={max}"));
62+
}
63+
TypeStatistics::Double { min, max, .. } => {
64+
parts.push(format!("min={min}"));
65+
parts.push(format!("max={max}"));
66+
}
67+
TypeStatistics::String { min, max, .. } => {
68+
parts.push(format!("min={min}"));
69+
parts.push(format!("max={max}"));
70+
}
71+
TypeStatistics::Bucket { true_count } => {
72+
parts.push(format!("true_count={true_count}"));
73+
}
74+
TypeStatistics::Decimal { min, max, .. } => {
75+
parts.push(format!("min={min}"));
76+
parts.push(format!("max={max}"));
77+
}
78+
TypeStatistics::Date { min, max } => {
79+
parts.push(format!("min={min}"));
80+
parts.push(format!("max={max}"));
81+
}
82+
TypeStatistics::Binary { sum } => {
83+
parts.push(format!("total_bytes={sum}"));
84+
}
85+
TypeStatistics::Timestamp { min, max, .. } => {
86+
parts.push(format!("min={min}"));
87+
parts.push(format!("max={max}"));
88+
}
89+
TypeStatistics::Collection {
90+
min_children,
91+
max_children,
92+
total_children,
93+
} => {
94+
parts.push(format!("min_children={min_children}"));
95+
parts.push(format!("max_children={max_children}"));
96+
parts.push(format!("total_children={total_children}"));
97+
}
98+
}
99+
}
100+
parts.join(", ")
101+
}
102+
103+
fn main() -> Result<()> {
104+
let args = Args::parse();
105+
let mut file = File::open(&args.file)
106+
.with_context(|| format!("failed to open {:?}", args.file.display()))?;
107+
let metadata = read_metadata(&mut file)?;
108+
109+
let Some((column_index, data_type, name)) =
110+
find_column(metadata.root_data_type(), &args.column)
111+
else {
112+
let available = metadata
113+
.root_data_type()
114+
.children()
115+
.iter()
116+
.map(|c| c.name().to_string())
117+
.collect::<Vec<_>>()
118+
.join(", ");
119+
return Err(anyhow!(
120+
"column '{}' not found. Available columns: {available}",
121+
args.column
122+
));
123+
};
124+
125+
println!(
126+
"File: {} | Column: {} (index {})",
127+
args.file.display(),
128+
name,
129+
column_index
130+
);
131+
println!("Type: {data_type}");
132+
println!("Stripes: {}", metadata.stripe_metadatas().len());
133+
134+
for (stripe_idx, stripe_meta) in metadata.stripe_metadatas().iter().enumerate() {
135+
let stripe = Stripe::new(&mut file, &metadata, metadata.root_data_type(), stripe_meta)?;
136+
let row_index = stripe.read_row_indexes(&metadata)?;
137+
138+
let Some(col_index) = row_index.column(column_index) else {
139+
println!("Stripe {stripe_idx}: no row index for column");
140+
continue;
141+
};
142+
143+
if col_index.num_row_groups() == 0 {
144+
println!("Stripe {stripe_idx}: no row groups recorded");
145+
continue;
146+
}
147+
148+
println!(
149+
"Stripe {stripe_idx}: rows_per_group={} total_rows={}",
150+
col_index.rows_per_group(),
151+
row_index.total_rows()
152+
);
153+
for (row_group_idx, entry) in col_index.entries().enumerate() {
154+
let start = row_group_idx * col_index.rows_per_group();
155+
let end = (start + col_index.rows_per_group()).min(row_index.total_rows());
156+
print!(" Row group {row_group_idx} rows [{start},{end})");
157+
if let Some(stats) = &entry.statistics {
158+
println!(" -> {}", fmt_stats(stats));
159+
} else {
160+
println!(" -> no statistics");
161+
}
162+
}
163+
}
164+
165+
Ok(())
166+
}

0 commit comments

Comments
 (0)