Skip to content

Commit 3fa4f33

Browse files
committed
Add script and CI to ensure conversion is complete.
1 parent 39e59d7 commit 3fa4f33

File tree

2,793 files changed

+85448
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

2,793 files changed

+85448
-0
lines changed

.devcontainer/Dockerfile

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
FROM python:3.11
2+
3+
# Install sudo first
4+
RUN apt-get update && apt-get install -y sudo && rm -rf /var/lib/apt/lists/*
5+
6+
# Create vscode user
7+
# Passwordless sudo is granted through a sudoers.d drop-in (mode 0440) rather than by appending to /etc/sudoers
RUN groupadd --gid 1000 vscode \
8+
&& useradd --uid 1000 --gid 1000 -m -s /bin/bash vscode \
9+
&& echo "vscode ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/vscode && chmod 0440 /etc/sudoers.d/vscode
10+
11+
# Copy and run system dependencies install script
12+
COPY scripts/install-system-deps.sh /tmp/
13+
RUN bash /tmp/install-system-deps.sh && rm /tmp/install-system-deps.sh

.devcontainer/devcontainer.json

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
{
2+
"name": "PDF Table Extraction",
3+
"image": "python:3.11",
4+
"features": {
5+
"ghcr.io/devcontainers/features/common-utils:2": {
6+
"installZsh": false,
7+
"installOhMyZsh": false,
8+
"upgradePackages": false,
9+
"username": "vscode",
10+
"userUid": "1000",
11+
"userGid": "1000"
12+
}
13+
},
14+
"remoteUser": "vscode",
15+
"customizations": {
16+
"vscode": {
17+
"extensions": [
18+
"ms-python.python",
19+
"ms-python.vscode-pylance"
20+
]
21+
}
22+
},
23+
"onCreateCommand": "bash .devcontainer/on-create-command.sh"
24+
}

.devcontainer/on-create-command.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
2+
set -e
3+
4+
# Install system dependencies
5+
bash scripts/install-system-deps.sh
6+
7+
# Install Python dependencies
8+
pip install --no-cache-dir -r requirements.txt
9+
10+
# Install the package in editable mode
11+
pip install -e .
12+

.github/workflows/ci.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
name: CI
2+
3+
on:
4+
push:
5+
branches: [ main ]
6+
pull_request:
7+
branches: [ main ]
8+
9+
jobs:
10+
test:
11+
runs-on: ubuntu-latest
12+
13+
steps:
14+
- uses: actions/checkout@v4
15+
with:
16+
submodules: recursive
17+
18+
- name: Set up Python
19+
uses: actions/setup-python@v5
20+
with:
21+
python-version: '3.11'
22+
23+
- name: Install system dependencies
24+
run: bash scripts/install-system-deps.sh
25+
26+
- name: Install Python dependencies
27+
run: make install
28+
29+
- name: Run all checks
30+
run: make check

.gitignore

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
share/python-wheels/
24+
*.egg-info/
25+
.installed.cfg
26+
*.egg
27+
MANIFEST
28+
29+
# PyInstaller
30+
*.manifest
31+
*.spec
32+
33+
# Installer logs
34+
pip-log.txt
35+
pip-delete-this-directory.txt
36+
37+
# Unit test / coverage reports
38+
htmlcov/
39+
.tox/
40+
.nox/
41+
.coverage
42+
.coverage.*
43+
.cache
44+
nosetests.xml
45+
coverage.xml
46+
*.cover
47+
*.py,cover
48+
.hypothesis/
49+
.pytest_cache/
50+
cover/
51+
52+
# Translations
53+
*.mo
54+
*.pot
55+
56+
# Django stuff:
57+
*.log
58+
local_settings.py
59+
db.sqlite3
60+
db.sqlite3-journal
61+
62+
# Flask stuff:
63+
instance/
64+
.webassets-cache
65+
66+
# Scrapy stuff:
67+
.scrapy
68+
69+
# Sphinx documentation
70+
docs/_build/
71+
72+
# PyBuilder
73+
.pybuilder/
74+
target/
75+
76+
# Jupyter Notebook
77+
.ipynb_checkpoints
78+
79+
# IPython
80+
profile_default/
81+
ipython_config.py
82+
83+
# pyenv
84+
.python-version
85+
86+
# pipenv
87+
Pipfile.lock
88+
89+
# poetry
90+
poetry.lock
91+
92+
# pdm
93+
.pdm.toml
94+
95+
# PEP 582
96+
__pypackages__/
97+
98+
# Celery stuff
99+
celerybeat-schedule
100+
celerybeat.pid
101+
102+
# SageMath parsed files
103+
*.sage.py
104+
105+
# Environments
106+
.env
107+
.venv
108+
env/
109+
venv/
110+
ENV/
111+
env.bak/
112+
venv.bak/
113+
114+
# Spyder project settings
115+
.spyderproject
116+
.spyproject
117+
118+
# Rope project settings
119+
.ropeproject
120+
121+
# mkdocs documentation
122+
/site
123+
124+
# mypy
125+
.mypy_cache/
126+
.dmypy.json
127+
dmypy.json
128+
129+
# Pyre type checker
130+
.pyre/
131+
132+
# pytype static type analyzer
133+
.pytype/
134+
135+
# Cython debug symbols
136+
cython_debug/
137+
138+
# IDE
139+
.vscode/
140+
.idea/
141+
*.swp
142+
*.swo
143+
*~
144+
145+
# OS
146+
.DS_Store
147+
Thumbs.db

Makefile

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# All targets in this Makefile are command names rather than files, including the aggregate `check`.
.PHONY: help install test lint format type-check clean validate check
2+
3+
help: ## Show this help
4+
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
5+
6+
install: ## Install dependencies
7+
@bash scripts/install.sh
8+
9+
test: ## Run tests with coverage
10+
pytest
11+
12+
lint: ## Run linter
13+
ruff check .
14+
15+
format: ## Format code
16+
ruff format .
17+
ruff check --fix .
18+
19+
type-check: ## Run type checker
20+
mypy .
21+
22+
validate: ## Validate that all PDFs have been processed (for CI)
23+
extract-tables -f csv -r --validate data csv
24+
25+
check: lint type-check test validate ## Run all checks (lint, type-check, test, validate)
26+
27+
clean: ## Clean temporary files
28+
rm -rf .pytest_cache .mypy_cache .ruff_cache htmlcov .coverage
29+
find . -type d -name __pycache__ -exec rm -rf {} +
30+
find . -type f -name '*.pyc' -delete

README.md

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,103 @@
11
# reports
2+
3+
## PDF Table Extraction
4+
5+
This project provides a flexible tool to extract tables from PDF files using Camelot.
6+
7+
### Quick Start with Dev Container
8+
9+
1. Open this project in VS Code
10+
2. When prompted, click "Reopen in Container" (or use Command Palette: "Dev Containers: Reopen in Container")
11+
3. Wait for the container to build (dependencies install automatically via `onCreateCommand`)
12+
4. If the `extract-tables` command is not available (for example, after rebuilding the container), reinstall the package:
13+
```bash
14+
make install
15+
```
16+
5. Run the CLI tool:
17+
```bash
18+
extract-tables data/ csv/ --recursive
19+
```
20+
21+
### Manual Setup
22+
23+
If you prefer not to use the dev container:
24+
25+
```bash
26+
# Clone with submodules
27+
git clone --recurse-submodules <repository-url>
28+
29+
# Or if already cloned, initialize submodules
30+
git submodule update --init --recursive
31+
32+
# Install system dependencies (Ubuntu/Debian)
33+
bash scripts/install-system-deps.sh
34+
35+
# Install Python dependencies and package
36+
bash scripts/install.sh
37+
```
38+
39+
### Usage
40+
41+
The package installs a CLI command `extract-tables`:
42+
43+
```bash
44+
# Basic usage: extract to CSV
45+
extract-tables data/ csv/
46+
47+
# Extract to JSON format
48+
extract-tables data/ output/ --format json
49+
50+
# Use lattice flavor (for PDFs with clear table borders)
51+
extract-tables data/ csv/ --flavor lattice
52+
53+
# Process subdirectories recursively
54+
extract-tables data/ csv/ --recursive
55+
56+
# Validate all PDFs have been processed (for CI)
57+
extract-tables -f csv -r --validate data csv
58+
59+
# See all options
60+
extract-tables --help
61+
```
62+
63+
Or use the Python module directly:
64+
65+
```bash
66+
python -m pdf_table_extractor.extract_tables data/ csv/ --recursive
67+
```
68+
69+
### Supported Output Formats
70+
71+
- **csv**: CSV files (one per table)
72+
- **json**: JSON format
73+
- **excel**: Excel spreadsheet (.xlsx)
74+
- **html**: HTML table
75+
- **markdown**: Markdown table
76+
- **sqlite**: SQLite database
77+
78+
### Camelot Flavors
79+
80+
- **stream** (default): Best for PDFs without clear table borders
81+
- **lattice**: Best for PDFs with visible table lines
82+
83+
### CI/CD Validation
84+
85+
To validate that all PDFs have been processed (without actually processing them), use the `--validate` flag:
86+
87+
```bash
88+
extract-tables -f csv -r --validate data csv
89+
```
90+
91+
This is useful in CI to ensure the extraction has been run before committing. It:
92+
- Checks metadata to verify all PDFs are processed
93+
- Exits with code 1 if any PDFs are unprocessed
94+
- Runs in <1 second (doesn't process PDFs)
95+
- Uses the same validation logic as the main tool
96+
97+
Example Makefile target:
98+
```makefile
99+
validate:
100+
extract-tables -f csv -r --validate data csv
101+
```
102+
103+
See `.github/workflows/ci.yml` for a GitHub Actions example.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"size": 534351,
3+
"mtime": 1763928703.3296146,
4+
"hash": "8c6347609dafeb41c96ef188ee2404e96d2e7e159682eab5abab0d1c3024a636"
5+
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
"","","SUBJECT: AGRICULTURAL AND APPLIED ECONOMICS (108)","","","","","","","","","","","",""
2+
"","","TERM:","1066","","","","","","","","","","",""
3+
"","","","","","","","","","","FACILITY_ID","COMB_ENRL","ENRL_TOT","EMPLID","INSTRUCTOR ROLE/NAME"
4+
"ALS","AGRICULTURAL AND APPLIED ECONOMICS","","","","","","","","","","","","",""
5+
"DHH","299","IND","061","1","-","","","","","","","1","0002600954","JONES,BRUCE L"
6+
"DHH","299","IND","072","1","-","","","","","","","2","0002601009","CROPP,ROBERT A"
7+
"ACC","306","LEC","001","1","09:00 - 12:00","M","T","W R","","0140 1175","50","4","0000875079","JOHNSON,MICHAEL DEAN"
8+
"DHH","306","LEC","002","1","10:20 - 11:35","M","T","W R","","0140 1195","10","1","0001629408","CHO,HOON"
9+
"DHH","350","LEC","001","1","-","","","","","","92","28","0000438122","THURLOW,JULIE POH"
10+
"DJJ","399","IND","061","1","-","","","","","","","1","0002600954","JONES,BRUCE L"
11+
"DHH","699","IND","051","1","-","","","","","","","1","0002601604","JOHNSON,MARVIN B"
12+
"DCC","875","SEM","001","1","08:30 - 11:30","M","T","W R","F","0464 0B30","","6","0002601604","JOHNSON,MARVIN B"
13+
"DCC","875","SEM","002","1","13:00 - 16:00","M","T","W R","F","0464 0B30","","6","0002601604","JOHNSON,MARVIN B"
14+
"DHH","990","IND","046","1","-","","","","","","","2","0002601875","BISHOP,RICHARD C"
15+
"DHH","990","IND","059","1","-","","","","","","","2","0002601756","CHAVAS,JEAN-PAUL"
16+
"DHH","990","IND","063","1","-","","","","","","","3","0000226384","CARTER,MICHAEL ROSS"
17+
"DHH","990","IND","073","1","-","","","","","","","4","0002601457","COXHEAD,IAN A"
18+
"DHH","990","IND","074","1","-","","","","","","","1","0002600381","PROVENCHER,ROBERT W"
19+
"DHH","990","IND","078","1","-","","","","","","","1","0002602152","FORTENBERY,T RANDALL"
20+
"DHH","990","IND","080","1","-","","","","","","","2","0002602096","BARHAM,BRADFORD L"
21+
"DHH","990","IND","082","1","-","","","","","","","3","0000125075","FOLTZ,JEREMY DAVID"
22+
"DHH","990","IND","083","1","-","","","","","","","5","0003374884","STIEGERT,KYLE W"
23+
"DHH","990","IND","086","1","-","","","","","","","1","0000933523","MITCHELL,PAUL DAVID"
24+
"DHH","990","IND","087","1","-","","","","","","","1","0004111956","LEWIS,DAVID J"

0 commit comments

Comments
 (0)