Skip to content

Commit 1596a9b

Browse files
authored
Merge pull request #3 from joeirimpan/remove-pdf_tools
Remove pdf tools
2 parents d1e8ca5 + e766378 commit 1596a9b

File tree

2 files changed

+127
-423
lines changed

2 files changed

+127
-423
lines changed

src/main.rs

Lines changed: 127 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,9 @@
1-
mod pdf_tools;
2-
31
use anyhow::{Context, Error};
42
use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
53
use clap::{arg, Command};
64
use csv::Writer;
75
use pdf::content::*;
86
use pdf::file::File as pdfFile;
9-
use pdf_tools::ops_with_text_state;
107
use regex::Regex;
118
use std::fs;
129
use std::fs::File;
@@ -46,155 +43,151 @@ pub fn parse(path: String, _password: String) -> Result<Vec<Transaction>, Error>
4643
// Iterate through pages
4744
for page in file.pages() {
4845
if let Ok(page) = page {
49-
// For the pdf operations, skip till domestic/international transactions and then skip till the first occurrence of date
50-
// This guesses the transactions rows.
51-
let state = ops_with_text_state(&page, &file)
52-
.skip_while(|(op, _text_state)| match op {
53-
Op::TextDraw { ref text } => {
54-
let data = text.as_bytes();
55-
if let Ok(s) = std::str::from_utf8(data) {
56-
return s.trim() != "Domestic Transactions"
57-
&& s.trim() != "International Transactions";
58-
}
59-
return true;
60-
}
61-
_ => return true,
62-
})
63-
.skip_while(|(op, _text_state)| match op {
64-
Op::TextDraw { ref text } => {
65-
let data = text.as_bytes();
66-
if let Ok(s) = std::str::from_utf8(data) {
67-
let parsed_datetime =
68-
NaiveDateTime::parse_from_str(s.trim(), "%d/%m/%Y %H:%M:%S")
69-
.or_else(|_| {
70-
NaiveDate::parse_from_str(s.trim(), "%d/%m/%Y").map(
71-
|date| {
72-
NaiveDateTime::new(
73-
date,
74-
NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
75-
)
76-
},
77-
)
78-
});
79-
match parsed_datetime {
80-
Ok(_) => return false,
81-
Err(_) => return true,
46+
if let Some(content) = &page.contents {
47+
if let Ok(ops) = content.operations(&file) {
48+
let mut transaction = Transaction::default();
49+
50+
let mut found_row = false;
51+
let mut column_ct = 0;
52+
let mut header_assigned = false;
53+
let mut header_column_ct = 0;
54+
let mut prev_value = "";
55+
56+
for op in ops.iter().skip_while(|op| match op {
57+
Op::TextDraw { ref text } => {
58+
let data = text.as_bytes();
59+
if let Ok(s) = std::str::from_utf8(data) {
60+
return s.trim() != "Domestic Transactions"
61+
&& s.trim() != "International Transactions";
8262
}
63+
return true;
8364
}
84-
return true;
85-
}
86-
_ => return true,
87-
});
88-
89-
let mut amt_assigned = false;
90-
let mut col = 0;
91-
let mut found_row = false;
92-
let mut transaction = Transaction::default();
93-
for (op, _text_state) in state {
94-
match op {
95-
Op::TextDraw { ref text } => {
96-
let data = text.as_bytes();
97-
if let Ok(s) = std::str::from_utf8(data) {
98-
let d = s.trim();
99-
if d == "" {
100-
continue;
101-
}
65+
_ => return true,
66+
}) {
67+
match op {
68+
Op::TextDraw { ref text } => {
69+
let data = text.as_bytes();
70+
if let Ok(s) = std::str::from_utf8(data) {
71+
// figure out the header column count from the table header.
72+
// This makes it easier to figure out the end of transaction lines.
73+
let d = s.trim();
74+
if !header_assigned {
75+
// save this value to check in next iteration of Op::BeginText to count header columns.
76+
prev_value = d;
77+
if d == "" {
78+
continue;
79+
}
10280

103-
// try parsing %d/%m/%Y %H:%M:%S / %d/%m/%Y formats
104-
match NaiveDateTime::parse_from_str(d, "%d/%m/%Y %H:%M:%S") {
105-
Ok(dt) => {
106-
// we have transaction here, clone it
107-
if col > 0 {
108-
members.push(transaction.clone());
109-
transaction = Transaction::default();
81+
// XXX: assume the transaction row starts with a date.
82+
let parsed_datetime = NaiveDateTime::parse_from_str(
83+
d,
84+
"%d/%m/%Y %H:%M:%S",
85+
)
86+
.or_else(|_| {
87+
NaiveDate::parse_from_str(d, "%d/%m/%Y").map(
88+
|date| {
89+
NaiveDateTime::new(
90+
date,
91+
NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
92+
)
93+
},
94+
)
95+
});
96+
97+
match parsed_datetime {
98+
Ok(_) => {
99+
header_assigned = true;
100+
// remove card holder name
101+
header_column_ct -= 1;
102+
prev_value = "";
103+
}
104+
Err(_) => continue,
105+
}
110106
}
111107

112-
transaction.date = dt;
113-
found_row = true;
114-
115-
// reset col
116-
col = 0;
117-
}
118-
Err(_) => match NaiveDate::parse_from_str(d, "%d/%m/%Y") {
119-
Ok(dt) => {
120-
// we have transaction here, clone it
121-
if col > 0 {
122-
members.push(transaction.clone());
123-
transaction = Transaction::default();
108+
column_ct += 1;
109+
if d == "" {
110+
if !found_row {
111+
column_ct -= 1;
124112
}
125113

126-
transaction.date = NaiveDateTime::new(
127-
dt,
128-
NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
129-
);
130-
found_row = true;
114+
continue;
115+
}
131116

132-
// reset col
133-
col = 0;
117+
if column_ct == 1 {
118+
if let Ok(tx_date) =
119+
NaiveDateTime::parse_from_str(d, "%d/%m/%Y %H:%M:%S")
120+
{
121+
found_row = true;
122+
transaction.date = tx_date;
123+
continue;
124+
}
125+
if let Ok(tx_date) =
126+
NaiveDate::parse_from_str(d, "%d/%m/%Y")
127+
{
128+
found_row = true;
129+
transaction.date = NaiveDateTime::new(
130+
tx_date,
131+
NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
132+
);
133+
continue;
134+
}
134135
}
135136

136-
Err(_) => {
137-
// Check for the description, amount in the same row where the date was found.
138-
if found_row {
139-
// page end. push the transaction to the list and continue.
140-
if amt_assigned {
141-
if col > 3 {
142-
if let Ok(tx) = String::from_str(s.trim()) {
143-
if tx == "Cr" {
144-
transaction.amount *= -1.0;
145-
}
146-
}
147-
148-
members.push(transaction.clone());
149-
found_row = false;
150-
transaction = Transaction::default();
151-
continue;
152-
}
153-
}
137+
if column_ct > 2 && d.contains(".") {
138+
if let Ok(amt) = d.replace(",", "").parse::<f32>() {
139+
transaction.amount = amt * -1.0;
140+
continue;
141+
}
142+
}
154143

155-
col += 1;
144+
// Must be description or debit/credit representation or reward points
145+
if let Ok(tx) = String::from_str(d) {
146+
// skip empty string
147+
if tx == "" {
148+
continue;
149+
}
156150

157-
// Must be amount?
158-
if col > 1 && d.contains(".") {
159-
if let Ok(amt) = d.replace(",", "").parse::<f32>() {
160-
amt_assigned = true;
161-
transaction.amount = amt * -1.0;
162-
continue;
163-
}
164-
}
151+
// skip reward points
152+
if let Ok(p) = tx.replace("- ", "-").parse::<i32>() {
153+
transaction.points = p;
154+
continue;
155+
}
165156

166-
// Must be description or debit/credit representation or reward points
167-
if let Ok(tx) = String::from_str(s.trim()) {
168-
// skip empty string
169-
if tx == "" {
170-
continue;
171-
}
172-
173-
// skip reward points
174-
if let Ok(p) = tx.replace("- ", "-").parse::<i32>()
175-
{
176-
transaction.points = p;
177-
continue;
178-
}
179-
180-
// mark it as credit
181-
if col > 2 && tx == "Cr" {
182-
transaction.amount *= -1.0;
183-
continue;
184-
}
185-
186-
// assume transaction description to be next to date
187-
if col == 1 {
188-
transaction.tx = tx;
189-
}
190-
}
157+
// mark it as credit
158+
if column_ct > 3 && tx == "Cr" {
159+
transaction.amount *= -1.0;
160+
continue;
161+
}
162+
163+
// assume transaction description to be next to date
164+
if column_ct == 2 {
165+
transaction.tx = tx;
191166
}
192167
}
193-
},
168+
}
169+
}
170+
171+
Op::BeginText => {
172+
if !header_assigned && prev_value != "" {
173+
header_column_ct += 1;
174+
}
175+
}
176+
177+
Op::EndText => {
178+
if found_row && column_ct == header_column_ct {
179+
// push transaction here
180+
members.push(transaction.clone());
181+
182+
// reset found flag
183+
found_row = false;
184+
transaction = Transaction::default();
185+
column_ct = 0;
186+
}
194187
}
188+
_ => {}
195189
}
196190
}
197-
_ => {}
198191
}
199192
}
200193
}

0 commit comments

Comments
 (0)