|
1 | | -mod pdf_tools; |
2 | | - |
3 | 1 | use anyhow::{Context, Error}; |
4 | 2 | use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; |
5 | 3 | use clap::{arg, Command}; |
6 | 4 | use csv::Writer; |
7 | 5 | use pdf::content::*; |
8 | 6 | use pdf::file::File as pdfFile; |
9 | | -use pdf_tools::ops_with_text_state; |
10 | 7 | use regex::Regex; |
11 | 8 | use std::fs; |
12 | 9 | use std::fs::File; |
@@ -46,155 +43,151 @@ pub fn parse(path: String, _password: String) -> Result<Vec<Transaction>, Error> |
46 | 43 | // Iterate through pages |
47 | 44 | for page in file.pages() { |
48 | 45 | if let Ok(page) = page { |
49 | | - // For the pdf operations, skip till domestic/international transactions and then skip till the first occurrence of a date |
50 | | - // This guesses the transactions rows. |
51 | | - let state = ops_with_text_state(&page, &file) |
52 | | - .skip_while(|(op, _text_state)| match op { |
53 | | - Op::TextDraw { ref text } => { |
54 | | - let data = text.as_bytes(); |
55 | | - if let Ok(s) = std::str::from_utf8(data) { |
56 | | - return s.trim() != "Domestic Transactions" |
57 | | - && s.trim() != "International Transactions"; |
58 | | - } |
59 | | - return true; |
60 | | - } |
61 | | - _ => return true, |
62 | | - }) |
63 | | - .skip_while(|(op, _text_state)| match op { |
64 | | - Op::TextDraw { ref text } => { |
65 | | - let data = text.as_bytes(); |
66 | | - if let Ok(s) = std::str::from_utf8(data) { |
67 | | - let parsed_datetime = |
68 | | - NaiveDateTime::parse_from_str(s.trim(), "%d/%m/%Y %H:%M:%S") |
69 | | - .or_else(|_| { |
70 | | - NaiveDate::parse_from_str(s.trim(), "%d/%m/%Y").map( |
71 | | - |date| { |
72 | | - NaiveDateTime::new( |
73 | | - date, |
74 | | - NaiveTime::from_hms_opt(0, 0, 0).unwrap(), |
75 | | - ) |
76 | | - }, |
77 | | - ) |
78 | | - }); |
79 | | - match parsed_datetime { |
80 | | - Ok(_) => return false, |
81 | | - Err(_) => return true, |
| 46 | + if let Some(content) = &page.contents { |
| 47 | + if let Ok(ops) = content.operations(&file) { |
| 48 | + let mut transaction = Transaction::default(); |
| 49 | + |
| 50 | + let mut found_row = false; |
| 51 | + let mut column_ct = 0; |
| 52 | + let mut header_assigned = false; |
| 53 | + let mut header_column_ct = 0; |
| 54 | + let mut prev_value = ""; |
| 55 | + |
| 56 | + for op in ops.iter().skip_while(|op| match op { |
| 57 | + Op::TextDraw { ref text } => { |
| 58 | + let data = text.as_bytes(); |
| 59 | + if let Ok(s) = std::str::from_utf8(data) { |
| 60 | + return s.trim() != "Domestic Transactions" |
| 61 | + && s.trim() != "International Transactions"; |
82 | 62 | } |
| 63 | + return true; |
83 | 64 | } |
84 | | - return true; |
85 | | - } |
86 | | - _ => return true, |
87 | | - }); |
88 | | - |
89 | | - let mut amt_assigned = false; |
90 | | - let mut col = 0; |
91 | | - let mut found_row = false; |
92 | | - let mut transaction = Transaction::default(); |
93 | | - for (op, _text_state) in state { |
94 | | - match op { |
95 | | - Op::TextDraw { ref text } => { |
96 | | - let data = text.as_bytes(); |
97 | | - if let Ok(s) = std::str::from_utf8(data) { |
98 | | - let d = s.trim(); |
99 | | - if d == "" { |
100 | | - continue; |
101 | | - } |
| 65 | + _ => return true, |
| 66 | + }) { |
| 67 | + match op { |
| 68 | + Op::TextDraw { ref text } => { |
| 69 | + let data = text.as_bytes(); |
| 70 | + if let Ok(s) = std::str::from_utf8(data) { |
| 71 | + // figure out the header column count from the table header. |
| 72 | + // This makes it easier to figure out the end of transaction lines. |
| 73 | + let d = s.trim(); |
| 74 | + if !header_assigned { |
| 75 | + // save this value to check in next iteration of Op::BeginText to count header columns. |
| 76 | + prev_value = d; |
| 77 | + if d == "" { |
| 78 | + continue; |
| 79 | + } |
102 | 80 |
|
103 | | - // try parsing %d/%m/%Y %H:%M:%S / %d/%m/%Y formats |
104 | | - match NaiveDateTime::parse_from_str(d, "%d/%m/%Y %H:%M:%S") { |
105 | | - Ok(dt) => { |
106 | | - // we have transaction here, clone it |
107 | | - if col > 0 { |
108 | | - members.push(transaction.clone()); |
109 | | - transaction = Transaction::default(); |
| 81 | + // XXX: assume the transaction row starts with a date. |
| 82 | + let parsed_datetime = NaiveDateTime::parse_from_str( |
| 83 | + d, |
| 84 | + "%d/%m/%Y %H:%M:%S", |
| 85 | + ) |
| 86 | + .or_else(|_| { |
| 87 | + NaiveDate::parse_from_str(d, "%d/%m/%Y").map( |
| 88 | + |date| { |
| 89 | + NaiveDateTime::new( |
| 90 | + date, |
| 91 | + NaiveTime::from_hms_opt(0, 0, 0).unwrap(), |
| 92 | + ) |
| 93 | + }, |
| 94 | + ) |
| 95 | + }); |
| 96 | + |
| 97 | + match parsed_datetime { |
| 98 | + Ok(_) => { |
| 99 | + header_assigned = true; |
| 100 | + // remove card holder name |
| 101 | + header_column_ct -= 1; |
| 102 | + prev_value = ""; |
| 103 | + } |
| 104 | + Err(_) => continue, |
| 105 | + } |
110 | 106 | } |
111 | 107 |
|
112 | | - transaction.date = dt; |
113 | | - found_row = true; |
114 | | - |
115 | | - // reset col |
116 | | - col = 0; |
117 | | - } |
118 | | - Err(_) => match NaiveDate::parse_from_str(d, "%d/%m/%Y") { |
119 | | - Ok(dt) => { |
120 | | - // we have transaction here, clone it |
121 | | - if col > 0 { |
122 | | - members.push(transaction.clone()); |
123 | | - transaction = Transaction::default(); |
| 108 | + column_ct += 1; |
| 109 | + if d == "" { |
| 110 | + if !found_row { |
| 111 | + column_ct -= 1; |
124 | 112 | } |
125 | 113 |
|
126 | | - transaction.date = NaiveDateTime::new( |
127 | | - dt, |
128 | | - NaiveTime::from_hms_opt(0, 0, 0).unwrap(), |
129 | | - ); |
130 | | - found_row = true; |
| 114 | + continue; |
| 115 | + } |
131 | 116 |
|
132 | | - // reset col |
133 | | - col = 0; |
| 117 | + if column_ct == 1 { |
| 118 | + if let Ok(tx_date) = |
| 119 | + NaiveDateTime::parse_from_str(d, "%d/%m/%Y %H:%M:%S") |
| 120 | + { |
| 121 | + found_row = true; |
| 122 | + transaction.date = tx_date; |
| 123 | + continue; |
| 124 | + } |
| 125 | + if let Ok(tx_date) = |
| 126 | + NaiveDate::parse_from_str(d, "%d/%m/%Y") |
| 127 | + { |
| 128 | + found_row = true; |
| 129 | + transaction.date = NaiveDateTime::new( |
| 130 | + tx_date, |
| 131 | + NaiveTime::from_hms_opt(0, 0, 0).unwrap(), |
| 132 | + ); |
| 133 | + continue; |
| 134 | + } |
134 | 135 | } |
135 | 136 |
|
136 | | - Err(_) => { |
137 | | - // Check for the description and amount in the same row where the date was found. |
138 | | - if found_row { |
139 | | - // page end. push the transaction to the list and continue. |
140 | | - if amt_assigned { |
141 | | - if col > 3 { |
142 | | - if let Ok(tx) = String::from_str(s.trim()) { |
143 | | - if tx == "Cr" { |
144 | | - transaction.amount *= -1.0; |
145 | | - } |
146 | | - } |
147 | | - |
148 | | - members.push(transaction.clone()); |
149 | | - found_row = false; |
150 | | - transaction = Transaction::default(); |
151 | | - continue; |
152 | | - } |
153 | | - } |
| 137 | + if column_ct > 2 && d.contains(".") { |
| 138 | + if let Ok(amt) = d.replace(",", "").parse::<f32>() { |
| 139 | + transaction.amount = amt * -1.0; |
| 140 | + continue; |
| 141 | + } |
| 142 | + } |
154 | 143 |
|
155 | | - col += 1; |
| 144 | + // Must be description or debit/credit representation or reward points |
| 145 | + if let Ok(tx) = String::from_str(d) { |
| 146 | + // skip empty string |
| 147 | + if tx == "" { |
| 148 | + continue; |
| 149 | + } |
156 | 150 |
|
157 | | - // Must be amount? |
158 | | - if col > 1 && d.contains(".") { |
159 | | - if let Ok(amt) = d.replace(",", "").parse::<f32>() { |
160 | | - amt_assigned = true; |
161 | | - transaction.amount = amt * -1.0; |
162 | | - continue; |
163 | | - } |
164 | | - } |
| 151 | + // reward points: an integer cell (possibly written as "- N"); store it and move on |
| 152 | + if let Ok(p) = tx.replace("- ", "-").parse::<i32>() { |
| 153 | + transaction.points = p; |
| 154 | + continue; |
| 155 | + } |
165 | 156 |
|
166 | | - // Must be description or debit/credit representation or reward points |
167 | | - if let Ok(tx) = String::from_str(s.trim()) { |
168 | | - // skip empty string |
169 | | - if tx == "" { |
170 | | - continue; |
171 | | - } |
172 | | - |
173 | | - // skip reward points |
174 | | - if let Ok(p) = tx.replace("- ", "-").parse::<i32>() |
175 | | - { |
176 | | - transaction.points = p; |
177 | | - continue; |
178 | | - } |
179 | | - |
180 | | - // mark it as credit |
181 | | - if col > 2 && tx == "Cr" { |
182 | | - transaction.amount *= -1.0; |
183 | | - continue; |
184 | | - } |
185 | | - |
186 | | - // assume transaction description to be next to date |
187 | | - if col == 1 { |
188 | | - transaction.tx = tx; |
189 | | - } |
190 | | - } |
| 157 | + // mark it as credit |
| 158 | + if column_ct > 3 && tx == "Cr" { |
| 159 | + transaction.amount *= -1.0; |
| 160 | + continue; |
| 161 | + } |
| 162 | + |
| 163 | + // assume transaction description to be next to date |
| 164 | + if column_ct == 2 { |
| 165 | + transaction.tx = tx; |
191 | 166 | } |
192 | 167 | } |
193 | | - }, |
| 168 | + } |
| 169 | + } |
| 170 | + |
| 171 | + Op::BeginText => { |
| 172 | + if !header_assigned && prev_value != "" { |
| 173 | + header_column_ct += 1; |
| 174 | + } |
| 175 | + } |
| 176 | + |
| 177 | + Op::EndText => { |
| 178 | + if found_row && column_ct == header_column_ct { |
| 179 | + // push transaction here |
| 180 | + members.push(transaction.clone()); |
| 181 | + |
| 182 | + // reset found flag |
| 183 | + found_row = false; |
| 184 | + transaction = Transaction::default(); |
| 185 | + column_ct = 0; |
| 186 | + } |
194 | 187 | } |
| 188 | + _ => {} |
195 | 189 | } |
196 | 190 | } |
197 | | - _ => {} |
198 | 191 | } |
199 | 192 | } |
200 | 193 | } |
|
0 commit comments