|
13 | 13 | }, |
14 | 14 | { |
15 | 15 | "cell_type": "code", |
16 | | - "execution_count": null, |
| 16 | + "execution_count": 1, |
17 | 17 | "metadata": {}, |
18 | 18 | "outputs": [ |
19 | 19 | { |
20 | 20 | "name": "stderr", |
21 | 21 | "output_type": "stream", |
22 | 22 | "text": [ |
23 | | - "Processing Fields: 100%|██████████| 6/6 [02:01<00:00, 20.26s/it]\n" |
| 23 | + "Processing Fields: 100%|██████████| 6/6 [00:23<00:00, 3.90s/it]\n" |
24 | 24 | ] |
25 | 25 | } |
26 | 26 | ], |
|
30 | 30 | "import os\n", |
31 | 31 | "import pandas as pd\n", |
32 | 32 | "\n", |
33 | | - "# Read unclean / directly scraped reports from file\n", |
| 33 | + "# Read unclean reports from file (these were scraped with the Scraper class)\n", |
34 | 34 | "unclean_reports = pd.read_csv('../data/testreports.csv')\n", |
35 | 35 | "\n", |
36 | 36 | "# Get API key\n", |
37 | 37 | "load_dotenv(\"api.env\")\n", |
38 | 38 | "openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n", |
39 | 39 | "\n", |
40 | | - "# Set up API client\n", |
41 | | - "llm_client = LLM(api_key=openai_api_key,\n", |
42 | | - " model=\"gpt-4.1-mini\")\n", |
| 40 | + "# Set up LLM client\n", |
| 41 | + "llm_client = LLM(api_key=openai_api_key, \n", |
| 42 | + " max_workers=50)\n", |
43 | 43 | "\n", |
44 | | - "# Run cleaner (below, we use minimal parameters. In practice, the user can 'turn off' cleaning for a given column)\n", |
| 44 | + "# Run cleaner\n", |
45 | 45 | "cleaner = Cleaner(\n", |
46 | 46 | " llm=llm_client,\n", |
47 | 47 | " reports=unclean_reports)\n", |
|
52 | 52 | }, |
53 | 53 | { |
54 | 54 | "cell_type": "code", |
55 | | - "execution_count": 5, |
56 | | - "metadata": {}, |
57 | | - "outputs": [], |
58 | | - "source": [ |
59 | | - "cleaned_reports.head()\n", |
60 | | - "\n", |
61 | | - "cleaned_reports.to_csv('../data/testreports_cleaned.csv')" |
62 | | - ] |
63 | | - }, |
64 | | - { |
65 | | - "cell_type": "markdown", |
66 | | - "metadata": {}, |
67 | | - "source": [ |
68 | | - "Below, we can see the output of our cleaning instance:" |
69 | | - ] |
70 | | - }, |
71 | | - { |
72 | | - "cell_type": "code", |
73 | | - "execution_count": 6, |
| 55 | + "execution_count": 2, |
74 | 56 | "metadata": {}, |
75 | 57 | "outputs": [ |
76 | 58 | { |
|
116 | 98 | " <td>L. Brown</td>\n", |
117 | 99 | " <td>West London</td>\n", |
118 | 100 | " <td>Revon Healthcare</td>\n", |
119 | | - " <td>On 18 December 2023 I commenced an investigati...</td>\n", |
120 | | - " <td>James was found deceased in his room at Surbit...</td>\n", |
| 101 | + " <td>On 18th December 2023 I commenced an investiga...</td>\n", |
| 102 | + " <td>James was found deceased in his room at [REDAC...</td>\n", |
121 | 103 | " <td>(1) During the inquest the court was advised t...</td>\n", |
122 | 104 | " </tr>\n", |
123 | 105 | " <tr>\n", |
|
131 | 113 | " <td>HMPPS</td>\n", |
132 | 114 | " <td>N/A: Not found</td>\n", |
133 | 115 | " <td>During the course of the inquest the court hea...</td>\n", |
134 | | - " <td>a Probation staff are not always aware of or h...</td>\n", |
| 116 | + " <td>Probation staff are not always aware of or hav...</td>\n", |
135 | 117 | " </tr>\n", |
136 | 118 | " <tr>\n", |
137 | 119 | " <th>2</th>\n", |
|
154 | 136 | " <td>2025-02-28</td>\n", |
155 | 137 | " <td>A. Cox</td>\n", |
156 | 138 | " <td>Cornwall and the Isles of Scilly</td>\n", |
157 | | - " <td>MP; Secretary of State for Health and Social Care</td>\n", |
| 139 | + " <td>MP; Secretary of State for Health & Social Care</td>\n", |
158 | 140 | " <td>On 27 February 2025, I concluded a four-day ju...</td>\n", |
159 | 141 | " <td>Despite appropriate treatment by paramedics an...</td>\n", |
160 | 142 | " <td>Delay in ambulance response attributable to de...</td>\n", |
|
167 | 149 | " <td>2025-02-28</td>\n", |
168 | 150 | " <td>A. Cox</td>\n", |
169 | 151 | " <td>Cornwall and the Isles of Scilly</td>\n", |
170 | | - " <td>Chief Constable Devon & Cornwall Constabulary;...</td>\n", |
171 | | - " <td>On 27 February 2025, I concluded a four-day ju...</td>\n", |
| 152 | + " <td>Chief Constable, Devon & Cornwall Constabulary...</td>\n", |
| 153 | + " <td>On 27th February 2025, I concluded a four-day ...</td>\n", |
172 | 154 | " <td>Mr Campbell had a history of recreational drug...</td>\n", |
173 | | - " <td>1) Delays in ambulance attendance. I have writ...</td>\n", |
| 155 | + " <td>Delays in ambulance attendance. Information sh...</td>\n", |
174 | 156 | " </tr>\n", |
175 | 157 | " <tr>\n", |
176 | 158 | " <th>5</th>\n", |
|
179 | 161 | " <td>2025-0113</td>\n", |
180 | 162 | " <td>2025-02-28</td>\n", |
181 | 163 | " <td>H. Westerman</td>\n", |
182 | | - " <td>Shropshire, Telford and Wrekin</td>\n", |
| 164 | + " <td>Shropshire, Telford & Wrekin</td>\n", |
183 | 165 | " <td>NHS England; Chief Executive of Shrewsbury and...</td>\n", |
184 | 166 | " <td>On 12 July 2023 Mr Ellery, H.M. Senior Coroner...</td>\n", |
185 | 167 | " <td>Mr Green was admitted to The Royal Shrewsbury ...</td>\n", |
|
207 | 189 | " <td>J. Turner</td>\n", |
208 | 190 | " <td>West Sussex, Brighton and Hove</td>\n", |
209 | 191 | " <td>Ministry of Defence</td>\n", |
210 | | - " <td>On 01 November 2023 I commenced an investigati...</td>\n", |
| 192 | + " <td>On 1 November 2023 I commenced an investigatio...</td>\n", |
211 | 193 | " <td>Mr Taylor had rapidly fallen into drug addicti...</td>\n", |
212 | 194 | " <td>When found to have taken illicit drugs months ...</td>\n", |
213 | 195 | " </tr>\n", |
|
222 | 204 | " <td>Secretary of State Department of Health and So...</td>\n", |
223 | 205 | " <td>On 13th May 2024 I commenced an investigation ...</td>\n", |
224 | 206 | " <td>Kim Robinson's death was recognised at 05:16 o...</td>\n", |
225 | | - " <td>1. Following Kim's tragic death the GP who had...</td>\n", |
| 207 | + " <td>Following Kim's tragic death the GP who had pr...</td>\n", |
226 | 208 | " </tr>\n", |
227 | 209 | " <tr>\n", |
228 | 210 | " <th>9</th>\n", |
|
235 | 217 | " <td>NHS England</td>\n", |
236 | 218 | " <td>On 1 July 2021 an investigation into the death...</td>\n", |
237 | 219 | " <td>Mr Marriage had a longstanding diagnosis of id...</td>\n", |
238 | | - " <td>(1) There are cohorts of patients who are medi...</td>\n", |
| 220 | + " <td>There are cohorts of patients who are medicati...</td>\n", |
239 | 221 | " </tr>\n", |
240 | 222 | " </tbody>\n", |
241 | 223 | "</table>\n", |
|
260 | 242 | "2 2025-04-03 N. Walker Hampshire, Portsmouth and Southampton \n", |
261 | 243 | "3 2025-02-28 A. Cox Cornwall and the Isles of Scilly \n", |
262 | 244 | "4 2025-02-28 A. Cox Cornwall and the Isles of Scilly \n", |
263 | | - "5 2025-02-28 H. Westerman Shropshire, Telford and Wrekin \n", |
| 245 | + "5 2025-02-28 H. Westerman Shropshire, Telford & Wrekin \n", |
264 | 246 | "6 2025-02-27 R. Middleton Dorset \n", |
265 | 247 | "7 2025-01-31 J. Turner West Sussex, Brighton and Hove \n", |
266 | 248 | "8 2025-01-31 N. Parsley Suffolk \n", |
|
270 | 252 | "0 Revon Healthcare \n", |
271 | 253 | "1 HMPPS \n", |
272 | 254 | "2 National Institute for Health and Care Excelle... \n", |
273 | | - "3 MP; Secretary of State for Health and Social Care \n", |
274 | | - "4 Chief Constable Devon & Cornwall Constabulary;... \n", |
| 255 | + "3 MP; Secretary of State for Health & Social Care \n", |
| 256 | + "4 Chief Constable, Devon & Cornwall Constabulary... \n", |
275 | 257 | "5 NHS England; Chief Executive of Shrewsbury and... \n", |
276 | 258 | "6 The Home Office \n", |
277 | 259 | "7 Ministry of Defence \n", |
278 | 260 | "8 Secretary of State Department of Health and So... \n", |
279 | 261 | "9 NHS England \n", |
280 | 262 | "\n", |
281 | 263 | " InvestigationAndInquest \\\n", |
282 | | - "0 On 18 December 2023 I commenced an investigati... \n", |
| 264 | + "0 On 18th December 2023 I commenced an investiga... \n", |
283 | 265 | "1 N/A: Not found \n", |
284 | 266 | "2 On 19th September 2023 an investigation was co... \n", |
285 | 267 | "3 On 27 February 2025, I concluded a four-day ju... \n", |
286 | | - "4 On 27 February 2025, I concluded a four-day ju... \n", |
| 268 | + "4 On 27th February 2025, I concluded a four-day ... \n", |
287 | 269 | "5 On 12 July 2023 Mr Ellery, H.M. Senior Coroner... \n", |
288 | 270 | "6 On the 13th June 2024, an investigation was co... \n", |
289 | | - "7 On 01 November 2023 I commenced an investigati... \n", |
| 271 | + "7 On 1 November 2023 I commenced an investigatio... \n", |
290 | 272 | "8 On 13th May 2024 I commenced an investigation ... \n", |
291 | 273 | "9 On 1 July 2021 an investigation into the death... \n", |
292 | 274 | "\n", |
293 | 275 | " CircumstancesOfDeath \\\n", |
294 | | - "0 James was found deceased in his room at Surbit... \n", |
| 276 | + "0 James was found deceased in his room at [REDAC... \n", |
295 | 277 | "1 During the course of the inquest the court hea... \n", |
296 | 278 | "2 Chloe Elizabeth Burgess was found deceased at ... \n", |
297 | 279 | "3 Despite appropriate treatment by paramedics an... \n", |
|
304 | 286 | "\n", |
305 | 287 | " MattersOfConcern \n", |
306 | 288 | "0 (1) During the inquest the court was advised t... \n", |
307 | | - "1 a Probation staff are not always aware of or h... \n", |
| 289 | + "1 Probation staff are not always aware of or hav... \n", |
308 | 290 | "2 The inquest heard evidence that the potential ... \n", |
309 | 291 | "3 Delay in ambulance response attributable to de... \n", |
310 | | - "4 1) Delays in ambulance attendance. I have writ... \n", |
| 292 | + "4 Delays in ambulance attendance. Information sh... \n", |
311 | 293 | "5 (1) Once any patient at The Royal Shrewsbury H... \n", |
312 | 294 | "6 N/A: Not found \n", |
313 | 295 | "7 When found to have taken illicit drugs months ... \n", |
314 | | - "8 1. Following Kim's tragic death the GP who had... \n", |
315 | | - "9 (1) There are cohorts of patients who are medi... " |
| 296 | + "8 Following Kim's tragic death the GP who had pr... \n", |
| 297 | + "9 There are cohorts of patients who are medicati... " |
316 | 298 | ] |
317 | 299 | }, |
318 | | - "execution_count": 6, |
| 300 | + "execution_count": 2, |
319 | 301 | "metadata": {}, |
320 | 302 | "output_type": "execute_result" |
321 | 303 | } |
322 | 304 | ], |
323 | 305 | "source": [ |
324 | | - "cleaner.cleaned_reports" |
| 306 | + "cleaned_reports.head(n=10)\n", |
| 307 | + "\n", |
| 308 | + "#cleaned_reports.to_csv('../data/testreports_cleaned.csv')" |
| 309 | + ] |
| 310 | + }, |
| 311 | + { |
| 312 | + "cell_type": "markdown", |
| 313 | + "metadata": {}, |
| 314 | + "source": [ |
| 315 | + "Below, we can see the output of our cleaning instance:" |
325 | 316 | ] |
326 | 317 | }, |
327 | 318 | { |
|
333 | 324 | }, |
334 | 325 | { |
335 | 326 | "cell_type": "code", |
336 | | - "execution_count": null, |
| 327 | + "execution_count": 3, |
337 | 328 | "metadata": {}, |
338 | 329 | "outputs": [ |
339 | 330 | { |
|
578 | 569 | "9 During the course of the inquest the evidence ... " |
579 | 570 | ] |
580 | 571 | }, |
581 | | - "execution_count": 4, |
| 572 | + "execution_count": 3, |
582 | 573 | "metadata": {}, |
583 | 574 | "output_type": "execute_result" |
584 | 575 | } |
|
0 commit comments