Skip to content

Commit d638278

Browse files
authored
Merge pull request #12 from Sam-Osian/refactor
2 parents bd27cc1 + dcd69a5 commit d638278

File tree

9 files changed

+3187
-2642
lines changed

9 files changed

+3187
-2642
lines changed

data/scraped_reports.csv

Lines changed: 1469 additions & 693 deletions
Large diffs are not rendered by default.

notebooks/DEMO_Cleaner.ipynb

Lines changed: 43 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@
1313
},
1414
{
1515
"cell_type": "code",
16-
"execution_count": null,
16+
"execution_count": 1,
1717
"metadata": {},
1818
"outputs": [
1919
{
2020
"name": "stderr",
2121
"output_type": "stream",
2222
"text": [
23-
"Processing Fields: 100%|██████████| 6/6 [02:01<00:00, 20.26s/it]\n"
23+
"Processing Fields: 100%|██████████| 6/6 [00:23<00:00, 3.90s/it]\n"
2424
]
2525
}
2626
],
@@ -30,18 +30,18 @@
3030
"import os\n",
3131
"import pandas as pd\n",
3232
"\n",
33-
"# Read unclean / directly scraped reports from file\n",
33+
"# Read unclean reports from file (these were scraped with the Scraper class)\n",
3434
"unclean_reports = pd.read_csv('../data/testreports.csv')\n",
3535
"\n",
3636
"# Get API key\n",
3737
"load_dotenv(\"api.env\")\n",
3838
"openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n",
3939
"\n",
40-
"# Set up API client\n",
41-
"llm_client = LLM(api_key=openai_api_key,\n",
42-
" model=\"gpt-4.1-mini\")\n",
40+
"# Set up LLM client\n",
41+
"llm_client = LLM(api_key=openai_api_key, \n",
42+
" max_workers=50)\n",
4343
"\n",
44-
"# Run cleaner (below, we use minimal parameters. In practice, the user can 'turn off' cleaning for a given column)\n",
44+
"# Run cleaner\n",
4545
"cleaner = Cleaner(\n",
4646
" llm=llm_client,\n",
4747
" reports=unclean_reports)\n",
@@ -52,25 +52,7 @@
5252
},
5353
{
5454
"cell_type": "code",
55-
"execution_count": 5,
56-
"metadata": {},
57-
"outputs": [],
58-
"source": [
59-
"cleaned_reports.head()\n",
60-
"\n",
61-
"cleaned_reports.to_csv('../data/testreports_cleaned.csv')"
62-
]
63-
},
64-
{
65-
"cell_type": "markdown",
66-
"metadata": {},
67-
"source": [
68-
"Below, we can see the output of our cleaning instance:"
69-
]
70-
},
71-
{
72-
"cell_type": "code",
73-
"execution_count": 6,
55+
"execution_count": 2,
7456
"metadata": {},
7557
"outputs": [
7658
{
@@ -116,8 +98,8 @@
11698
" <td>L. Brown</td>\n",
11799
" <td>West London</td>\n",
118100
" <td>Revon Healthcare</td>\n",
119-
" <td>On 18 December 2023 I commenced an investigati...</td>\n",
120-
" <td>James was found deceased in his room at Surbit...</td>\n",
101+
" <td>On 18th December 2023 I commenced an investiga...</td>\n",
102+
" <td>James was found deceased in his room at [REDAC...</td>\n",
121103
" <td>(1) During the inquest the court was advised t...</td>\n",
122104
" </tr>\n",
123105
" <tr>\n",
@@ -131,7 +113,7 @@
131113
" <td>HMPPS</td>\n",
132114
" <td>N/A: Not found</td>\n",
133115
" <td>During the course of the inquest the court hea...</td>\n",
134-
" <td>a Probation staff are not always aware of or h...</td>\n",
116+
" <td>Probation staff are not always aware of or hav...</td>\n",
135117
" </tr>\n",
136118
" <tr>\n",
137119
" <th>2</th>\n",
@@ -154,7 +136,7 @@
154136
" <td>2025-02-28</td>\n",
155137
" <td>A. Cox</td>\n",
156138
" <td>Cornwall and the Isles of Scilly</td>\n",
157-
" <td>MP; Secretary of State for Health and Social Care</td>\n",
139+
" <td>MP; Secretary of State for Health &amp; Social Care</td>\n",
158140
" <td>On 27 February 2025, I concluded a four-day ju...</td>\n",
159141
" <td>Despite appropriate treatment by paramedics an...</td>\n",
160142
" <td>Delay in ambulance response attributable to de...</td>\n",
@@ -167,10 +149,10 @@
167149
" <td>2025-02-28</td>\n",
168150
" <td>A. Cox</td>\n",
169151
" <td>Cornwall and the Isles of Scilly</td>\n",
170-
" <td>Chief Constable Devon &amp; Cornwall Constabulary;...</td>\n",
171-
" <td>On 27 February 2025, I concluded a four-day ju...</td>\n",
152+
" <td>Chief Constable, Devon &amp; Cornwall Constabulary...</td>\n",
153+
" <td>On 27th February 2025, I concluded a four-day ...</td>\n",
172154
" <td>Mr Campbell had a history of recreational drug...</td>\n",
173-
" <td>1) Delays in ambulance attendance. I have writ...</td>\n",
155+
" <td>Delays in ambulance attendance. Information sh...</td>\n",
174156
" </tr>\n",
175157
" <tr>\n",
176158
" <th>5</th>\n",
@@ -179,7 +161,7 @@
179161
" <td>2025-0113</td>\n",
180162
" <td>2025-02-28</td>\n",
181163
" <td>H. Westerman</td>\n",
182-
" <td>Shropshire, Telford and Wrekin</td>\n",
164+
" <td>Shropshire, Telford &amp; Wrekin</td>\n",
183165
" <td>NHS England; Chief Executive of Shrewsbury and...</td>\n",
184166
" <td>On 12 July 2023 Mr Ellery, H.M. Senior Coroner...</td>\n",
185167
" <td>Mr Green was admitted to The Royal Shrewsbury ...</td>\n",
@@ -207,7 +189,7 @@
207189
" <td>J. Turner</td>\n",
208190
" <td>West Sussex, Brighton and Hove</td>\n",
209191
" <td>Ministry of Defence</td>\n",
210-
" <td>On 01 November 2023 I commenced an investigati...</td>\n",
192+
" <td>On 1 November 2023 I commenced an investigatio...</td>\n",
211193
" <td>Mr Taylor had rapidly fallen into drug addicti...</td>\n",
212194
" <td>When found to have taken illicit drugs months ...</td>\n",
213195
" </tr>\n",
@@ -222,7 +204,7 @@
222204
" <td>Secretary of State Department of Health and So...</td>\n",
223205
" <td>On 13th May 2024 I commenced an investigation ...</td>\n",
224206
" <td>Kim Robinson's death was recognised at 05:16 o...</td>\n",
225-
" <td>1. Following Kim's tragic death the GP who had...</td>\n",
207+
" <td>Following Kim's tragic death the GP who had pr...</td>\n",
226208
" </tr>\n",
227209
" <tr>\n",
228210
" <th>9</th>\n",
@@ -235,7 +217,7 @@
235217
" <td>NHS England</td>\n",
236218
" <td>On 1 July 2021 an investigation into the death...</td>\n",
237219
" <td>Mr Marriage had a longstanding diagnosis of id...</td>\n",
238-
" <td>(1) There are cohorts of patients who are medi...</td>\n",
220+
" <td>There are cohorts of patients who are medicati...</td>\n",
239221
" </tr>\n",
240222
" </tbody>\n",
241223
"</table>\n",
@@ -260,7 +242,7 @@
260242
"2 2025-04-03 N. Walker Hampshire, Portsmouth and Southampton \n",
261243
"3 2025-02-28 A. Cox Cornwall and the Isles of Scilly \n",
262244
"4 2025-02-28 A. Cox Cornwall and the Isles of Scilly \n",
263-
"5 2025-02-28 H. Westerman Shropshire, Telford and Wrekin \n",
245+
"5 2025-02-28 H. Westerman Shropshire, Telford & Wrekin \n",
264246
"6 2025-02-27 R. Middleton Dorset \n",
265247
"7 2025-01-31 J. Turner West Sussex, Brighton and Hove \n",
266248
"8 2025-01-31 N. Parsley Suffolk \n",
@@ -270,28 +252,28 @@
270252
"0 Revon Healthcare \n",
271253
"1 HMPPS \n",
272254
"2 National Institute for Health and Care Excelle... \n",
273-
"3 MP; Secretary of State for Health and Social Care \n",
274-
"4 Chief Constable Devon & Cornwall Constabulary;... \n",
255+
"3 MP; Secretary of State for Health & Social Care \n",
256+
"4 Chief Constable, Devon & Cornwall Constabulary... \n",
275257
"5 NHS England; Chief Executive of Shrewsbury and... \n",
276258
"6 The Home Office \n",
277259
"7 Ministry of Defence \n",
278260
"8 Secretary of State Department of Health and So... \n",
279261
"9 NHS England \n",
280262
"\n",
281263
" InvestigationAndInquest \\\n",
282-
"0 On 18 December 2023 I commenced an investigati... \n",
264+
"0 On 18th December 2023 I commenced an investiga... \n",
283265
"1 N/A: Not found \n",
284266
"2 On 19th September 2023 an investigation was co... \n",
285267
"3 On 27 February 2025, I concluded a four-day ju... \n",
286-
"4 On 27 February 2025, I concluded a four-day ju... \n",
268+
"4 On 27th February 2025, I concluded a four-day ... \n",
287269
"5 On 12 July 2023 Mr Ellery, H.M. Senior Coroner... \n",
288270
"6 On the 13th June 2024, an investigation was co... \n",
289-
"7 On 01 November 2023 I commenced an investigati... \n",
271+
"7 On 1 November 2023 I commenced an investigatio... \n",
290272
"8 On 13th May 2024 I commenced an investigation ... \n",
291273
"9 On 1 July 2021 an investigation into the death... \n",
292274
"\n",
293275
" CircumstancesOfDeath \\\n",
294-
"0 James was found deceased in his room at Surbit... \n",
276+
"0 James was found deceased in his room at [REDAC... \n",
295277
"1 During the course of the inquest the court hea... \n",
296278
"2 Chloe Elizabeth Burgess was found deceased at ... \n",
297279
"3 Despite appropriate treatment by paramedics an... \n",
@@ -304,24 +286,33 @@
304286
"\n",
305287
" MattersOfConcern \n",
306288
"0 (1) During the inquest the court was advised t... \n",
307-
"1 a Probation staff are not always aware of or h... \n",
289+
"1 Probation staff are not always aware of or hav... \n",
308290
"2 The inquest heard evidence that the potential ... \n",
309291
"3 Delay in ambulance response attributable to de... \n",
310-
"4 1) Delays in ambulance attendance. I have writ... \n",
292+
"4 Delays in ambulance attendance. Information sh... \n",
311293
"5 (1) Once any patient at The Royal Shrewsbury H... \n",
312294
"6 N/A: Not found \n",
313295
"7 When found to have taken illicit drugs months ... \n",
314-
"8 1. Following Kim's tragic death the GP who had... \n",
315-
"9 (1) There are cohorts of patients who are medi... "
296+
"8 Following Kim's tragic death the GP who had pr... \n",
297+
"9 There are cohorts of patients who are medicati... "
316298
]
317299
},
318-
"execution_count": 6,
300+
"execution_count": 2,
319301
"metadata": {},
320302
"output_type": "execute_result"
321303
}
322304
],
323305
"source": [
324-
"cleaner.cleaned_reports"
306+
"cleaned_reports.head(n=10)\n",
307+
"\n",
308+
"#cleaned_reports.to_csv('../data/testreports_cleaned.csv')"
309+
]
310+
},
311+
{
312+
"cell_type": "markdown",
313+
"metadata": {},
314+
"source": [
315+
"Below, we can see the output of our cleaning instance:"
325316
]
326317
},
327318
{
@@ -333,7 +324,7 @@
333324
},
334325
{
335326
"cell_type": "code",
336-
"execution_count": null,
327+
"execution_count": 3,
337328
"metadata": {},
338329
"outputs": [
339330
{
@@ -578,7 +569,7 @@
578569
"9 During the course of the inquest the evidence ... "
579570
]
580571
},
581-
"execution_count": 4,
572+
"execution_count": 3,
582573
"metadata": {},
583574
"output_type": "execute_result"
584575
}

0 commit comments

Comments
 (0)