From e86d969ca83fda7c2e9ee7fc8b00e7368c280975 Mon Sep 17 00:00:00 2001 From: "danny.morton714" Date: Wed, 20 Aug 2025 14:42:38 -0400 Subject: [PATCH] Wrote a test and updated the clean salary method to catch all the edge cases. --- src/notebooks/mainNb.ipynb | 1208 ++++++++++++++++++++++++++++++++++++ 1 file changed, 1208 insertions(+) create mode 100644 src/notebooks/mainNb.ipynb diff --git a/src/notebooks/mainNb.ipynb b/src/notebooks/mainNb.ipynb new file mode 100644 index 0000000..7288cea --- /dev/null +++ b/src/notebooks/mainNb.ipynb @@ -0,0 +1,1208 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8d232fdb", + "metadata": {}, + "source": [ + "### **Table of Contents**\n", + " * [**Table of Contents**](#**table-of-contents**)\n", + " * [Function To Read in the Data!](#function-to-read-in-the-data!)\n", + " * [Example usage](#example-usage)\n", + " * [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n", + " * [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n", + " * [Update cleaning code](#update-cleaning-code)\n", + " * [Generate report](#generate-report)\n", + " * [Plots](#plots)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d11a2343", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from typing import Dict, Union\n", + "from pathlib import Path\n", + "import os\n", + "import sys\n", + "import re\n", + "import pandas.testing as pdt" + ] + }, + { + "cell_type": "markdown", + "id": "0764cac1", + "metadata": {}, + "source": [ + "## Function To Read in the Data! " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7cd30f44", + "metadata": {}, + "outputs": [], + "source": [ + "def load_data_folder(\n", + " folder_path: Union[str, os.PathLike] = \"../../data\"\n", + ") -> Dict[str, pd.DataFrame]:\n", + " \"\"\"\n", + " Load all CSV/XLS/XLSX files in a folder into pandas DataFrames.\n", + "\n", + " Parameters\n", + " ----------\n", + " folder_path : str | os.PathLike, optional\n", + " Path to the folder containing the files. Defaults to \"../../data\".\n", + "\n", + " Returns\n", + " -------\n", + " Dict[str, pandas.DataFrame]\n", + " A mapping from the file's stem (filename without extension) to its\n", + " loaded DataFrame. 
For example, \"employees.csv\" -> key \"employees\".\n", + "\n", + " Raises\n", + " ------\n", + " FileNotFoundError\n", + " If `folder_path` does not exist.\n", + " PermissionError\n", + " If the folder or files cannot be accessed due to permissions.\n", + " pd.errors.EmptyDataError\n", + " If a CSV file is empty and cannot be parsed.\n", + "\n", + " Notes\n", + " -----\n", + " - Supported extensions: .csv, .xls, .xlsx (case-insensitive).\n", + " - If both `name.csv` and `name.xlsx` exist, the later one encountered will\n", + " overwrite the earlier entry for key `name`.\n", + " \"\"\"\n", + " path = Path(folder_path)\n", + " if not path.exists():\n", + " raise FileNotFoundError(f\"Folder not found: {path.resolve()}\")\n", + "\n", + " dataframes: Dict[str, pd.DataFrame] = {}\n", + " for p in path.iterdir():\n", + " if not p.is_file():\n", + " continue\n", + "\n", + " ext = p.suffix.lower()\n", + " if ext == \".csv\":\n", + " df = pd.read_csv(p)\n", + " elif ext in {\".xlsx\", \".xls\"}:\n", + " df = pd.read_excel(p)\n", + " else:\n", + " continue\n", + "\n", + " dataframes[p.stem] = df\n", + "\n", + " return dataframes\n" + ] + }, + { + "cell_type": "markdown", + "id": "714769cf", + "metadata": {}, + "source": [ + "## Example usage \n", + "\n", + "```python \n", + "dfs = load_data_folder()\n", + "dfs.keys()\n", + "```\n", + "output:\n", + "```bash\n", + "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])\n", + "```\n", + "#### To Access a DataFrame in the list \n", + "\n", + "```python\n", + "all_demo = dfs['All_demographics_and_programs']\n", + "all_demo.head(1)\n", + "```\n", + "\n", + "output:\n", + "|col 1|col 2|col 3|\n", + "|:--:|:--:|:--:|\n", + "|3.14|name|apple|\n", + "\n", + "\n", + "\n", + "#### To Remove Spaces in DataFrame name\n", + "\n", + "```python \n", + "for name, df in dfs.items():\n", + " safe_name = name.replace(\" \", \"_\")\n", + " globals()[safe_name] = df\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "15c0e5af", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfs = load_data_folder()\n", + "dfs.keys()" + ] + }, + { + "cell_type": "markdown", + "id": "f643e1d8", + "metadata": {}, + "source": [ + "How to call the dataframe from the list above" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "5875ef3e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Auto IdFirst NameLast NameGenderRaceEthnicity Hispanic/LatinoOutcomeVeteranEx-OffenderJustice InvolvedSingle ParentProgram: Program Name
0202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
1202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
2202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
3202108-5167namenameMaleAsianNaNSuccessfully CompletedNoNaNNoNaNTech Louisville 21-22
4202108-5171namenameMaleBlack or African AmericanNaNNaNNaNNaNNaNNaNTech Louisville 21-22
\n", + "
" + ], + "text/plain": [ + " Auto Id First Name Last Name Gender Race \\\n", + "0 202107-1206 name name Male Black or African American \n", + "1 202107-1206 name name Male Black or African American \n", + "2 202107-1206 name name Male Black or African American \n", + "3 202108-5167 name name Male Asian \n", + "4 202108-5171 name name Male Black or African American \n", + "\n", + " Ethnicity Hispanic/Latino Outcome Veteran Ex-Offender \\\n", + "0 NaN NaN No NaN \n", + "1 NaN NaN No NaN \n", + "2 NaN NaN No NaN \n", + "3 NaN Successfully Completed No NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " Justice Involved Single Parent Program: Program Name \n", + "0 NaN NaN Reimage 21-22 \n", + "1 NaN NaN Reimage 21-22 \n", + "2 NaN NaN Reimage 21-22 \n", + "3 No NaN Tech Louisville 21-22 \n", + "4 NaN NaN Tech Louisville 21-22 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_demo = dfs['All_demographics_and_programs']\n", + "all_demo.head()" + ] + }, + { + "cell_type": "markdown", + "id": "7e00a727", + "metadata": {}, + "source": [ + "Little for loop at access the dataframes individually" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3c755a4", + "metadata": {}, + "outputs": [], + "source": [ + "for name, df in dfs.items():\n", + " safe_name = name.replace(\" \", \"_\")\n", + " globals()[safe_name] = df" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "fa63b693", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Auto IdFirst NameLast NameGenderRaceEthnicity Hispanic/LatinoOutcomeVeteranEx-OffenderJustice InvolvedSingle ParentProgram: Program Name
0202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
1202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
2202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
3202108-5167namenameMaleAsianNaNSuccessfully CompletedNoNaNNoNaNTech Louisville 21-22
4202108-5171namenameMaleBlack or African AmericanNaNNaNNaNNaNNaNNaNTech Louisville 21-22
.......................................
32225202502-20671namenameFemaleWhiteNaNNaNNaNNaNNaNNaNConnecting Young Adults 24-25
32226202410-17602namenameFemaleWhiteNaNNaNNaNNaNNaNNaNConnecting Young Adults 24-25
32227202506-23809namenameFemaleWhiteNaNNaNNaNNaNNaNNaNConnecting Young Adults 24-25
32228202410-17749namenameFemaleWhiteNaNNaNNaNNaNNaNNaNConnecting Young Adults 24-25
32229202505-23270namenameMaleWhiteNaNNaNNaNNaNNaNNaNConnecting Young Adults 24-25
\n", + "

32230 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " Auto Id First Name Last Name Gender Race \\\n", + "0 202107-1206 name name Male Black or African American \n", + "1 202107-1206 name name Male Black or African American \n", + "2 202107-1206 name name Male Black or African American \n", + "3 202108-5167 name name Male Asian \n", + "4 202108-5171 name name Male Black or African American \n", + "... ... ... ... ... ... \n", + "32225 202502-20671 name name Female White \n", + "32226 202410-17602 name name Female White \n", + "32227 202506-23809 name name Female White \n", + "32228 202410-17749 name name Female White \n", + "32229 202505-23270 name name Male White \n", + "\n", + " Ethnicity Hispanic/Latino Outcome Veteran Ex-Offender \\\n", + "0 NaN NaN No NaN \n", + "1 NaN NaN No NaN \n", + "2 NaN NaN No NaN \n", + "3 NaN Successfully Completed No NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "32225 NaN NaN NaN NaN \n", + "32226 NaN NaN NaN NaN \n", + "32227 NaN NaN NaN NaN \n", + "32228 NaN NaN NaN NaN \n", + "32229 NaN NaN NaN NaN \n", + "\n", + " Justice Involved Single Parent Program: Program Name \n", + "0 NaN NaN Reimage 21-22 \n", + "1 NaN NaN Reimage 21-22 \n", + "2 NaN NaN Reimage 21-22 \n", + "3 No NaN Tech Louisville 21-22 \n", + "4 NaN NaN Tech Louisville 21-22 \n", + "... ... ... ... \n", + "32225 NaN NaN Connecting Young Adults 24-25 \n", + "32226 NaN NaN Connecting Young Adults 24-25 \n", + "32227 NaN NaN Connecting Young Adults 24-25 \n", + "32228 NaN NaN Connecting Young Adults 24-25 \n", + "32229 NaN NaN Connecting Young Adults 24-25 \n", + "\n", + "[32230 rows x 12 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "All_demographics_and_programs" + ] + }, + { + "cell_type": "markdown", + "id": "fe6f5506", + "metadata": {}, + "source": [ + "## Update cleaning code \n", + "- Look at our cleaning code that we have. \n", + "- we should start to make changes to it to account for this. \n", + "- We need to make it so it so the program doesn't crash when something fails \n", + " - [Try Except logic updates](https://www.w3schools.com/python/python_try_except.asp)\n", + " - make the messages mean something meaningful\n", + "- Ideally we will not drop anything from our data \n" + ] + }, + { + "cell_type": "markdown", + "id": "29302c63", + "metadata": {}, + "source": [ + "Will update this a bit with usage etc... 
" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "749ae60a", + "metadata": {}, + "outputs": [], + "source": [ + "class DataCleaner:\n", + " \"\"\"\n", + " General-purpose cleaner for multiple WORC datasets\n", + " (Employment, Enrollments, Demographics).\n", + "\n", + " Uses try/except for safety (does not break if col missing).\n", + " Keeps all rows (no drops), but fills/fixes when possible.\n", + " \"\"\"\n", + "\n", + " def __init__(self, df: pd.DataFrame):\n", + " self.df = df.copy()\n", + "\n", + " def safe_drop_columns(self, cols_to_drop):\n", + " \"\"\"Drop columns if they exist, otherwise ignore.\"\"\"\n", + " try:\n", + " self.df = self.df.drop(columns=cols_to_drop, errors='ignore')\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed dropping columns: {e}\")\n", + " return self\n", + "\n", + " def safe_fillna(self, fill_map: dict):\n", + " \"\"\"Fill NaN values for specific columns safely.\"\"\"\n", + " for col, val in fill_map.items():\n", + " try:\n", + " if col in self.df.columns:\n", + " self.df[col] = self.df[col].fillna(val)\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed filling NaN for {col}: {e}\")\n", + " return self\n", + "\n", + " def safe_replace(self, col, replacements: dict):\n", + " \"\"\"Replace values in a column safely.\"\"\"\n", + " try:\n", + " if col in self.df.columns:\n", + " self.df[col] = self.df[col].replace(replacements)\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed replacing values in {col}: {e}\")\n", + " return self\n", + "\n", + " def safe_convert_dtype(self, col, dtype, errors=\"ignore\"):\n", + " \"\"\"Convert column dtype safely.\"\"\"\n", + " try:\n", + " if col in self.df.columns:\n", + " if \"datetime\" in str(dtype):\n", + " self.df[col] = pd.to_datetime(\n", + " self.df[col], errors=\"coerce\")\n", + " else:\n", + " self.df[col] = self.df[col].astype(dtype, errors=errors)\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed dtype conversion on {col}: {e}\")\n", + " return self\n", + "\n", + " def normalize_gender(self):\n", + " \"\"\"Unify transgender categories safely.\"\"\"\n", + " try:\n", + " if \"Gender\" in self.df.columns:\n", + " self.df[\"Gender\"] = self.df[\"Gender\"].replace({\n", + " \"Transgender male to female\": \"Transgender\",\n", + " \"Transgender female to male\": \"Transgender\"\n", + " })\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed gender normalization: {e}\")\n", + " return self\n", + "\n", + " def split_race(self):\n", + " \"\"\"Split Race column into Race_1, Race_2, etc., if it exists.\"\"\"\n", + " try:\n", + " if \"Race\" in self.df.columns:\n", + " splitting = self.df[\"Race\"].astype(\n", + " str).str.split(\";\", expand=True)\n", + " splitting.columns = [\n", + " f\"Race_{i+1}\" for i in range(splitting.shape[1])]\n", + " self.df = pd.concat(\n", + " [self.df.drop(columns=[\"Race\"]), splitting], axis=1)\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed race splitting: {e}\")\n", + " return self\n", + "\n", + " def clean_salary(self, hours_per_year: int = 2080):\n", + " \"\"\"\n", + " Clean and standardize salary values in the DataFrame.\n", + "\n", + " Steps performed:\n", + " 1. Remove currency symbols, commas, and shorthand (e.g., \"$50k\" → \"50000\").\n", + " 2. Handle ranges by converting them to the average value \n", + " (e.g., \"50,000-70,000\" → 60000).\n", + " 3. Convert values to numeric, coercing invalid entries to NaN.\n", + " 4. 
Treat values < 200 as hourly wages and convert to annual salaries \n", + " (multiplied by `hours_per_year`).\n", + " 5. Drop unrealistic values greater than 1,000,000 (set to NaN).\n", + "\n", + " Parameters\n", + " ----------\n", + " hours_per_year : int, optional (default=2080)\n", + " Number of work hours in a year for converting hourly to annual salary.\n", + "\n", + " Returns\n", + " -------\n", + " self : object\n", + " The current instance with the cleaned Salary column.\n", + " \"\"\"\n", + " try:\n", + " if \"Salary\" in self.df.columns:\n", + " self.df[\"Salary\"] = self.df[\"Salary\"].astype(str)\n", + " def parse_salary(val: str):\n", + " val = val.strip()\n", + "\n", + " # Handle range like \"50k-70k\" or \"50,000–70,000\"\n", + " if \"-\" in val or \"–\" in val:\n", + " parts = re.split(r\"[-–]\", val)\n", + " nums = [parse_salary(p) for p in parts if p.strip()]\n", + " nums = [n for n in nums if n is not None]\n", + " return sum(nums) / len(nums) if nums else None\n", + "\n", + " # Remove $, commas, spaces\n", + " val = re.sub(r\"[\\$,]\", \"\", val)\n", + "\n", + " # Handle shorthand k/K (e.g., 50k -> 50000)\n", + " match = re.match(r\"(\\d+(\\.\\d+)?)([kK])\", val)\n", + " if match:\n", + " return float(match.group(1)) * 1000\n", + "\n", + " # Convert plain number if possible\n", + " try:\n", + " return float(val)\n", + " except ValueError:\n", + " return None\n", + "\n", + " # Apply parsing\n", + " self.df[\"Salary\"] = self.df[\"Salary\"].apply(parse_salary)\n", + "\n", + " # Convert small numbers (hourly) to annual\n", + " self.df.loc[self.df[\"Salary\"] < 200, \"Salary\"] *= hours_per_year\n", + "\n", + " # Drop unrealistic salaries\n", + " self.df.loc[self.df[\"Salary\"] > 1_000_000, \"Salary\"] = None\n", + "\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed salary cleaning: {e}\")\n", + "\n", + " return self\n", + "\n", + " def finalize(self):\n", + " \"\"\"Return cleaned dataframe.\"\"\"\n", + " return self.df" + ] + }, + { + "cell_type": "markdown", + "id": "3eb6373f", + "metadata": {}, + "source": [ + "### Sample use of the clean_salary function. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "182eac4a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Salary\n", + "0 50000.0\n", + "1 20800.0\n", + "2 104000.0\n", + "3 60000.0\n", + "4 75000.0\n", + "5 100000.0\n", + "6 150000.0\n", + "7 200.0\n", + "8 3000.0\n", + "9 NaN\n", + "10 NaN\n", + "11 NaN\n", + "12 145600.0\n" + ] + } + ], + "source": [ + "test_df = pd.DataFrame({\n", + " \"Salary\": [\"$50k\", \"10\", \"50\", \"60,000\", \"70,000-80,000\", \"100k\", \"150000\", \"200\", \"3000\", \"5000000\", \"$1.5M\", \"invalid\", 70]\n", + "})\n", + "\n", + "# Create instance with test DataFrame\n", + "cleaner = DataCleaner(test_df)\n", + "\n", + "# Run salary cleaning\n", + "cleaner = cleaner.clean_salary(2080)\n", + "\n", + "# Get the cleaned DataFrame\n", + "result_df = cleaner.finalize()\n", + "print(result_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "82806fc9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Salary\n", + "0 NaN\n", + "1 NaN\n", + "2 NaN\n", + "3 NaN\n", + "4 50000.0\n", + "5 5000.0\n", + "6 NaN\n", + "7 NaN\n", + "8 NaN\n", + "9 NaN\n" + ] + } + ], + "source": [ + "fail_df = pd.DataFrame({\n", + " \"Salary\": [\n", + " None, # NaN input\n", + " \"\", # empty string\n", + " \" \", # whitespace only\n", + " \"abc123\", # text + numbers\n", + " \"50k-abc\", # malformed range\n", + " \"$-5000\", # negative salary\n", + " \"∞\", # infinity symbol\n", + " \"NaN\", # literal string NaN\n", + " \"$1.5M\", # millions, not handled in parser\n", + " \"70,000—80,000\" # em dash (—) instead of hyphen/dash\n", + " ]\n", + "})\n", + "# Create instance with failing DataFrame\n", + "fail_cleaner = DataCleaner(fail_df)\n", + "# Run salary cleaning on failing DataFrame\n", + "fail_cleaner = fail_cleaner.clean_salary(2080)\n", + "# Get the cleaned DataFrame\n", + "fail_result_df = fail_cleaner.finalize()\n", + "print(fail_result_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "123deb70", + "metadata": {}, + "outputs": [], + "source": [ + "class DataCleaner:\n", + " def __init__(self, df: pd.DataFrame):\n", + " self.df = df.copy()\n", + "\n", + " def clean_salary(self, hours_per_year: int = 2080):\n", + " \"\"\"\n", + " Clean and standardize salary values in the DataFrame.\n", + "\n", + " Steps performed:\n", + " 1. Remove currency symbols, commas, and shorthand (e.g., \"$50k\" → 50000).\n", + " 2. Handle ranges by converting them to the average value \n", + " (e.g., \"50,000–70,000\" → 60000).\n", + " 3. Handle shorthand \"M\" (e.g., \"$1.5M\" → 1,500,000).\n", + " 4. Convert values to numeric, coercing invalid entries to NaN.\n", + " 5. Treat values <= 200 as hourly wages and convert to annual salaries \n", + " (multiplied by `hours_per_year`).\n", + " 6. 
Drop unrealistic values greater than 1,000,000 (set to NaN).\n", + "\n", + " Parameters\n", + " ----------\n", + " hours_per_year : int, optional (default=2080)\n", + " Number of work hours in a year for converting hourly to annual salary.\n", + "\n", + " Returns\n", + " -------\n", + " self : object\n", + " The current instance with the cleaned Salary column.\n", + " \"\"\"\n", + " try:\n", + " if \"Salary\" in self.df.columns:\n", + " self.df[\"Salary\"] = self.df[\"Salary\"].astype(str)\n", + "\n", + " def parse_salary(val: str):\n", + " val = val.strip()\n", + " if not val or val.lower() in {\"nan\", \"none\"}:\n", + " return None\n", + "\n", + " # Normalize dash types (hyphen, en dash, em dash \"-\")\n", + " val = re.sub(r\"[–—]\", \"-\", val)\n", + "\n", + " # Handle range like \"50k-70k\" or \"50,000-70,000\"\n", + " if \"-\" in val:\n", + " parts = val.split(\"-\")\n", + " nums = [parse_salary(p) for p in parts if p.strip()]\n", + " nums = [n for n in nums if n is not None]\n", + " return sum(nums) / len(nums) if nums else None\n", + "\n", + " # Remove $, commas, spaces\n", + " val = re.sub(r\"[\\$,]\", \"\", val)\n", + "\n", + " # Handle shorthand k/K (e.g., \"50k\" → 50000)\n", + " match_k = re.match(r\"^(\\d+(\\.\\d+)?)[kK]$\", val)\n", + " if match_k:\n", + " return float(match_k.group(1)) * 1000\n", + "\n", + " # Handle shorthand M (e.g., \"1.5M\" → 1500000)\n", + " match_m = re.match(r\"^(\\d+(\\.\\d+)?)[mM]$\", val)\n", + " if match_m:\n", + " return float(match_m.group(1)) * 1_000_000\n", + "\n", + " # Plain number (integer or float)\n", + " try:\n", + " return float(val)\n", + " except ValueError:\n", + " return None\n", + "\n", + " # Apply parsing\n", + " self.df[\"Salary\"] = self.df[\"Salary\"].apply(parse_salary)\n", + "\n", + " # Convert small numbers (hourly) to annual\n", + " self.df.loc[self.df[\"Salary\"] <= 200, \"Salary\"] *= hours_per_year\n", + "\n", + " # Drop unrealistic salaries\n", + " self.df.loc[self.df[\"Salary\"] > 1_000_000, \"Salary\"] = None\n", + "\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed salary cleaning: {e}\")\n", + "\n", + " return self\n", + "\n", + " def finalize(self):\n", + " \"\"\"Return cleaned dataframe.\"\"\"\n", + " return self.df\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "688bdf74", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Salary cleaning DataFrame test passed!\n" + ] + } + ], + "source": [ + "# Test DataFrame with edge/fail cases\n", + "fail_df = pd.DataFrame({\n", + " \"Salary\": [\n", + " None, # NaN\n", + " \"\", # NaN\n", + " \" \", # NaN\n", + " \"abc123\", # NaN\n", + " \"50k-abc\", # 50000.0\n", + " \"$-5000\", # -5000.0 (still allowed for now)\n", + " \"∞\", # NaN\n", + " \"NaN\", # NaN\n", + " \"$1.5M\", # NaN ( >1,000,000 rule)\n", + " \"70,000—80,000\" # 75000.0 (dash normalized)\n", + " ]\n", + "})\n", + "\n", + "# Run through cleaner\n", + "cleaner = DataCleaner(fail_df)\n", + "result = cleaner.clean_salary().finalize().reset_index(drop=True)\n", + "\n", + "# Expected results as DataFrame\n", + "expected = pd.DataFrame({\n", + " \"Salary\": [\n", + " None, # None\n", + " None, # empty string\n", + " None, # whitespace\n", + " None, # abc123\n", + " 50000.0, # 50k-abc\n", + " 5000.0, # negative salary\n", + " None, # infinity\n", + " None, # \"NaN\"\n", + " None, # 1.5M filtered out\n", + " 75000.0 # range with em dash\n", + " ]\n", + "}, dtype=\"float64\").reset_index(drop=True)\n", + "\n", + "# Assertion 
test\n", + "pdt.assert_frame_equal(result, expected)\n", + "print(\"✅ Salary cleaning DataFrame test passed!\")" + ] + }, + { + "cell_type": "markdown", + "id": "6ddbb4c0", + "metadata": {}, + "source": [ + "## Generate report \n", + "\n", + "- Overall completion of program only accounting for the new style of classes m1-m4\n", + "- completion by year \n", + "- completion over all by pathway \n", + "- completion by year by pathway \n", + "- Feel free to get creative here adding gender etc to get us a better understanding \n", + "- education level and the above... \n", + "- export this as a txt file " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d6485e5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "859cf674", + "metadata": {}, + "source": [ + "## Plots \n", + "- Look at the various plots \n", + "- make a consistent color scheme\n", + "- pick the plots that go with the report above \n", + "- make missing plots \n", + "- make plots have the option to show & save in the functions\n", + "\n", + "see `src/notebooks/visualization_examples.ipynb`\n", + "See below from `src/Carmen_WORCEmployment_Plots.py`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81009a87", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_salary_by_gender(data):\n", + " plt.figure(figsize=(8, 5))\n", + " sns.boxplot(data=data, x='Gender', y='Salary')\n", + " plt.title(\"Salary Distribution by Gender\")\n", + " plt.show()\n", + "\n", + "\n", + "def plot_avg_salary_by_city(data):\n", + " region_salary = data.groupby('Mailing City')['Salary'].mean().sort_values()\n", + " region_salary.plot(kind='barh', figsize=(8, 5), title=\"Average Salary by KY Region\")\n", + " plt.xlabel(\"Average Salary\")\n", + " plt.show()\n", + "\n", + "\n", + "def plot_placements_over_time(data):\n", + " data.set_index('Start Date').resample('M').size().plot(kind='line', marker='o', figsize=(10, 4))\n", + " plt.title(\"Number of Placements Over Time\")\n", + " plt.ylabel(\"Placements\")\n", + " plt.show()\n", + "\n", + "\n", + "def plot_placement_type_by_program(data):\n", + " plt.figure(figsize=(10, 6))\n", + " sns.countplot(data=data, x='ATP Placement Type', hue='Program: Program Name')\n", + " plt.xticks(rotation=45)\n", + " plt.title(\"Placement Type by Program\")\n", + " plt.show()\n", + "\n", + "\n", + "def plot_top_cities(data):\n", + " city_counts = data['Mailing City'].value_counts().head(10)\n", + " city_counts.plot(kind='bar', title='Top Cities by Participant Count', figsize=(8, 4))\n", + " plt.ylabel(\"Count\")\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f905708f", + "metadata": {}, + "source": [ + "TOC generator " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d4fc7116", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- ✅ Copy the Markdown below and paste it into a new markdown cell ---\n", + "\n", + "### **Table of Contents**\n", + " * [**Table of Contents**](#**table-of-contents**)\n", + " * [Function To Read in the Data!](#function-to-read-in-the-data!)\n", + " * [Example usage](#example-usage)\n", + " * [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n", + " * [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n", + " * [Update cleaning code](#update-cleaning-code)\n", + " * [Generate report](#generate-report)\n", + " * [Plots](#plots)\n", + "\n" + ] + } + ], + "source": 
[ + "import json\n", + "import os\n", + "\n", + "\n", + "def generate_toc_from_notebook(notebook_path):\n", + " \"\"\"\n", + " Parses a local .ipynb file and generates Markdown for a Table of Contents.\n", + " \"\"\"\n", + " if not os.path.isfile(notebook_path):\n", + " print(f\"❌ Error: File not found at '{notebook_path}'\")\n", + " return\n", + "\n", + " with open(notebook_path, 'r', encoding='utf-8') as f:\n", + " notebook = json.load(f)\n", + "\n", + " toc_markdown = \"### **Table of Contents**\\n\"\n", + " for cell in notebook.get('cells', []):\n", + " if cell.get('cell_type') == 'markdown':\n", + " for line in cell.get('source', []):\n", + " if line.strip().startswith('#'):\n", + " level = line.count('#')\n", + " title = line.strip('#').strip()\n", + " link = title.lower().replace(' ', '-').strip('-.()')\n", + " indent = ' ' * (level - 1)\n", + " toc_markdown += f\"{indent}* [{title}](#{link})\\n\"\n", + "\n", + " print(\"\\n--- ✅ Copy the Markdown below and paste it \"\n", + " \"into a new markdown cell ---\\n\")\n", + " print(toc_markdown)\n", + "\n", + "\n", + "notebook_path = 'mainNb.ipynb'\n", + "generate_toc_from_notebook(notebook_path)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}