From e86d969ca83fda7c2e9ee7fc8b00e7368c280975 Mon Sep 17 00:00:00 2001 From: "danny.morton714" Date: Wed, 20 Aug 2025 14:42:38 -0400 Subject: [PATCH] Wrote a test and updated the clean salary method to catch all the edge cases. --- src/notebooks/mainNb.ipynb | 1208 ++++++++++++++++++++++++++++++++++++ 1 file changed, 1208 insertions(+) create mode 100644 src/notebooks/mainNb.ipynb diff --git a/src/notebooks/mainNb.ipynb b/src/notebooks/mainNb.ipynb new file mode 100644 index 0000000..7288cea --- /dev/null +++ b/src/notebooks/mainNb.ipynb @@ -0,0 +1,1208 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8d232fdb", + "metadata": {}, + "source": [ + "### **Table of Contents**\n", + " * [**Table of Contents**](#**table-of-contents**)\n", + " * [Function To Read in the Data!](#function-to-read-in-the-data!)\n", + " * [Example usage](#example-usage)\n", + " * [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n", + " * [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n", + " * [Update cleaning code](#update-cleaning-code)\n", + " * [Generate report](#generate-report)\n", + " * [Plots](#plots)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d11a2343", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from typing import Dict, Union\n", + "from pathlib import Path\n", + "import os\n", + "import sys\n", + "import re\n", + "import pandas.testing as pdt" + ] + }, + { + "cell_type": "markdown", + "id": "0764cac1", + "metadata": {}, + "source": [ + "## Function To Read in the Data! " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7cd30f44", + "metadata": {}, + "outputs": [], + "source": [ + "def load_data_folder(\n", + " folder_path: Union[str, os.PathLike] = \"../../data\"\n", + ") -> Dict[str, pd.DataFrame]:\n", + " \"\"\"\n", + " Load all CSV/XLS/XLSX files in a folder into pandas DataFrames.\n", + "\n", + " Parameters\n", + " ----------\n", + " folder_path : str | os.PathLike, optional\n", + " Path to the folder containing the files. Defaults to \"../../data\".\n", + "\n", + " Returns\n", + " -------\n", + " Dict[str, pandas.DataFrame]\n", + " A mapping from the file's stem (filename without extension) to its\n", + " loaded DataFrame. 
For example, \"employees.csv\" -> key \"employees\".\n", + "\n", + " Raises\n", + " ------\n", + " FileNotFoundError\n", + " If `folder_path` does not exist.\n", + " PermissionError\n", + " If the folder or files cannot be accessed due to permissions.\n", + " pd.errors.EmptyDataError\n", + " If a CSV file is empty and cannot be parsed.\n", + "\n", + " Notes\n", + " -----\n", + " - Supported extensions: .csv, .xls, .xlsx (case-insensitive).\n", + " - If both `name.csv` and `name.xlsx` exist, the later one encountered will\n", + " overwrite the earlier entry for key `name`.\n", + " \"\"\"\n", + " path = Path(folder_path)\n", + " if not path.exists():\n", + " raise FileNotFoundError(f\"Folder not found: {path.resolve()}\")\n", + "\n", + " dataframes: Dict[str, pd.DataFrame] = {}\n", + " for p in path.iterdir():\n", + " if not p.is_file():\n", + " continue\n", + "\n", + " ext = p.suffix.lower()\n", + " if ext == \".csv\":\n", + " df = pd.read_csv(p)\n", + " elif ext in {\".xlsx\", \".xls\"}:\n", + " df = pd.read_excel(p)\n", + " else:\n", + " continue\n", + "\n", + " dataframes[p.stem] = df\n", + "\n", + " return dataframes\n" + ] + }, + { + "cell_type": "markdown", + "id": "714769cf", + "metadata": {}, + "source": [ + "## Example usage \n", + "\n", + "```python \n", + "dfs = load_data_folder()\n", + "dfs.keys()\n", + "```\n", + "output:\n", + "```bash\n", + "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])\n", + "```\n", + "#### To Access a DataFrame in the list \n", + "\n", + "```python\n", + "all_demo = dfs['All_demographics_and_programs']\n", + "all_demo.head(1)\n", + "```\n", + "\n", + "output:\n", + "|col 1|col 2|col 3|\n", + "|:--:|:--:|:--:|\n", + "|3.14|name|apple|\n", + "\n", + "\n", + "\n", + "#### To Remove Spaces in DataFrame name\n", + "\n", + "```python \n", + "for name, df in dfs.items():\n", + " safe_name = name.replace(\" \", \"_\")\n", + " globals()[safe_name] = df\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "15c0e5af", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfs = load_data_folder()\n", + "dfs.keys()" + ] + }, + { + "cell_type": "markdown", + "id": "f643e1d8", + "metadata": {}, + "source": [ + "How to call the dataframe from the list above" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "5875ef3e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Auto IdFirst NameLast NameGenderRaceEthnicity Hispanic/LatinoOutcomeVeteranEx-OffenderJustice InvolvedSingle ParentProgram: Program Name
0202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
1202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
2202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
3202108-5167namenameMaleAsianNaNSuccessfully CompletedNoNaNNoNaNTech Louisville 21-22
4202108-5171namenameMaleBlack or African AmericanNaNNaNNaNNaNNaNNaNTech Louisville 21-22
\n", + "
" + ], + "text/plain": [ + " Auto Id First Name Last Name Gender Race \\\n", + "0 202107-1206 name name Male Black or African American \n", + "1 202107-1206 name name Male Black or African American \n", + "2 202107-1206 name name Male Black or African American \n", + "3 202108-5167 name name Male Asian \n", + "4 202108-5171 name name Male Black or African American \n", + "\n", + " Ethnicity Hispanic/Latino Outcome Veteran Ex-Offender \\\n", + "0 NaN NaN No NaN \n", + "1 NaN NaN No NaN \n", + "2 NaN NaN No NaN \n", + "3 NaN Successfully Completed No NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " Justice Involved Single Parent Program: Program Name \n", + "0 NaN NaN Reimage 21-22 \n", + "1 NaN NaN Reimage 21-22 \n", + "2 NaN NaN Reimage 21-22 \n", + "3 No NaN Tech Louisville 21-22 \n", + "4 NaN NaN Tech Louisville 21-22 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_demo = dfs['All_demographics_and_programs']\n", + "all_demo.head()" + ] + }, + { + "cell_type": "markdown", + "id": "7e00a727", + "metadata": {}, + "source": [ + "Little for loop at access the dataframes individually" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3c755a4", + "metadata": {}, + "outputs": [], + "source": [ + "for name, df in dfs.items():\n", + " safe_name = name.replace(\" \", \"_\")\n", + " globals()[safe_name] = df" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "fa63b693", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Auto IdFirst NameLast NameGenderRaceEthnicity Hispanic/LatinoOutcomeVeteranEx-OffenderJustice InvolvedSingle ParentProgram: Program Name
0202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
1202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
2202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
3202108-5167namenameMaleAsianNaNSuccessfully CompletedNoNaNNoNaNTech Louisville 21-22
4202108-5171namenameMaleBlack or African AmericanNaNNaNNaNNaNNaNNaNTech Louisville 21-22
.......................................
32225202502-20671namenameFemaleWhiteNaNNaNNaNNaNNaNNaNConnecting Young Adults 24-25
32226202410-17602namenameFemaleWhiteNaNNaNNaNNaNNaNNaNConnecting Young Adults 24-25
32227202506-23809namenameFemaleWhiteNaNNaNNaNNaNNaNNaNConnecting Young Adults 24-25
32228202410-17749namenameFemaleWhiteNaNNaNNaNNaNNaNNaNConnecting Young Adults 24-25
32229202505-23270namenameMaleWhiteNaNNaNNaNNaNNaNNaNConnecting Young Adults 24-25
\n", + "

32230 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " Auto Id First Name Last Name Gender Race \\\n", + "0 202107-1206 name name Male Black or African American \n", + "1 202107-1206 name name Male Black or African American \n", + "2 202107-1206 name name Male Black or African American \n", + "3 202108-5167 name name Male Asian \n", + "4 202108-5171 name name Male Black or African American \n", + "... ... ... ... ... ... \n", + "32225 202502-20671 name name Female White \n", + "32226 202410-17602 name name Female White \n", + "32227 202506-23809 name name Female White \n", + "32228 202410-17749 name name Female White \n", + "32229 202505-23270 name name Male White \n", + "\n", + " Ethnicity Hispanic/Latino Outcome Veteran Ex-Offender \\\n", + "0 NaN NaN No NaN \n", + "1 NaN NaN No NaN \n", + "2 NaN NaN No NaN \n", + "3 NaN Successfully Completed No NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "32225 NaN NaN NaN NaN \n", + "32226 NaN NaN NaN NaN \n", + "32227 NaN NaN NaN NaN \n", + "32228 NaN NaN NaN NaN \n", + "32229 NaN NaN NaN NaN \n", + "\n", + " Justice Involved Single Parent Program: Program Name \n", + "0 NaN NaN Reimage 21-22 \n", + "1 NaN NaN Reimage 21-22 \n", + "2 NaN NaN Reimage 21-22 \n", + "3 No NaN Tech Louisville 21-22 \n", + "4 NaN NaN Tech Louisville 21-22 \n", + "... ... ... ... \n", + "32225 NaN NaN Connecting Young Adults 24-25 \n", + "32226 NaN NaN Connecting Young Adults 24-25 \n", + "32227 NaN NaN Connecting Young Adults 24-25 \n", + "32228 NaN NaN Connecting Young Adults 24-25 \n", + "32229 NaN NaN Connecting Young Adults 24-25 \n", + "\n", + "[32230 rows x 12 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "All_demographics_and_programs" + ] + }, + { + "cell_type": "markdown", + "id": "fe6f5506", + "metadata": {}, + "source": [ + "## Update cleaning code \n", + "- Look at our cleaning code that we have. \n", + "- we should start to make changes to it to account for this. \n", + "- We need to make it so it so the program doesn't crash when something fails \n", + " - [Try Except logic updates](https://www.w3schools.com/python/python_try_except.asp)\n", + " - make the messages mean something meaningful\n", + "- Ideally we will not drop anything from our data \n" + ] + }, + { + "cell_type": "markdown", + "id": "29302c63", + "metadata": {}, + "source": [ + "Will update this a bit with usage etc... 
" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "749ae60a", + "metadata": {}, + "outputs": [], + "source": [ + "class DataCleaner:\n", + " \"\"\"\n", + " General-purpose cleaner for multiple WORC datasets\n", + " (Employment, Enrollments, Demographics).\n", + "\n", + " Uses try/except for safety (does not break if col missing).\n", + " Keeps all rows (no drops), but fills/fixes when possible.\n", + " \"\"\"\n", + "\n", + " def __init__(self, df: pd.DataFrame):\n", + " self.df = df.copy()\n", + "\n", + " def safe_drop_columns(self, cols_to_drop):\n", + " \"\"\"Drop columns if they exist, otherwise ignore.\"\"\"\n", + " try:\n", + " self.df = self.df.drop(columns=cols_to_drop, errors='ignore')\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed dropping columns: {e}\")\n", + " return self\n", + "\n", + " def safe_fillna(self, fill_map: dict):\n", + " \"\"\"Fill NaN values for specific columns safely.\"\"\"\n", + " for col, val in fill_map.items():\n", + " try:\n", + " if col in self.df.columns:\n", + " self.df[col] = self.df[col].fillna(val)\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed filling NaN for {col}: {e}\")\n", + " return self\n", + "\n", + " def safe_replace(self, col, replacements: dict):\n", + " \"\"\"Replace values in a column safely.\"\"\"\n", + " try:\n", + " if col in self.df.columns:\n", + " self.df[col] = self.df[col].replace(replacements)\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed replacing values in {col}: {e}\")\n", + " return self\n", + "\n", + " def safe_convert_dtype(self, col, dtype, errors=\"ignore\"):\n", + " \"\"\"Convert column dtype safely.\"\"\"\n", + " try:\n", + " if col in self.df.columns:\n", + " if \"datetime\" in str(dtype):\n", + " self.df[col] = pd.to_datetime(\n", + " self.df[col], errors=\"coerce\")\n", + " else:\n", + " self.df[col] = self.df[col].astype(dtype, errors=errors)\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed dtype conversion on {col}: {e}\")\n", + " return self\n", + "\n", + " def normalize_gender(self):\n", + " \"\"\"Unify transgender categories safely.\"\"\"\n", + " try:\n", + " if \"Gender\" in self.df.columns:\n", + " self.df[\"Gender\"] = self.df[\"Gender\"].replace({\n", + " \"Transgender male to female\": \"Transgender\",\n", + " \"Transgender female to male\": \"Transgender\"\n", + " })\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed gender normalization: {e}\")\n", + " return self\n", + "\n", + " def split_race(self):\n", + " \"\"\"Split Race column into Race_1, Race_2, etc., if it exists.\"\"\"\n", + " try:\n", + " if \"Race\" in self.df.columns:\n", + " splitting = self.df[\"Race\"].astype(\n", + " str).str.split(\";\", expand=True)\n", + " splitting.columns = [\n", + " f\"Race_{i+1}\" for i in range(splitting.shape[1])]\n", + " self.df = pd.concat(\n", + " [self.df.drop(columns=[\"Race\"]), splitting], axis=1)\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed race splitting: {e}\")\n", + " return self\n", + "\n", + " def clean_salary(self, hours_per_year: int = 2080):\n", + " \"\"\"\n", + " Clean and standardize salary values in the DataFrame.\n", + "\n", + " Steps performed:\n", + " 1. Remove currency symbols, commas, and shorthand (e.g., \"$50k\" → \"50000\").\n", + " 2. Handle ranges by converting them to the average value \n", + " (e.g., \"50,000-70,000\" → 60000).\n", + " 3. Convert values to numeric, coercing invalid entries to NaN.\n", + " 4. 
Treat values < 200 as hourly wages and convert to annual salaries \n", + " (multiplied by `hours_per_year`).\n", + " 5. Drop unrealistic values greater than 1,000,000 (set to NaN).\n", + "\n", + " Parameters\n", + " ----------\n", + " hours_per_year : int, optional (default=2080)\n", + " Number of work hours in a year for converting hourly to annual salary.\n", + "\n", + " Returns\n", + " -------\n", + " self : object\n", + " The current instance with the cleaned Salary column.\n", + " \"\"\"\n", + " try:\n", + " if \"Salary\" in self.df.columns:\n", + " self.df[\"Salary\"] = self.df[\"Salary\"].astype(str)\n", + " def parse_salary(val: str):\n", + " val = val.strip()\n", + "\n", + " # Handle range like \"50k-70k\" or \"50,000–70,000\"\n", + " if \"-\" in val or \"–\" in val:\n", + " parts = re.split(r\"[-–]\", val)\n", + " nums = [parse_salary(p) for p in parts if p.strip()]\n", + " nums = [n for n in nums if n is not None]\n", + " return sum(nums) / len(nums) if nums else None\n", + "\n", + " # Remove $, commas, spaces\n", + " val = re.sub(r\"[\\$,]\", \"\", val)\n", + "\n", + " # Handle shorthand k/K (e.g., 50k -> 50000)\n", + " match = re.match(r\"(\\d+(\\.\\d+)?)([kK])\", val)\n", + " if match:\n", + " return float(match.group(1)) * 1000\n", + "\n", + " # Convert plain number if possible\n", + " try:\n", + " return float(val)\n", + " except ValueError:\n", + " return None\n", + "\n", + " # Apply parsing\n", + " self.df[\"Salary\"] = self.df[\"Salary\"].apply(parse_salary)\n", + "\n", + " # Convert small numbers (hourly) to annual\n", + " self.df.loc[self.df[\"Salary\"] < 200, \"Salary\"] *= hours_per_year\n", + "\n", + " # Drop unrealistic salaries\n", + " self.df.loc[self.df[\"Salary\"] > 1_000_000, \"Salary\"] = None\n", + "\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed salary cleaning: {e}\")\n", + "\n", + " return self\n", + "\n", + " def finalize(self):\n", + " \"\"\"Return cleaned dataframe.\"\"\"\n", + " return self.df" + ] + }, + { + "cell_type": "markdown", + "id": "3eb6373f", + "metadata": {}, + "source": [ + "### Sample use of the clean_salary function. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "182eac4a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Salary\n", + "0 50000.0\n", + "1 20800.0\n", + "2 104000.0\n", + "3 60000.0\n", + "4 75000.0\n", + "5 100000.0\n", + "6 150000.0\n", + "7 200.0\n", + "8 3000.0\n", + "9 NaN\n", + "10 NaN\n", + "11 NaN\n", + "12 145600.0\n" + ] + } + ], + "source": [ + "test_df = pd.DataFrame({\n", + " \"Salary\": [\"$50k\", \"10\", \"50\", \"60,000\", \"70,000-80,000\", \"100k\", \"150000\", \"200\", \"3000\", \"5000000\", \"$1.5M\", \"invalid\", 70]\n", + "})\n", + "\n", + "# Create instance with test DataFrame\n", + "cleaner = DataCleaner(test_df)\n", + "\n", + "# Run salary cleaning\n", + "cleaner = cleaner.clean_salary(2080)\n", + "\n", + "# Get the cleaned DataFrame\n", + "result_df = cleaner.finalize()\n", + "print(result_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "82806fc9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Salary\n", + "0 NaN\n", + "1 NaN\n", + "2 NaN\n", + "3 NaN\n", + "4 50000.0\n", + "5 5000.0\n", + "6 NaN\n", + "7 NaN\n", + "8 NaN\n", + "9 NaN\n" + ] + } + ], + "source": [ + "fail_df = pd.DataFrame({\n", + " \"Salary\": [\n", + " None, # NaN input\n", + " \"\", # empty string\n", + " \" \", # whitespace only\n", + " \"abc123\", # text + numbers\n", + " \"50k-abc\", # malformed range\n", + " \"$-5000\", # negative salary\n", + " \"∞\", # infinity symbol\n", + " \"NaN\", # literal string NaN\n", + " \"$1.5M\", # millions, not handled in parser\n", + " \"70,000—80,000\" # em dash (—) instead of hyphen/dash\n", + " ]\n", + "})\n", + "# Create instance with failing DataFrame\n", + "fail_cleaner = DataCleaner(fail_df)\n", + "# Run salary cleaning on failing DataFrame\n", + "fail_cleaner = fail_cleaner.clean_salary(2080)\n", + "# Get the cleaned DataFrame\n", + "fail_result_df = fail_cleaner.finalize()\n", + "print(fail_result_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "123deb70", + "metadata": {}, + "outputs": [], + "source": [ + "class DataCleaner:\n", + " def __init__(self, df: pd.DataFrame):\n", + " self.df = df.copy()\n", + "\n", + " def clean_salary(self, hours_per_year: int = 2080):\n", + " \"\"\"\n", + " Clean and standardize salary values in the DataFrame.\n", + "\n", + " Steps performed:\n", + " 1. Remove currency symbols, commas, and shorthand (e.g., \"$50k\" → 50000).\n", + " 2. Handle ranges by converting them to the average value \n", + " (e.g., \"50,000–70,000\" → 60000).\n", + " 3. Handle shorthand \"M\" (e.g., \"$1.5M\" → 1,500,000).\n", + " 4. Convert values to numeric, coercing invalid entries to NaN.\n", + " 5. Treat values <= 200 as hourly wages and convert to annual salaries \n", + " (multiplied by `hours_per_year`).\n", + " 6. 
Drop unrealistic values greater than 1,000,000 (set to NaN).\n", + "\n", + " Parameters\n", + " ----------\n", + " hours_per_year : int, optional (default=2080)\n", + " Number of work hours in a year for converting hourly to annual salary.\n", + "\n", + " Returns\n", + " -------\n", + " self : object\n", + " The current instance with the cleaned Salary column.\n", + " \"\"\"\n", + " try:\n", + " if \"Salary\" in self.df.columns:\n", + " self.df[\"Salary\"] = self.df[\"Salary\"].astype(str)\n", + "\n", + " def parse_salary(val: str):\n", + " val = val.strip()\n", + " if not val or val.lower() in {\"nan\", \"none\"}:\n", + " return None\n", + "\n", + " # Normalize dash types (hyphen, en dash, em dash \"-\")\n", + " val = re.sub(r\"[–—]\", \"-\", val)\n", + "\n", + " # Handle range like \"50k-70k\" or \"50,000-70,000\"\n", + " if \"-\" in val:\n", + " parts = val.split(\"-\")\n", + " nums = [parse_salary(p) for p in parts if p.strip()]\n", + " nums = [n for n in nums if n is not None]\n", + " return sum(nums) / len(nums) if nums else None\n", + "\n", + " # Remove $, commas, spaces\n", + " val = re.sub(r\"[\\$,]\", \"\", val)\n", + "\n", + " # Handle shorthand k/K (e.g., \"50k\" → 50000)\n", + " match_k = re.match(r\"^(\\d+(\\.\\d+)?)[kK]$\", val)\n", + " if match_k:\n", + " return float(match_k.group(1)) * 1000\n", + "\n", + " # Handle shorthand M (e.g., \"1.5M\" → 1500000)\n", + " match_m = re.match(r\"^(\\d+(\\.\\d+)?)[mM]$\", val)\n", + " if match_m:\n", + " return float(match_m.group(1)) * 1_000_000\n", + "\n", + " # Plain number (integer or float)\n", + " try:\n", + " return float(val)\n", + " except ValueError:\n", + " return None\n", + "\n", + " # Apply parsing\n", + " self.df[\"Salary\"] = self.df[\"Salary\"].apply(parse_salary)\n", + "\n", + " # Convert small numbers (hourly) to annual\n", + " self.df.loc[self.df[\"Salary\"] <= 200, \"Salary\"] *= hours_per_year\n", + "\n", + " # Drop unrealistic salaries\n", + " self.df.loc[self.df[\"Salary\"] > 1_000_000, \"Salary\"] = None\n", + "\n", + " except Exception as e:\n", + " print(f\"[Warning] Failed salary cleaning: {e}\")\n", + "\n", + " return self\n", + "\n", + " def finalize(self):\n", + " \"\"\"Return cleaned dataframe.\"\"\"\n", + " return self.df\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "688bdf74", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Salary cleaning DataFrame test passed!\n" + ] + } + ], + "source": [ + "# Test DataFrame with edge/fail cases\n", + "fail_df = pd.DataFrame({\n", + " \"Salary\": [\n", + " None, # NaN\n", + " \"\", # NaN\n", + " \" \", # NaN\n", + " \"abc123\", # NaN\n", + " \"50k-abc\", # 50000.0\n", + " \"$-5000\", # -5000.0 (still allowed for now)\n", + " \"∞\", # NaN\n", + " \"NaN\", # NaN\n", + " \"$1.5M\", # NaN ( >1,000,000 rule)\n", + " \"70,000—80,000\" # 75000.0 (dash normalized)\n", + " ]\n", + "})\n", + "\n", + "# Run through cleaner\n", + "cleaner = DataCleaner(fail_df)\n", + "result = cleaner.clean_salary().finalize().reset_index(drop=True)\n", + "\n", + "# Expected results as DataFrame\n", + "expected = pd.DataFrame({\n", + " \"Salary\": [\n", + " None, # None\n", + " None, # empty string\n", + " None, # whitespace\n", + " None, # abc123\n", + " 50000.0, # 50k-abc\n", + " 5000.0, # negative salary\n", + " None, # infinity\n", + " None, # \"NaN\"\n", + " None, # 1.5M filtered out\n", + " 75000.0 # range with em dash\n", + " ]\n", + "}, dtype=\"float64\").reset_index(drop=True)\n", + "\n", + "# Assertion 
test\n", + "pdt.assert_frame_equal(result, expected)\n", + "print(\"✅ Salary cleaning DataFrame test passed!\")" + ] + }, + { + "cell_type": "markdown", + "id": "6ddbb4c0", + "metadata": {}, + "source": [ + "## Generate report \n", + "\n", + "- Overall completion of program only accounting for the new style of classes m1-m4\n", + "- completion by year \n", + "- completion over all by pathway \n", + "- completion by year by pathway \n", + "- Feel free to get creative here adding gender etc to get us a better understanding \n", + "- education level and the above... \n", + "- export this as a txt file " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d6485e5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "859cf674", + "metadata": {}, + "source": [ + "## Plots \n", + "- Look at the various plots \n", + "- make a consistent color scheme\n", + "- pick the plots that go with the report above \n", + "- make missing plots \n", + "- make plots have the option to show & save in the functions\n", + "\n", + "see `src/notebooks/visualization_examples.ipynb`\n", + "See below from `src/Carmen_WORCEmployment_Plots.py`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81009a87", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_salary_by_gender(data):\n", + " plt.figure(figsize=(8, 5))\n", + " sns.boxplot(data=data, x='Gender', y='Salary')\n", + " plt.title(\"Salary Distribution by Gender\")\n", + " plt.show()\n", + "\n", + "\n", + "def plot_avg_salary_by_city(data):\n", + " region_salary = data.groupby('Mailing City')['Salary'].mean().sort_values()\n", + " region_salary.plot(kind='barh', figsize=(8, 5), title=\"Average Salary by KY Region\")\n", + " plt.xlabel(\"Average Salary\")\n", + " plt.show()\n", + "\n", + "\n", + "def plot_placements_over_time(data):\n", + " data.set_index('Start Date').resample('M').size().plot(kind='line', marker='o', figsize=(10, 4))\n", + " plt.title(\"Number of Placements Over Time\")\n", + " plt.ylabel(\"Placements\")\n", + " plt.show()\n", + "\n", + "\n", + "def plot_placement_type_by_program(data):\n", + " plt.figure(figsize=(10, 6))\n", + " sns.countplot(data=data, x='ATP Placement Type', hue='Program: Program Name')\n", + " plt.xticks(rotation=45)\n", + " plt.title(\"Placement Type by Program\")\n", + " plt.show()\n", + "\n", + "\n", + "def plot_top_cities(data):\n", + " city_counts = data['Mailing City'].value_counts().head(10)\n", + " city_counts.plot(kind='bar', title='Top Cities by Participant Count', figsize=(8, 4))\n", + " plt.ylabel(\"Count\")\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f905708f", + "metadata": {}, + "source": [ + "TOC generator " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d4fc7116", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- ✅ Copy the Markdown below and paste it into a new markdown cell ---\n", + "\n", + "### **Table of Contents**\n", + " * [**Table of Contents**](#**table-of-contents**)\n", + " * [Function To Read in the Data!](#function-to-read-in-the-data!)\n", + " * [Example usage](#example-usage)\n", + " * [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n", + " * [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n", + " * [Update cleaning code](#update-cleaning-code)\n", + " * [Generate report](#generate-report)\n", + " * [Plots](#plots)\n", + "\n" + ] + } + ], + "source": 
[ + "import json\n", + "import os\n", + "\n", + "\n", + "def generate_toc_from_notebook(notebook_path):\n", + " \"\"\"\n", + " Parses a local .ipynb file and generates Markdown for a Table of Contents.\n", + " \"\"\"\n", + " if not os.path.isfile(notebook_path):\n", + " print(f\"❌ Error: File not found at '{notebook_path}'\")\n", + " return\n", + "\n", + " with open(notebook_path, 'r', encoding='utf-8') as f:\n", + " notebook = json.load(f)\n", + "\n", + " toc_markdown = \"### **Table of Contents**\\n\"\n", + " for cell in notebook.get('cells', []):\n", + " if cell.get('cell_type') == 'markdown':\n", + " for line in cell.get('source', []):\n", + " if line.strip().startswith('#'):\n", + " level = line.count('#')\n", + " title = line.strip('#').strip()\n", + " link = title.lower().replace(' ', '-').strip('-.()')\n", + " indent = ' ' * (level - 1)\n", + " toc_markdown += f\"{indent}* [{title}](#{link})\\n\"\n", + "\n", + " print(\"\\n--- ✅ Copy the Markdown below and paste it \"\n", + " \"into a new markdown cell ---\\n\")\n", + " print(toc_markdown)\n", + "\n", + "\n", + "notebook_path = 'mainNb.ipynb'\n", + "generate_toc_from_notebook(notebook_path)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}