from pathlib import Path
import os
import sys
import re

# NOTE(review): `pd` is used throughout but its import is outside the visible
# diff hunk; presumably it sits earlier in the notebook's import cell.
import pandas as pd
import pandas.testing as pdt


class DataCleaner:
    """Fluent cleaner that works on a private copy of a DataFrame.

    Each cleaning step returns ``self`` so calls can be chained; call
    ``finalize()`` to obtain the cleaned frame.
    """

    # Compiled once at class creation instead of on every parsed cell.
    _DASHES = re.compile(r"[–—−]")  # en dash, em dash, Unicode minus
    _NOISE = re.compile(r"[\$,\s]")  # currency sign, thousands separator, spaces
    _SHORTHAND = re.compile(r"(\d+(?:\.\d+)?)([kKmM])")
    _MULTIPLIERS = {"k": 1_000, "m": 1_000_000}
    _MISSING_TOKENS = frozenset({"nan", "none"})

    def __init__(self, df: pd.DataFrame):
        # Copy so the caller's frame is never modified.
        self.df = df.copy()

    @classmethod
    def _parse_salary(cls, raw):
        """Parse one raw salary cell into a float, or None if unparseable.

        The value is stringified first, so numbers, ``None`` and NaN cells
        are handled the same way as text.
        """
        text = str(raw).strip()
        if not text or text.lower() in cls._MISSING_TOKENS:
            return None

        # Treat every dash variant as a plain hyphen.
        text = cls._DASHES.sub("-", text)

        # Range such as "50k-70k" or "50,000-70,000": average the parts that
        # parse. A leading sign is also read as a separator, so "$-5000"
        # yields 5000.0 (behavior pinned by the notebook test below).
        if "-" in text:
            parsed = [
                cls._parse_salary(part)
                for part in text.split("-")
                if part.strip()
            ]
            bounds = [value for value in parsed if value is not None]
            return sum(bounds) / len(bounds) if bounds else None

        # Strip "$", "," and any whitespace (e.g. "$ 50k", "50 000").
        text = cls._NOISE.sub("", text)

        # Shorthand suffix: "50k" -> 50000.0, "1.5M" -> 1500000.0
        shorthand = cls._SHORTHAND.fullmatch(text)
        if shorthand:
            number, suffix = shorthand.groups()
            return float(number) * cls._MULTIPLIERS[suffix.lower()]

        # Plain number (integer or float)
        try:
            return float(text)
        except ValueError:
            return None

    def clean_salary(self, hours_per_year: int = 2080):
        """
        Clean and standardize salary values in the DataFrame.

        Steps performed:
        1. Remove currency symbols, commas, whitespace, and shorthand
           (e.g., "$50k" → 50000).
        2. Handle ranges by converting them to the average value
           (e.g., "50,000–70,000" → 60000). Hyphen, en dash, em dash and
           the Unicode minus sign are all accepted as separators.
        3. Handle shorthand "M" (e.g., "$1.5M" → 1,500,000).
        4. Convert values to float64, with invalid entries becoming NaN.
        5. Treat values <= 200 as hourly wages and convert to annual salaries
           (multiplied by `hours_per_year`).
        6. Drop unrealistic values greater than 1,000,000 (set to NaN).

        If the frame has no "Salary" column nothing happens. If a step fails,
        a warning is printed and the column is left exactly as it was.

        Parameters
        ----------
        hours_per_year : int, optional (default=2080)
            Number of work hours in a year for converting hourly to annual salary.

        Returns
        -------
        self : object
            The current instance with the cleaned Salary column.
        """
        if "Salary" not in self.df.columns:
            return self

        try:
            # Explicit float64 keeps the column numeric even when every
            # entry is unparseable (None becomes NaN).
            salary = pd.Series(
                [self._parse_salary(cell) for cell in self.df["Salary"]],
                index=self.df.index,
                dtype="float64",
            )

            # Convert small numbers (hourly) to annual
            salary = salary.mask(salary <= 200, salary * hours_per_year)

            # Drop unrealistic salaries
            salary = salary.mask(salary > 1_000_000)
        except Exception as e:
            # Best-effort step: report the problem, leave the column untouched.
            print(f"[Warning] Failed salary cleaning: {e}")
        else:
            # Assign only after every step succeeded.
            self.df["Salary"] = salary

        return self

    def finalize(self):
        """Return cleaned dataframe."""
        return self.df


# ### Sample use of the clean_salary function.
test_df = pd.DataFrame({
    "Salary": ["$50k", "10", "50", "60,000", "70,000-80,000", "100k", "150000", "200", "3000", "5000000", "$1.5M", "invalid", 70]
})

# Create instance with test DataFrame
cleaner = DataCleaner(test_df)

# Run salary cleaning
cleaner = cleaner.clean_salary(2080)

# Get the cleaned DataFrame
result_df = cleaner.finalize()
print(result_df)


fail_df = pd.DataFrame({
    "Salary": [
        None,              # NaN input
        "",                # empty string
        "   ",             # whitespace only
        "abc123",          # text + numbers
        "50k-abc",         # malformed range
        "$-5000",          # negative salary
        "∞",               # infinity symbol
        "NaN",             # literal string NaN
        "$1.5M",           # millions: parsed, then dropped by the > 1,000,000 rule
        "70,000—80,000"    # em dash (—) instead of hyphen/dash
    ]
})
# Create instance with failing DataFrame
fail_cleaner = DataCleaner(fail_df)
# Run salary cleaning on failing DataFrame
fail_cleaner = fail_cleaner.clean_salary(2080)
# Get the cleaned DataFrame
fail_result_df = fail_cleaner.finalize()
print(fail_result_df)


# Test DataFrame with edge/fail cases
fail_df = pd.DataFrame({
    "Salary": [
        None,              # NaN
        "",                # NaN
        "   ",             # NaN
        "abc123",          # NaN
        "50k-abc",         # 50000.0
        "$-5000",          # 5000.0 (the sign is read as a range separator)
        "∞",               # NaN
        "NaN",             # NaN
        "$1.5M",           # NaN ( >1,000,000 rule)
        "70,000—80,000"    # 75000.0 (dash normalized)
    ]
})

# Run through cleaner
cleaner = DataCleaner(fail_df)
result = cleaner.clean_salary().finalize().reset_index(drop=True)

# Expected results as DataFrame
expected = pd.DataFrame({
    "Salary": [
        None,      # None
        None,      # empty string
        None,      # whitespace
        None,      # abc123
        50000.0,   # 50k-abc
        5000.0,    # negative salary
        None,      # infinity
        None,      # "NaN"
        None,      # 1.5M filtered out
        75000.0    # range with em dash
    ]
}, dtype="float64").reset_index(drop=True)

# Assertion test
pdt.assert_frame_equal(result, expected)
print("✅ Salary cleaning DataFrame test passed!")