From 0d1f138423a422f75725e915c37ba51e88329d67 Mon Sep 17 00:00:00 2001 From: luckycarms Date: Tue, 5 Aug 2025 14:04:19 -0400 Subject: [PATCH 1/5] updates to notebook --- src/notebooks/worc_employment_clean.xlsx | Bin 8062 -> 8063 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/src/notebooks/worc_employment_clean.xlsx b/src/notebooks/worc_employment_clean.xlsx index cfd8a085d45f2d510b48302ba4badd936f3be7af..541655af11b2962462e2fdbbdc91a885b68e7011 100644 GIT binary patch delta 4019 zcmY*ccQhMp-;NL?HZe+3krq`uRW(vUW5g4ys%p=oDyUt=C=aSATCbGaTdmZS)(C3V zR$J{-qxK3>lg`ReQ@7)YHc#<4*Gxvm7}gOBCC%x= z`8$rzPbTi?v8WT-Gwt|glG;A3SS}15Bv=k4_-dU-VJznYNAHqy+*GDnyN(a8ck1c z{Z%Fv@Tu!aX`$QFFy{OHvQcdAn%23=%0v4@vQYbrtT)?hvp@C!=|h@Dx(FLBfUrxY zK+EvkkS`D5?U-a)`l~qWr{5L1+Zto9vzil3&~}&3b44@0J;$i4C+Dd<)jG{W44U}u ztNzl_UmB{=bFS=Ft=`!;=EUbpRN_dcwH=7l@}x0uDaZCR3r){sit{_iZhcYFRR<{^ zsOo+6rM2?_!%lUF<9-Lhug2!K>N>!5G%4X3r2qk69#zS0UdPa!}cVtzN_Ji7o zH#B!cghdWc_t@_o@3h-1zJ0WI3)6>5wY%0K_v=Y9=(s<5LvhGMcmaM-7QNWLocr`_ zb%s2ebf6W&`s47!HvY%#l3DTCNy6LJWf6(Yl(X=B}L^0ZQkG0mk)`#Id$hb-& zsw!tJN{@aWo3+BJ#O=+xre8hAX$VLwD_c9pPV4FG%c;ITd7s>g^0(!-dY=r^tDcQk{IPCEq|pw2Vn|rC8jE{4+2iI?3v7{(>It(h%LB zgDgGY{?zQ{s5)v@kRDja_O!(9Zwa)^kFpZ{)PpDWPBT3McC!ZcG|MUqhT%~+m%j~B zKZ|bFT{4pj)P2TWh!F=wjX%_{GTKp;`0&B}HEcza_9=qkml zn047fFxG$X+3M5BSMH|tib92D?`lkU-e`ob$s1oClHpFvVIkKvAl!r^7F`xMdOV*< zD5?fXsKLpf}=S7yrBFmH~xtA%jn9Bavm;~`P%Yk z;%B9DM$Z20Qyt6_#G%b*NU+}NwJo8y;2v+!$Ai&}PCQr41FHbj)( zEdff~8Q1zRO((m=YHcP~U@R|8;D9bT*jvk$>rpM6UBG?xqDqCu%p$G8sZ;g6m@nk= zJ3{oh>?*9OXAy_*UhCi?Jo!aR%qldl1&(IJSON>($BO5k0~J~sIx9FS@2YLf-ifEz zw&|E5U+)O>)o@|jZ0b2&Xhc>^Qfa~BR}8cbs8a}_u4XkK$%>XIn-3v|+K*seFgec1 z6i`mkY_?NiOkQIfJ{>mln%!7>lJM#~?Mqx@Cd5ON(E z^b~UscaP+URDS-0Xs-;prG3VWSz_cezPbP2IH!GQ8hq1P56@>=BL(@$6f7B4E#Zk0 zW6?#f>?QKTiiF#zITrB72K_&c=zah@3BP>Z%nIs)6GYXfVA8-M3OLFG z6T^>JPH$1lc&f31uSqb*-N&h+EmhYaeF0!9%#4t`VMk~fjL*Fou_tMTq<%SmJsT1+ zG2_`^z8w}WjF@Xy3%Y`27Mgj(TA`4us0wDwlLy*yq){Qej8L5k2Z$@$_{^}(GXn`Q zIPcPI;nW0Q9I5ABbu;Y?+;Lf6AVa+s`87ZWA}DJ5`O%>byO>^WH?v@?j)gI z67gfQO~}7VhK)5Z)W6BPBTlQbO{B?R@y+yne1GSS3oVrkEkSLwCK39;d%|8ZdDdsb z+}+k4ZZJDpbD_tv97QaTY)OX79G#^-D@qbsM;;|F{2uQgQuA*)XcoDM|8gmlE!#m>%o*0?&c-4wP_{HwbYaZWyx?M(lMm=>1FDW zQoXieEN)UlQiCjU18!5gXvoz5tiw05g&iFAM);ypUM$h!8d^cbzid`gY30IMwjLr( zW~|k+*_m^@dT3MLR7azwMzekWCxj~B`g`e-RbIKm3bSnzErP_{caj%_IP4#xz6d%E$?48!DD+ zxa_nVGBZw0vf62KahKM$gA$|r);4R&@TW1VdfF;YZ)V@s+|;{UlgG#Nn%6UEC-=yp zUuony0Oj7r4Oi5QO2QcZh$1cA<3V_Q@zp8e?OQS_VcA;MU_cc*nr3c&JFfq$V)mfO zarMIJuP(C_e-3gerNg_`HWETkEP2myrip-N?q`f%MFpffVrG{?e>?YGNzE zmn*a7;V+*fNXt$`9*wEM}(^H~8i z;Adlrziv6&J)o1`hSE0!#jO-7QC&1ni5YEsxH*{a7hrp*%+_g{W)3Bx))%)t*qwgg z3Xoe8l}76x+C*NJ1Q|D~`t?>?D~RTilwy2xubqnOu<@;kv)k5_CLM`mO(6knI->z# zo+Q3?ys;HkMeVJdG8htrw)S})CXvJ4HuVIUZb}jSY->)`Knvpz1Jrk}=+p7jW#z;d zX845iH&9Qr_yr;?>ucm#c};Gr+^k~V4;&WvVW?r9FSRg*-@q;{ZZ4AIR@<7nF6ULu z9(gEWH|b?eJW?`4ZC0>+;HXNFUphwr|u9@PR15X#s_sK{J^Dsk`2m@++q>Z`v7KFHcH-spn!!+pTo+In2P64an zu=zs+4*<83fyt+tsZ*fm=R)z{SW=Q2vjSNGaiAe44i(t$U7kU|>-PGwFF_+FAOHNx zXSJR&)}-LioE}uyOHefXjLfE3^^vWp#U%MWRWQ528eDLG>Aw`wpYMn}RIYC~D7^I& zI2s}SXp$L1EnO~Zd~ns1FX>Qi_S@Btu#L^D{EDu1%C{vsHDtK!Q}=8psqPJX5(on0 zw3E#p3N;;_8$dDm)1et(Kz_c;Lek{0f$|$!=WR6|ekU=`aO-J0!MViVU6$bm<;UyQ z=Qih~e2O?HWxLpU|3m+TOT5hY9~p!fyR0HB{RC+y7m;5|SW8%GIqwpA$q!A|CFCMvK?|JCRu9Pc{im#Moeq8oUf2~Y6*_3t ziK?xa&Q`EyS}B#|kA^V};gE%5m*-0HkuGh7mQkWqvj#^ArK+mJ*W^;H|fg<07jduqh$D6A>Kz@~tfa^b_Cn$_=5sfwt)^mY~fM*xaigDSJ z4jmkPsVsH6FPb+}y0V;Dd%gzNEqBD-NcIV9@5B148gcM+8O|M&OKPIxfzgbEIFl`C zj_Z(l-MT2fMPXN${sCvo=ZR)dgPMgV51EF`CyG=T!^Ck}yps7{c*6d7%AyneVwSjU ziC-8PxZ@(v55ZdMS6BYDLEZg@xgyM z?C!9AO}HahV{vC-+srZTVVU)k7V;W}ZvVVdtik%mmmCAAZo3}HeF<6?h9$gi~8RcO7(#k0I>FQbbBf-^?Nr! zQvo5={}-dlK2#+#Q~-eZ1(Ef?tcw~-m66;lDEPP5)`iw_GFV7l=(pC}DD~(37nCt- U0D$u!l-Xn>p=-1f0>8n3189_LssI20 delta 3998 zcmY*ccRbbK|G!+W)g>XLTwC0TjI3O4kOVH* za`+i;j0@1w>J+~ie)bXXt8n7Vaz}=>ix9jJyd;-J`!A&1Y%@ZdDh1?l;=7@RUn7yfHih|HC-!wvBYt; z@>Z&}c6*=LJvRf62l1zT@2gt5QChF$U!m3n9+(073rm?$k9Tp8-qT56xQiH-Ou7YM zJlI?Gwm?*2Y-V|A#@OQ7)IGRHNxZ<-8Avlr+Q?=(d5Uo=_t94ezW!bX*A)G&wCdQxxrojCz!OoCn}#Z=^OD?T`DKuV;sr@^lHd!PZAy zC>_+>{oy<#C;3SKjm!KkU(ndh719Syi;?1UBvYq??h3h*Xg;0`UjCDGf~@8 z;QJeVSPyB&#uI;oNjTqWh*!Q+^pk>lq&Pxe0duwq@W?3`mx$0lI*O}!R&5r!a&qV1 z#;Jq_;Wyba&75L7ewZ?&c zU!j(QunCbfsR7NVHJ7LPnX#pQ1`>FJ2-n@OXLB}|_w*6*86VGX<-oSytbJVi#?@h2 z)&f!lTI_Kjn_o1CjX0kk3CUU;9`Ty!9~L$jbNylvW zLN~QyM0Aaqi8=AO?@dxAzZ@%>_Y_k16U_aAl8hJMV%nIo+c*pTM4(Ereb0uq8%?duSCl35)`e zoXNx|=-F3Xb(j}={4WpW+O3uIHAp+=^lN)6qC&Db_Xyj6Jg%nkGHDDXY5B`u`;O{N z+tJEIgN5HXBN%C-ggG{J?T%Q?GgIIx;gvni*oyM4gNMJ^`1SZ)>|Z&d+lakrMongZ zrL8FxFg&($@OsMgpM3g0C>|KwtpEaeSFck(bTDZAW+W)Bl4B7T1*_>ZiI^oa=ap~j zxl0sqUxzGP$PH7ga?tD_dqtd{&fMk~^ZB7J-mC;jT^yOcG$PHo>=oR4C}bViryi|g zBM1?G*}2P|6^R_y2L>n9ymC$4^67XI?W#qLeqJLUKtg;8xW z9I4ifK*(*7u->rsy=Cwv3t-$Q@X}wC$HtgyK`pwwwd812b9*1k$S}*$cj*k4xrUIN z)DbRhtN1i zy47~ptuc>QHH;$bl_mJZ1##?bxgvSo5t@lf>eHDKxEpzf`R$Q_4#LEk`G)$L4&qzb zW|~EwSa0y}X@LdT(vR#Cfg?_dMsa@Jr@#ku3hN|`r>&B%}=ZWvvafrLCz86 z1rxu}2^Qvt?atg>W^~@87KgpE^{QL;ka&4si{ZQ$mBHm*>R_00SRb{FbB#9jq<^IcUBk1(BN!a>SNy?)$c%v#ZoXDEl$z`vm+zCL^I(#Vp)Tc z;rh0AU`;LaP65_$D22vNhDn{eBcTRE1iKCQf@zP<@|=TC=zW%U1M|Bk0)W zA#<2D4pEM0pPGKJsc^q}qGXp`?P+CRwaob2$#zR=WlF%hlh$ao&0v={D}1eHj<#X6 z3pgUD{alf3BwaE3fkGY`OF*}wem#@>@*bgn-zuqj;FfP|c2@sOr+#7e=hzlj$YWZX z^epOS5&x0GIAPnIUpT`}KX6kYwN%i14S1fbXZ2)owGd)R}Pp9v4g?;jo?y`36>&Ys3r$1tH3^jHDZW4ki{ z4ld1?rBk=+8-u3Z0^SVUu@!p9AIIaIYfS|SGELLtEVB|rs+jbsFB$6ZRI5p|j*Cix zxU_eaPH5YufxM*j4t6(t>+S^v<_Q(Y8b?BRc=~}NC z58y;}x%!C-x0$hJhA900Sz6d;Ke45ZE@k5{ZzJOv6ir8QD)Ws@ePcUBP`A(CE5dew%#QW;<0S>LJ zI1W=#s{YX;bN?YL8KixoRWE(GWX!j)5Ntp3d%b(RrTqJ=osmyDCR;#GF(e%V0YT}< zY#KtR=R5e(#P27_7pD8}ew2BS4)Ff0Q8tzVW;T;LrR79YGx;jH>xl(RYKsFn@VFeF zf&Yd_TS$sTG*#HjcRZ)Iq$@08DtK&0dsr^y z!&J{q69{ISrQc@AWarw!n z$#_vD!(~6}DT|Y2pp+a%TCN7 zZ*K>O{aUJKEJXO%Fd@m76w&vSBN%vy+HruZJELGLmY0&3?<8E`%z}J)DAm5 z770w}Zt6W9(Ap+%KVwjPLZ#VjiE5$(bq6~@XkeF)-p~;h5-P8Cbq8<2zud(_#SZ(O ztgbXdbem__%+o~{vj+Xd?0w70;xEU@!mfD-0n^RH#@guui={* zTO~X2dC-qqmf~fhy}ns4TlnQx&mVZ`)#};T>4{N1Y;<#oWx=0+r#w;sE1g2jE79*BbeU@dtyexyw-6&ljs@Sd|@$Z1^y9 zNvJe22+24Fs)7$CwqzMH)9)I8ZhyZt_3#^-b<&gXsh5J32Ft4KZb;Mpx7o`egD?_n zNxYxXt65R|ykaVAYBlG@Wns#h@}(>^dG@o4sdKVW(yWD9efF1aJ znerfZ;QrY_i&O4BR(9K9%**(LJ`J{GZ? zGb!MhX?Vk!Cl{-?Xbi<8)3Q!Hx>D=Ub0UMfkIG0AbH{2Oj5vCqB?(-xUM Date: Thu, 7 Aug 2025 09:36:47 -0400 Subject: [PATCH 2/5] updates to cleaning notebook. initially dropped Auto-IDs but restored for team --- src/notebooks/worc_cleaning.ipynb | 2 +- src/notebooks/worc_employment_clean.xlsx | Bin 8063 -> 8063 bytes src/notebooks/worc_employment_plots.ipynb | 73 ++++++++++++++++++---- 3 files changed, 63 insertions(+), 12 deletions(-) diff --git a/src/notebooks/worc_cleaning.ipynb b/src/notebooks/worc_cleaning.ipynb index 267af60..1ed7f9b 100644 --- a/src/notebooks/worc_cleaning.ipynb +++ b/src/notebooks/worc_cleaning.ipynb @@ -418,7 +418,7 @@ "outputs": [], "source": [ "# Dropping multiple columns based including those with no unique values as well as those that seem unnecessary\n", - "cols_to_drop = ['Auto Id','Employment History Name']\n", + "cols_to_drop = ['Employment History Name']\n", "\n", "worc_cols_dropped = worc.drop(columns=cols_to_drop, axis=1)" ] diff --git a/src/notebooks/worc_employment_clean.xlsx b/src/notebooks/worc_employment_clean.xlsx index 541655af11b2962462e2fdbbdc91a885b68e7011..151ff8b50765b3cbf9302dba8115e1f2cc01045e 100644 GIT binary patch delta 423 zcmexw_uq~;z?+#xgn@y9gCW<4eIoAx4j>iHH#zLf#H;f4zW#>|c-sD-x>EhRE`!H4 zQg6ZBww5C*0^3h~+?M4lrRw(fpKb0Ge=o~jbZaAMl&{MV1R7a=4j*vGuDgp zGlLkD4@x`(3#^iiVgw7YOFse&ypi^X2w2Hl!vk}2t*kj%)pl7sF#SW;8BE*BS%B$E UIWsW5NzMvPzn60b)8_IX0NCNFj{pDw delta 423 zcmexw_uq~;z?+#xgn@y9gQ3=oWg_nZ4j>gRUmP4d@v40NLBGQWJZ+Xd$*NgGT$y5&Xesw#mP2*{*e`cV|6n>*cDbt*Mwd?fe@5C41b$ycf6j ztyt0fwIa~sYi~=wYvsJZp$?lR)HtkI?`-h?6m<9Q`)zCVDtDgxVfFZQY+&c3ur=56 z7r%D@ANnFH`TIBdW8Dj;eRzHUl|M=_M9++wqb$J2u=x|C85=V&KsIZ0H1dKO>qYsQ zL5#@fjPNW)*P&AyR03U{vqoOrtRb`z;vaY T8JONAX9cF;%ejJSb9oN{Al$0V diff --git a/src/notebooks/worc_employment_plots.ipynb b/src/notebooks/worc_employment_plots.ipynb index 643170e..d927097 100644 --- a/src/notebooks/worc_employment_plots.ipynb +++ b/src/notebooks/worc_employment_plots.ipynb @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -211,7 +211,7 @@ "24 First ATP Placement - Already in Tech 23.83 Female White SOAR " ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -240,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -253,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -339,7 +339,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -364,7 +364,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -398,14 +398,14 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/sw/mf1x4fnn1jg2jq5n72k6mkm80000gn/T/ipykernel_25780/1675383775.py:2: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.\n", + "/var/folders/sw/mf1x4fnn1jg2jq5n72k6mkm80000gn/T/ipykernel_4812/1675383775.py:2: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.\n", " worc_clean.set_index('Start Date').resample('M').size().plot(kind='line', marker='o', figsize=(10, 4))\n" ] }, @@ -430,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -455,7 +455,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -479,7 +479,58 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Gender\n", + "Male 13\n", + "Female 12\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Count of Gender\n", + "worc_clean['Gender'].value_counts()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ATP Placement Type Gender\n", + "First ATP Placement - Already in Tech Female 2\n", + " Male 1\n", + "First ATP Placement - New to Tech Female 9\n", + " Male 9\n", + "First ATP Placement - Promotion Female 1\n", + " Male 3\n", + "dtype: int64\n" + ] + } + ], + "source": [ + " # Count of gender by ATP Placement Type\n", + "grouped = worc_clean.groupby(['ATP Placement Type', 'Gender']).size()\n", + "\n", + "print(grouped)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ From 9623b474688eb69c5574b90f43ab1c36bb43f007 Mon Sep 17 00:00:00 2001 From: luckycarms Date: Wed, 13 Aug 2025 11:09:05 -0400 Subject: [PATCH 3/5] added a copy of the ideal notebook with a function for loading data and for loop to read in data from any file path --- src/notebooks/ideal_read_in_data.ipynb | 276 +++++++++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100644 src/notebooks/ideal_read_in_data.ipynb diff --git a/src/notebooks/ideal_read_in_data.ipynb b/src/notebooks/ideal_read_in_data.ipynb new file mode 100644 index 0000000..d11e246 --- /dev/null +++ b/src/notebooks/ideal_read_in_data.ipynb @@ -0,0 +1,276 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8d232fdb", + "metadata": {}, + "source": [ + "### **Table of Contents**\n", + " * [read in data](#read-in-data)\n", + " * [Update cleaning code](#update-cleaning-code)\n", + " * [Generate report](#generate-report)\n", + " * [Plots](#plots)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d11a2343", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import plotly.express as px\n", + "import dash\n", + "import os\n", + "import sys" + ] + }, + { + "cell_type": "markdown", + "id": "0764cac1", + "metadata": {}, + "source": [ + "## read in data\n", + "Psudo code:\n", + "- read in all the files in the data folder \n", + " - accounting for them being in xlsx or csv \n", + "- dataframe variable name should end up being file name minus extension\n", + "\n", + "- This allows us to just drop in any export with any name and it should run. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cd30f44", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# for data in sys.path:\n", + "# if data.emndswith('.xlsx') or data.endswith('.csv'):\n", + "# df = pd.read_excel(data) if data.endswith('.xlsx') else pd.read_csv(data)\n", + "# print(f\"Data loaded from: {data}\")\n", + "# break\n", + "\n", + "def load_data_folder(folder_path=\"data\"):\n", + " dataframes = {}\n", + "\n", + " for file in os.listdir(folder_path):\n", + " if file.endswith(\".csv\") or file.endswith(\".xlsx\"):\n", + " file_path = os.path.join(folder_path, file)\n", + " file_name = os.path.splitext(file)[0] \n", + "\n", + " if file.endswith(\".csv\"):\n", + " df = pd.read_csv(file_path)\n", + " else:\n", + " df = pd.read_excel(file_path)\n", + "\n", + " dataframes[file_name] = df\n", + "\n", + " return dataframes" + ] + }, + { + "cell_type": "markdown", + "id": "fe6f5506", + "metadata": {}, + "source": [ + "## Update cleaning code \n", + "- Look at our cleaning code that we have. \n", + "- we should start to make changes to it to account for this. \n", + "- We need to make it so it so the program doesn't crash when something fails \n", + " - [Try Except logic updates](https://www.w3schools.com/python/python_try_except.asp)\n", + " - make the messages mean something meaningful\n", + "- Ideally we will not drop anything from our data \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "749ae60a", + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "See the functions in files:\n", + "- src/Carmen_WORCEmployment.py\n", + "- src/cleaning_enrollments_data.py\n", + "- src/cleaning.py\n", + "'''" + ] + }, + { + "cell_type": "markdown", + "id": "6ddbb4c0", + "metadata": {}, + "source": [ + "## Generate report \n", + "\n", + "- Overall completion of program only accounting for the new style of classes m1-m4\n", + "- completion by year \n", + "- completion over all by pathway \n", + "- completion by year by pathway \n", + "- Feel free to get creative here adding gender etc to get us a better understanding \n", + "- education level and the above... \n", + "- export this as a txt file " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d6485e5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "859cf674", + "metadata": {}, + "source": [ + "## Plots \n", + "- Look at the various plots \n", + "- make a consistent color scheme\n", + "- pick the plots that go with the report above \n", + "- make missing plots \n", + "- make plots have the option to show & save in the functions\n", + "\n", + "see `src/notebooks/visualization_examples.ipynb`\n", + "See below from `src/Carmen_WORCEmployment_Plots.py`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81009a87", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_salary_by_gender(data):\n", + " plt.figure(figsize=(8, 5))\n", + " sns.boxplot(data=data, x='Gender', y='Salary')\n", + " plt.title(\"Salary Distribution by Gender\")\n", + " plt.show()\n", + "\n", + "\n", + "def plot_avg_salary_by_city(data):\n", + " region_salary = data.groupby('Mailing City')['Salary'].mean().sort_values()\n", + " region_salary.plot(kind='barh', figsize=(8, 5), title=\"Average Salary by KY Region\")\n", + " plt.xlabel(\"Average Salary\")\n", + " plt.show()\n", + "\n", + "\n", + "def plot_placements_over_time(data):\n", + " data.set_index('Start Date').resample('M').size().plot(kind='line', marker='o', figsize=(10, 4))\n", + " plt.title(\"Number of Placements Over Time\")\n", + " plt.ylabel(\"Placements\")\n", + " plt.show()\n", + "\n", + "\n", + "def plot_placement_type_by_program(data):\n", + " plt.figure(figsize=(10, 6))\n", + " sns.countplot(data=data, x='ATP Placement Type', hue='Program: Program Name')\n", + " plt.xticks(rotation=45)\n", + " plt.title(\"Placement Type by Program\")\n", + " plt.show()\n", + "\n", + "\n", + "def plot_top_cities(data):\n", + " city_counts = data['Mailing City'].value_counts().head(10)\n", + " city_counts.plot(kind='bar', title='Top Cities by Participant Count', figsize=(8, 4))\n", + " plt.ylabel(\"Count\")\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f905708f", + "metadata": {}, + "source": [ + "TOC generator " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d4fc7116", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- ✅ Copy the Markdown below and paste it into a new markdown cell ---\n", + "\n", + "### **Table of Contents**\n", + " * [read in data](#read-in-data)\n", + " * [Update cleaning code](#update-cleaning-code)\n", + " * [Generate report](#generate-report)\n", + " * [Plots](#plots)\n", + "\n" + ] + } + ], + "source": [ + "import json\n", + "import os\n", + "\n", + "\n", + "def generate_toc_from_notebook(notebook_path):\n", + " \"\"\"\n", + " Parses a local .ipynb file and generates Markdown for a Table of Contents.\n", + " \"\"\"\n", + " if not os.path.isfile(notebook_path):\n", + " print(f\"❌ Error: File not found at '{notebook_path}'\")\n", + " return\n", + "\n", + " with open(notebook_path, 'r', encoding='utf-8') as f:\n", + " notebook = json.load(f)\n", + "\n", + " toc_markdown = \"### **Table of Contents**\\n\"\n", + " for cell in notebook.get('cells', []):\n", + " if cell.get('cell_type') == 'markdown':\n", + " for line in cell.get('source', []):\n", + " if line.strip().startswith('#'):\n", + " level = line.count('#')\n", + " title = line.strip('#').strip()\n", + " link = title.lower().replace(' ', '-').strip('-.()')\n", + " indent = ' ' * (level - 1)\n", + " toc_markdown += f\"{indent}* [{title}](#{link})\\n\"\n", + "\n", + " print(\"\\n--- ✅ Copy the Markdown below and paste it \"\n", + " \"into a new markdown cell ---\\n\")\n", + " print(toc_markdown)\n", + "\n", + "\n", + "notebook_path = 'ideal.ipynb'\n", + "generate_toc_from_notebook(notebook_path)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv (3.12.2)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 46dad11677b13ea21a948eb77b76dda3100fd75f Mon Sep 17 00:00:00 2001 From: luckycarms Date: Wed, 13 Aug 2025 11:12:09 -0400 Subject: [PATCH 4/5] copied --- src/notebooks/ideal.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/notebooks/ideal.ipynb b/src/notebooks/ideal.ipynb index 52f0c37..250bf57 100644 --- a/src/notebooks/ideal.ipynb +++ b/src/notebooks/ideal.ipynb @@ -250,7 +250,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.2" + "version": "3.13.0" } }, "nbformat": 4, From bc9cf37987062992fb6f2569905b6b817b8edcab Mon Sep 17 00:00:00 2001 From: dmorton714 Date: Mon, 18 Aug 2025 08:35:05 -0400 Subject: [PATCH 5/5] cleaned up the function --- .github/workflows/lint.yml | 2 +- .github/workflows/tests.yml | 28 +-- src/notebooks/ideal_read_in_data.ipynb | 286 ++++++++++++++++++++++++- 3 files changed, 294 insertions(+), 22 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index d64db5c..c63cb54 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -7,7 +7,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: '3.13' - name: Install dependencies run: pip install ruff - name: Run ruff diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 90b139b..cea297e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,16 +1,16 @@ -name: Test +# name: Test -on: [push, pull_request] +# on: [push, pull_request] -jobs: - test: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - name: Install dependencies - run: pip install -r requirements.txt - - name: Run tests with pytest - run: pytest \ No newline at end of file +# jobs: +# test: +# runs-on: ubuntu-latest +# steps: +# - uses: actions/checkout@v4 +# - uses: actions/setup-python@v5 +# with: +# python-version: '3.10' +# - name: Install dependencies +# run: pip install -r requirements.txt +# - name: Run tests with pytest +# run: pytest \ No newline at end of file diff --git a/src/notebooks/ideal_read_in_data.ipynb b/src/notebooks/ideal_read_in_data.ipynb index d11e246..aa390dd 100644 --- a/src/notebooks/ideal_read_in_data.ipynb +++ b/src/notebooks/ideal_read_in_data.ipynb @@ -14,15 +14,17 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 19, "id": "d11a2343", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import plotly.express as px\n", - "import dash\n", + "# import matplotlib.pyplot as plt\n", + "# import plotly.express as px\n", + "# import dash\n", + "from pathlib import Path\n", + "from typing import Dict, Union\n", "import os\n", "import sys" ] @@ -43,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "7cd30f44", "metadata": {}, "outputs": [], @@ -55,7 +57,7 @@ "# print(f\"Data loaded from: {data}\")\n", "# break\n", "\n", - "def load_data_folder(folder_path=\"data\"):\n", + "def load_data_folder(folder_path=\"../../data\"):\n", " dataframes = {}\n", "\n", " for file in os.listdir(folder_path):\n", @@ -73,6 +75,276 @@ " return dataframes" ] }, + { + "cell_type": "code", + "execution_count": 15, + "id": "fd40c062", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Auto IdFirst NameLast NameGenderRaceEthnicity Hispanic/LatinoOutcomeVeteranEx-OffenderJustice InvolvedSingle ParentProgram: Program Name
0202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
1202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
2202107-1206namenameMaleBlack or African AmericanNaNNaNNoNaNNaNNaNReimage 21-22
3202108-5167namenameMaleAsianNaNSuccessfully CompletedNoNaNNoNaNTech Louisville 21-22
4202108-5171namenameMaleBlack or African AmericanNaNNaNNaNNaNNaNNaNTech Louisville 21-22
\n", + "
" + ], + "text/plain": [ + " Auto Id First Name Last Name Gender Race \\\n", + "0 202107-1206 name name Male Black or African American \n", + "1 202107-1206 name name Male Black or African American \n", + "2 202107-1206 name name Male Black or African American \n", + "3 202108-5167 name name Male Asian \n", + "4 202108-5171 name name Male Black or African American \n", + "\n", + " Ethnicity Hispanic/Latino Outcome Veteran Ex-Offender \\\n", + "0 NaN NaN No NaN \n", + "1 NaN NaN No NaN \n", + "2 NaN NaN No NaN \n", + "3 NaN Successfully Completed No NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " Justice Involved Single Parent Program: Program Name \n", + "0 NaN NaN Reimage 21-22 \n", + "1 NaN NaN Reimage 21-22 \n", + "2 NaN NaN Reimage 21-22 \n", + "3 No NaN Tech Louisville 21-22 \n", + "4 NaN NaN Tech Louisville 21-22 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = load_data_folder()\n", + "all_demo = df['All_demographics_and_programs']\n", + "all_demo.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2735cb8e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "c92c9717", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def load_data_folder(\n", + " folder_path: Union[str, os.PathLike] = \"../../data\"\n", + ") -> Dict[str, pd.DataFrame]:\n", + " \"\"\"\n", + " Load all CSV/XLS/XLSX files in a folder into pandas DataFrames.\n", + "\n", + " Parameters\n", + " ----------\n", + " folder_path : str | os.PathLike, optional\n", + " Path to the folder containing the files. Defaults to \"../../data\".\n", + "\n", + " Returns\n", + " -------\n", + " Dict[str, pandas.DataFrame]\n", + " A mapping from the file's stem (filename without extension) to its\n", + " loaded DataFrame. For example, \"employees.csv\" -> key \"employees\".\n", + "\n", + " Raises\n", + " ------\n", + " FileNotFoundError\n", + " If `folder_path` does not exist.\n", + " PermissionError\n", + " If the folder or files cannot be accessed due to permissions.\n", + " pd.errors.EmptyDataError\n", + " If a CSV file is empty and cannot be parsed.\n", + "\n", + " Notes\n", + " -----\n", + " - Supported extensions: .csv, .xls, .xlsx (case-insensitive).\n", + " - If both `name.csv` and `name.xlsx` exist, the later one encountered will\n", + " overwrite the earlier entry for key `name`.\n", + " \"\"\"\n", + " path = Path(folder_path)\n", + " if not path.exists():\n", + " raise FileNotFoundError(f\"Folder not found: {path.resolve()}\")\n", + "\n", + " dataframes: Dict[str, pd.DataFrame] = {}\n", + " for p in path.iterdir():\n", + " if not p.is_file():\n", + " continue\n", + "\n", + " ext = p.suffix.lower()\n", + " if ext == \".csv\":\n", + " df = pd.read_csv(p)\n", + " elif ext in {\".xlsx\", \".xls\"}:\n", + " df = pd.read_excel(p)\n", + " else:\n", + " continue\n", + "\n", + " dataframes[p.stem] = df\n", + "\n", + " return dataframes\n", + "\n", + "dfs = load_data_folder()\n", + "dfs.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60e75468", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, { "cell_type": "markdown", "id": "fe6f5506", @@ -268,7 +540,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.0" + "version": "3.12.2" } }, "nbformat": 4,