 import pandas as pd
 import statsmodels.api as sm

-from media_impact_monitor.types_ import Aggregation
+from collections import Counter
+from itertools import chain


 def add_lags(df: pd.DataFrame, lags: list[int]):
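The body of `add_lags` is not part of this diff; only its signature appears as context. A minimal sketch of a function with this signature, consistent with how it is called below (appending shifted copies of each column as extra regressors) — hypothetical, not the code from this commit:

def add_lags(df: pd.DataFrame, lags: list[int]) -> pd.DataFrame:
    # Hypothetical sketch: one extra column per (original column, lag),
    # holding the value from `lag` rows earlier.
    lagged = {
        f"{col}_lag{lag}": df[col].shift(lag)
        for col in df.columns
        for lag in lags
    }
    return pd.concat([df, pd.DataFrame(lagged, index=df.index)], axis=1)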
@@ -39,61 +40,77 @@ def regress(
 ):
     """Get regression result where the outcome is `day` days after the treatment."""
     lags = range(1, lags + 1)
-    media_df = pd.DataFrame(media_df, columns=["count"])
-    # protest_df = add_lags(protest_df, lags=[])
-    media_df = add_lags(media_df, lags=[4, 5, 6, 7, 8])
+    outcomes = media_df.columns
+
+    placebo = False
+    if placebo:
+        protest_df = protest_df.copy().sample(frac=1)
+    results = {}
+    protest_df = add_lags(protest_df, lags=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
+    media_df = add_lags(media_df, lags=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
     # protest_df = add_emws(protest_df)
     # media_df = add_emws(media_df, spans=[14])
     df = pd.concat([protest_df, media_df], axis=1)
     df = add_weekday_dummies(df)
-    treatment = "protest"
-    outcome = "count"
-    df[outcome] = df[outcome].shift(-day)
-    if cumulative:
-        # TODO write tests for this
-        if day < 0:
-            indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=-day)
-            df[outcome] = -df[outcome].rolling(window=indexer).sum()
-        else:
-            df[outcome] = df[outcome].rolling(day + 1).sum()
-    df = df.dropna()
-    placebo = False
-    if placebo:
-        df[treatment] = df.sample(frac=1)[treatment].to_list()
-    X = df.drop(columns=[outcome])
-    y = df[outcome]
-    model = sm.OLS(y, sm.add_constant(X))
-    model = model.fit(cov_type="HC3")
-    alpha = 0.1
-    return {
-        "date": day,
-        "mean": model.params[treatment],
-        "p_value": model.pvalues[treatment],
-        "ci_lower": model.conf_int(alpha=alpha)[0][treatment],
-        "ci_upper": model.conf_int(alpha=alpha)[1][treatment],
-    }
+    for outcome in outcomes:
+        df[outcome] = df[outcome].shift(-day)
+        if cumulative:
+            # TODO write tests for this
+            if day < 0:
+                indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=-day)
+                df[outcome] = -df[outcome].rolling(window=indexer).sum()
+            else:
+                df[outcome] = df[outcome].rolling(day + 1).sum()
+        df = df.dropna()
+        X = df.drop(columns=[outcome])
+        y = df[outcome]
+        model = sm.OLS(y, sm.add_constant(X))
+        model = model.fit(cov_type="HC3")
+        alpha = 0.1
+        results[outcome] = {}
+        for org in protest_df.columns:
+            results[outcome][org] = {
+                "date": day,
+                "mean": model.params[org],
+                "p_value": model.pvalues[org],
+                "ci_lower": model.conf_int(alpha=alpha)[0][org],
+                "ci_upper": model.conf_int(alpha=alpha)[1][org],
+            }
+    return results

-def agg_protests(df: pd.DataFrame):
-    start = df["date"].min()  # HACK
-    end = df["date"].max()  # HACK
-    df = df.groupby("date")["date"].count().to_frame("protest")
+def agg_protests(events: pd.DataFrame):
+    start = events["date"].min()  # HACK
+    end = events["date"].max()  # HACK
+    # we want to reduce the number of organizers for our simple regression
+    # (when using multilevel models later, we can relax this)
+    # only use the primary organizer of each event
+    primary_orgs = events["organizers"].apply(lambda x: x[0] if x else None)
+    # only keep the most frequent organizers
+    orgs = [k for k, v in Counter(primary_orgs).items() if k and v > 10]
+    # create dummy columns for each organizer
+    orgs_df = pd.DataFrame({org: primary_orgs == org for org in orgs})
+    orgs_df["date"] = events["date"]
+    orgs_df = orgs_df.set_index("date")
+    # create a time series by counting protests per day for each organizer
+    ts = orgs_df.groupby("date").agg({org: "any" for org in orgs}).astype(int)
     idx = pd.date_range(start=start, end=end)
-    df = df.reindex(idx).fillna(0)
-    return df
+    ts = ts.reindex(idx).fillna(0)
+    return ts


 def estimate_impact(
     events: pd.DataFrame,
     article_counts: pd.Series,
-    aggregation: Aggregation,
     cumulative: bool = True,
     lags: int = 14,
     outcome_days: list[int] = range(-14, 14),
 ):
     protest_df = agg_protests(events)
-    n = protest_df["protest"].sum()
-    limitations = [f"There have only been {n} protests."]
+    return regress(
+        protest_df, article_counts, day=7, lags=lags, cumulative=cumulative
+    ), []
+    # TODO: do this per day:
     impacts = pd.DataFrame(
         [
             regress(
@@ -103,4 +120,5 @@ def estimate_impact(
         ]
     )
     impacts = impacts.set_index("date")[["mean", "ci_lower", "ci_upper", "p_value"]]
+    limitations = []
     return impacts, limitations
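With this commit, `regress` returns a nested mapping (outcome column → organizer → statistics) instead of a single flat record, and `estimate_impact` currently returns early with that mapping for `day=7` only. A rough usage sketch for flattening the result into a tidy frame — illustrative only, assuming `events` and `article_counts` are already loaded:

# Illustrative usage, not part of this commit.
results, limitations = estimate_impact(events, article_counts)
rows = [
    {"outcome": outcome, "organizer": org, **stats}
    for outcome, per_org in results.items()
    for org, stats in per_org.items()
]
impacts_df = pd.DataFrame(rows).set_index(["outcome", "organizer"])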