Skip to content

Commit c9d5c1c

Browse files
authored
false positive risk (#83)
1 parent d2aab3e commit c9d5c1c

File tree

12 files changed

+235
-81
lines changed

12 files changed

+235
-81
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[tool]
22
[tool.poetry]
33
name = "ep-stats"
4-
version = "2.4.0"
4+
version = "2.5.0"
55
homepage = "https://github.com/avast/ep-stats"
66
description = "Statistical package to evaluate ab tests in experimentation platform."
77
authors = [

src/epstats/server/req.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,13 @@ class Experiment(BaseModel):
239239
description="""List of filtering conditions to apply on exposure and goals.""",
240240
)
241241

242+
null_hypothesis_rate: Optional[float] = Field(
243+
None,
244+
title="Null hypothesis rate",
245+
description="""Global null hypothesis rate of the experimentation program. It is defined as the
246+
proportion of all tests in an experimentation program that have not improved or degraded the primary metric.""",
247+
)
248+
242249
query_parameters: dict = Field(
243250
{},
244251
title="Custom query parameters used in the data access.",
@@ -334,6 +341,7 @@ def to_experiment(self):
334341
unit_type=self.unit_type,
335342
variants=self.variants,
336343
filters=[f.to_filter() for f in self.filters] if self.filters else [],
344+
null_hypothesis_rate=self.null_hypothesis_rate,
337345
query_parameters=self.query_parameters,
338346
)
339347

src/epstats/server/res.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ class MetricStat(BaseModel):
7272
title="Power",
7373
description="Test power based on the collected `sample_size`.",
7474
)
75+
false_positive_risk: Optional[float] = Field(
76+
None, title="False positive risk.", description="False positive risk of a statistically significant result."
77+
)
7578

7679
@staticmethod
7780
def from_df(df: pd.DataFrame):
@@ -88,6 +91,7 @@ def from_df(df: pd.DataFrame):
8891
sample_size=r["sample_size"],
8992
required_sample_size=r["required_sample_size"],
9093
power=r["power"],
94+
false_positive_risk=r["false_positive_risk"],
9195
)
9296
for i, r in df.iterrows()
9397
]

src/epstats/toolkit/experiment.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ def metric_columns(cls) -> List[str]:
5454
1. `sample_size` - current sample size
5555
1. `required_sample_size` - size of the sample required to reach the required power
5656
1. `power` - power based on the collected `sample_size`
57+
1. `false_positive_risk` - false positive risk of a significant metric
5758
"""
5859
return [
5960
"timestamp",
@@ -76,6 +77,7 @@ def metric_columns(cls) -> List[str]:
7677
"sample_size",
7778
"required_sample_size",
7879
"power",
80+
"false_positive_risk",
7981
]
8082

8183
@classmethod
@@ -156,6 +158,7 @@ def __init__(
156158
confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
157159
variants: Optional[List[str]] = None,
158160
filters: Optional[List[Filter]] = None,
161+
null_hypothesis_rate: Optional[float] = None,
159162
query_parameters: dict = {},
160163
):
161164
self._logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
@@ -189,6 +192,7 @@ def __init__(
189192
self._update_dimension_to_value()
190193
self.filters = filters if filters is not None else []
191194
self.query_parameters = query_parameters
195+
self.null_hypothesis_rate = null_hypothesis_rate
192196

193197
def _check_metric_ids_unique(self):
194198
"""
@@ -765,6 +769,22 @@ def _get_power_from_required_sample_sizes(self, metrics: pd.DataFrame, n_variant
765769
axis=1,
766770
)
767771

772+
def _get_false_positive_risk(self, metric_row: pd.Series) -> float:
773+
if self.null_hypothesis_rate is None:
774+
return np.nan
775+
776+
if metric_row["p_value"] >= metric_row["confidence_level"]:
777+
return np.nan
778+
779+
return Statistics.false_positive_risk(
780+
null_hypothesis_rate=self.null_hypothesis_rate,
781+
power=metric_row["power"],
782+
p_value=metric_row["p_value"],
783+
)
784+
785+
def _get_false_positive_risks(self, metrics: pd.DataFrame) -> pd.Series:
786+
return metrics.apply(self._get_false_positive_risk, axis=1)
787+
768788
def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
769789
if not self.metrics:
770790
return pd.DataFrame([], columns=Evaluation.metric_columns())
@@ -822,4 +842,5 @@ def _evaluate_metrics(self, goals: pd.DataFrame, column_fce) -> pd.DataFrame:
822842
c["timestamp"] = round(get_utc_timestamp(datetime.now()).timestamp())
823843
c[["minimum_effect", "sample_size", "required_sample_size"]] = self._get_required_sample_sizes(c, n_variants)
824844
c["power"] = self._get_power_from_required_sample_sizes(c, n_variants)
845+
c["false_positive_risk"] = self._get_false_positive_risks(c)
825846
return c[Evaluation.metric_columns()]

src/epstats/toolkit/statistics.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,3 +432,36 @@ def power_from_required_sample_size_per_variant(
432432
np.sqrt(required_sample_size_ratio) * (st.norm.ppf(1 - alpha / 2) + st.norm.ppf(required_power))
433433
- st.norm.ppf(1 - alpha / 2)
434434
)
435+
436+
@staticmethod
437+
def false_positive_risk(
438+
null_hypothesis_rate: float,
439+
power: float,
440+
p_value: float,
441+
) -> float:
442+
"""
443+
Computes false positive risk defined as:
444+
445+
$$
446+
P(H_0|S) = \\frac{P(S|H_0)P(H_0)}{P(S)} = \\frac{\\alpha\\pi}{\\alpha\\pi + (1 - \\beta)(1 - \\pi)}
447+
$$
448+
449+
where $S$ is a statistically significant outcome, $H_0$ is the null hypothesis, $1 - \\beta$
450+
is the power of a test, and $\\pi$ is the global null hypothesis rate defined as the proportion
451+
of all tests in an experimentation program that have not improved or degraded the primary metric.
452+
453+
False positive risk $P(H_0|S)$ is not the same as the false positive rate $P(S|H_0) = \\alpha$.
454+
455+
More information can be found in the paper: https://bit.ly/ABTestingIntuitionBusters.
456+
457+
Arguments:
458+
null_hypothesis_rate: global null hypothesis rate of the experimentation program
459+
power: power achieved in the test
460+
p_value: p-value achieved in the test
461+
462+
Returns:
463+
false positive risk
464+
"""
465+
466+
pi = null_hypothesis_rate
467+
return (p_value * pi) / (p_value * pi + power * (1 - pi))

src/epstats/toolkit/testing/resources/evaluations_exposures.csv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ test-multi-check c test_unit_type global exposure 5200
6464
test-conversion-with-minimum-effect a test_unit_type global exposure 21
6565
test-conversion-with-minimum-effect b test_unit_type global exposure 26
6666
test-conversion-with-minimum-effect c test_unit_type global exposure 30
67+
test-false-positive-risk a test_unit_type global exposure 1000
68+
test-false-positive-risk b test_unit_type global exposure 1001
6769
test-dim-operators a test_unit_type global exposure 1000
6870
test-dim-operators b test_unit_type global exposure 1001
6971
test-operator-precedence a test_unit_type global exposure 80

0 commit comments

Comments
 (0)