Code
library(tidyverse)
library(dplyr)
library(here)
library(scales)
library(pwr) # Power analysis
library(effsize) # Effect sizes (e.g. Cohen's d)
library(magrittr)
# For tables
library(gt)
library(gtsummary)
library(IRdisplay)
This report presents examples of statistical analysis for retention rate, intended to complement the Retention Rate specification document, which is only accessible by Wikimedia Foundation staff. The code used was adapted from Mikhail Popov’s work on clickthrough rate,1 given that both metrics share similar data characteristics.
1 Popov https://people.wikimedia.org/~bearloga/notes/clickthrough-rates/analysis/
library(tidyverse)
library(dplyr)
library(here)
library(scales)
library(pwr) # Power analysis
library(effsize) # Effect sizes (e.g. Cohen's d)
library(magrittr)
# For tables
library(gt)
library(gtsummary)
library(IRdisplay)
library(effsize)
Calculate Binomial proportion confidence interval with a 95% CI (α = 0.05).
# Load the example cohort data and compute the retention rate for a single
# cohort, with a normal-approximation (Wald) 95% confidence interval.
df <- read.csv('Data_demo/example_1.csv')

retention_rate <- df %>%
  filter(cohort_date == '2024-07-01') %>%
  group_by(cohort_date) %>%
  summarize(
    p_hat = returned_users / cohort_users,  # point estimate of retention
    n = cohort_users,                       # cohort size
    sd = sqrt(p_hat * (1 - p_hat)),         # binomial standard deviation
    # Wald 95% CI: p_hat +/- z_{0.975} * sd / sqrt(n), formatted as a
    # "lower-upper" percentage string.
    ci95 = (p_hat + c(-1, 1) * qnorm(1 - 0.05/2) * sd / sqrt(n)) %>%
      label_percent(accuracy = 0.01)() %>%
      paste(collapse = "-"),
    .groups = 'drop'
  )
# Render the single-cohort retention summary as a formatted gt table
# and display it inline in the notebook.
tbl <- (
  retention_rate %>%
    gt() %>%
    tab_header('Retention Rate') %>%
    fmt_number(
      columns = c('p_hat', 'sd'), decimals = 4
    ) %>%
    cols_label(
      cohort_date = "Cohort",
      p_hat = "Proportion of retention",
      sd = "Standard deviation"
    ) %>%
    tab_options(
      table.font.size = px(14)
    ) %>%
    cols_align(
      align = "center") %>%
    cols_width(
      n ~ px(90)
    ) %>%
    opt_stylize()
)
display_html(as_raw_html(tbl))
Retention Rate | ||||
Cohort | Proportion of retention | n | Standard deviation | ci95 |
---|---|---|---|---|
The average of multiple cohorts is often used for baseline estimation and daily monitoring.
Calculate the 95% confidence interval (α = 0.05) of the sample mean below.
# Average retention rate across all cohorts in `df`, with a 95% CI for the
# sample mean (t-approximation via normal quantile, as in the original).
average_retention_rate <- df %>%
  mutate(
    retention_rate = returned_users / cohort_users
  ) %>%
  summarize(
    sd = sd(retention_rate),     # sample standard deviation across cohorts
    x_bar = mean(retention_rate),
    n = n(),                     # number of cohorts
    # 95% CI of the mean: x_bar +/- z_{0.975} * sd / sqrt(n), formatted as a
    # "lower-upper" percentage string.
    ci95 = (x_bar + c(-1, 1) * qnorm(1 - 0.05/2) * sd / sqrt(n)) %>%
      label_percent(accuracy = 0.01)() %>%
      paste(collapse = "-")
  )
# Render the cross-cohort average retention summary as a formatted gt table.
tbl <- (
  average_retention_rate %>%
    select('x_bar', 'n', 'sd', 'ci95') %>%
    gt() %>%
    tab_header('Average Retention Rate') %>%
    fmt_number(
      columns = c('x_bar', 'sd'), decimals = 4
    ) %>%
    cols_label(
      x_bar = "Average Retention Rate",
      sd = "Standard deviation"
    ) %>%
    tab_options(
      table.font.size = px(14)
    ) %>%
    cols_align(
      align = "center") %>%
    cols_width(
      n ~ px(100)
    ) %>%
    opt_stylize()
)
display_html(as_raw_html(tbl))
Average Retention Rate | |||
Average Retention Rate | n | Standard deviation | ci95 |
---|---|---|---|
For retention rate, the effect size is usually small — often much smaller than the conventional "small" benchmark for Cohen's h (h = 0.2).
# Load the A/B-test example data (one row per experiment arm) and derive the
# per-arm retention rate.
df_experiment <- read.csv('Data_demo/example_2.csv')

df_experiment <- df_experiment %>%
  mutate(
    # NOTE(review): "retentaion_rate" is a typo of "retention_rate"; kept
    # as-is because a later cell references this exact column name.
    retentaion_rate = returned_users / cohort_users
  )
# NOTE(review): this cell re-renders the same average-retention table as the
# earlier cell (an apparent copy-paste duplicate in the source notebook);
# behavior preserved.
tbl <- (
  average_retention_rate %>%
    select('x_bar', 'n', 'sd', 'ci95') %>%
    gt() %>%
    tab_header('Average Retention Rate') %>%
    fmt_number(
      columns = c('x_bar', 'sd'), decimals = 4
    ) %>%
    cols_label(
      x_bar = "Average Retention Rate",
      sd = "Standard deviation"
    ) %>%
    tab_options(
      table.font.size = px(14)
    ) %>%
    cols_align(
      align = "center") %>%
    cols_width(
      n ~ px(100)
    ) %>%
    opt_stylize()
)
display_html(as_raw_html(tbl))
# Pairwise Cohen's h effect sizes between the three experiment arms.
# ES.h() is pwr's Cohen's h for two proportions (arcsine transformation);
# abs() is taken because only the magnitude of the difference matters here.
p_hats <- df_experiment$retentaion_rate |>
  set_names(df_experiment$experiment_assignment)

tbl <- (
  list(
    `Control vs Treatment1` = abs(ES.h(p_hats["Control"], p_hats["Treatment1"])),
    `Control vs Treatment2` = abs(ES.h(p_hats["Control"], p_hats["Treatment2"])),
    `Treatment1 vs Treatment2` = abs(ES.h(p_hats["Treatment1"], p_hats["Treatment2"]))
  ) %>%
    as_tibble() %>%
    gt() %>%
    tab_header("Cohen's h") %>%
    opt_stylize()
)
display_html(as_raw_html(tbl))
Cohen's h | ||
Control vs Treatment1 | Control vs Treatment2 | Treatment1 vs Treatment2 |
---|---|---|
# Per-group sample size needed to detect the smallest observed Cohen's h
# (Control vs Treatment1) at 80% power; sig.level defaults to 0.05.
# "greater" = one-sided test for an increase in retention.
pwr.2p.test(
h = 0.009698374,
power = 0.8,
alternative = "greater"
)
Difference of proportion power calculation for binomial distribution (arcsine transformation)
h = 0.009698374
n = 131462
sig.level = 0.05
power = 0.8
alternative = greater
NOTE: same sample sizes
If we use the smallest Cohen’s h (0.009698374) from the example above to estimate the sample size, we need at least 131,462 units in each group to achieve 80% power (at a 5% significance level).
We explored effect size scenarios of 2 pp (percentage point), 1 pp, and 0.5 pp below.
# Scenario: effect size h = 0.02 (~2 pp difference at these baseline rates),
# 80% power, one-sided test, default 5% significance level.
pwr.2p.test(
h = 0.02,
power = 0.8,
alternative = "greater"
)
Difference of proportion power calculation for binomial distribution (arcsine transformation)
h = 0.02
n = 30912.79
sig.level = 0.05
power = 0.8
alternative = greater
NOTE: same sample sizes
# Scenario: effect size h = 0.01 (~1 pp difference), 80% power, one-sided
# test, default 5% significance level.
pwr.2p.test(
h = 0.01,
power = 0.8,
alternative = "greater"
)
Difference of proportion power calculation for binomial distribution (arcsine transformation)
h = 0.01
n = 123651.1
sig.level = 0.05
power = 0.8
alternative = greater
NOTE: same sample sizes
# Scenario: effect size h = 0.005 (~0.5 pp difference), 80% power, one-sided
# test, default 5% significance level.
pwr.2p.test(
h = 0.005,
power = 0.8,
alternative = "greater"
)
Difference of proportion power calculation for binomial distribution (arcsine transformation)
h = 0.005
n = 494604.6
sig.level = 0.05
power = 0.8
alternative = greater
NOTE: same sample sizes
# Summary of the three power-analysis scenarios above: Cohen's h effect size
# and the required per-group sample size (rounded up to whole units).
effect_size <- c(0.02, 0.01, 0.005)
sample_size <- c(30913, 123652, 494605)
df_size <- data.frame(effect_size, sample_size)
# Render the effect-size / sample-size summary as a formatted gt table.
tbl <- (
  df_size %>%
    gt() %>%
    tab_header('Summary of Effect Sizes and Sample Sizes') %>%
    cols_label(
      effect_size = "Effect size (h)",
      sample_size = "Sample size"
    ) %>%
    tab_options(
      table.font.size = px(14)
    ) %>%
    cols_align(
      align = "center") %>%
    opt_stylize()
)
display_html(as_raw_html(tbl))
Summary of Effect Sizes and Sample Sizes | |
Effect size (h) | Sample size |
---|---|
# Reload the experiment data for the hypothesis tests below.
# NOTE(review): "df_expriment" is a typo of "df_experiment"; kept as-is
# because the test cells below reference this exact name.
df_expriment <- read.csv('Data_demo/example_2.csv')
First, we test whether at least one group differs from the others.
# Omnibus chi-squared test of equal retention proportions across the three
# arms (H0: all group proportions are equal).
prop.test(
x = df_expriment$returned_users,
n = df_expriment$cohort_users
)
3-sample test for equality of proportions without continuity correction
data: df_expriment$returned_users out of df_expriment$cohort_users
X-squared = 410.8, df = 2, p-value < 2.2e-16
alternative hypothesis: two.sided
sample estimates:
prop 1 prop 2 prop 3
0.4649941 0.4698330 0.4800379
We reject the null hypothesis: at least one of the groups differs from the others.
Next, compare each pair of the three groups.
# Pairwise two-proportion tests between all arms, with Bonferroni correction
# for the three comparisons. Naming the vectors by arm makes the output
# matrix rows/columns readable.
returned_users <- df_expriment$returned_users %>%
  set_names(df_expriment$experiment_assignment)
cohort_users <- df_expriment$cohort_users %>%
  set_names(df_expriment$experiment_assignment)

pairwise.prop.test(
  x = returned_users,
  n = cohort_users,
  p.adjust.method = "bonferroni"
)
Pairwise comparisons using Pairwise comparison of proportions
data: returned_users out of cohort_users
Control Treatment1
Treatment1 4.7e-10 -
Treatment2 < 2e-16 < 2e-16
P value adjustment method: bonferroni