Code
library(tidyverse)
library(dplyr)
library(here)
library(scales)
library(pwr) # Power analysis
library(effsize) # Effect sizes (e.g. Cohen's d)
library(magrittr)
# For tables
library(gt)
library(gtsummary)
library(IRdisplay)

This report presents examples of statistical analysis for retention rate, intended to complement the Retention Rate specification document, which is only accessible to Wikimedia Foundation staff. The code used was adapted from Mikhail Popov's work on clickthrough rate,[1] given that both metrics share similar data characteristics.

[1] Popov: https://people.wikimedia.org/~bearloga/notes/clickthrough-rates/analysis/
library(tidyverse)
library(dplyr)
library(here)
library(scales)
library(pwr) # Power analysis
library(effsize) # Effect sizes (e.g. Cohen's d)
library(magrittr)
# For tables
library(gt)
library(gtsummary)
library(IRdisplay)
library(effsize)

Calculate the binomial proportion confidence interval with a 95% CI (α = 0.05).
# Load the example cohort data and compute the retention rate for a single
# cohort, with a 95% binomial-proportion confidence interval
# (normal approximation / Wald interval).
df <- read.csv('Data_demo/example_1.csv')

retention_rate <- df %>%
  filter(cohort_date == '2024-07-01') %>%
  group_by(cohort_date) %>%
  summarize(
    # Observed proportion of retained users in the cohort.
    p_hat = returned_users / cohort_users,
    n = cohort_users,
    # Standard deviation of a Bernoulli variable with probability p_hat.
    sd = sqrt(p_hat * (1 - p_hat)),
    # Wald 95% CI: p_hat +/- z_{0.975} * sqrt(p_hat * (1 - p_hat) / n),
    # formatted as a "lower-upper" percentage string.
    ci95 = (p_hat + c(-1, 1) * qnorm(1 - 0.05 / 2) * sd / sqrt(n)) %>%
      label_percent(accuracy = 0.01)() %>%
      paste(collapse = "-"),
    .groups = 'drop'
  )
# Render the single-cohort retention rate as a styled gt table.
tbl <- retention_rate %>%
  gt() %>%
  tab_header('Retention Rate') %>%
  fmt_number(columns = c('p_hat', 'sd'), decimals = 4) %>%
  cols_label(
    cohort_date = "Cohort",
    p_hat = "Proportion of retention",
    sd = "Standard deviation"
  ) %>%
  tab_options(table.font.size = px(14)) %>%
  cols_align(align = "center") %>%
  cols_width(n ~ px(90)) %>%
  opt_stylize()

display_html(as_raw_html(tbl))
# (Rendered output: gt table titled "Retention Rate".)
(Rendered table columns: Cohort | Proportion of retention | n | Standard deviation | ci95.)
The average of multiple cohorts is often used for baseline estimation and daily monitoring.
Calculate the 95% confidence interval (α = 0.05) of the sample mean below.
# Baseline estimate: the unweighted mean retention rate across all cohorts,
# with a 95% confidence interval (normal approximation) for the sample mean.
average_retention_rate <- df %>%
  mutate(
    retention_rate = returned_users / cohort_users,
  ) %>%
  summarize(
    sd = sd(retention_rate),
    x_bar = mean(retention_rate),
    n = n(),
    # 95% CI for the mean: x_bar +/- z_{0.975} * sd / sqrt(n),
    # formatted as a "lower-upper" percentage string.
    ci95 = (x_bar + c(-1, 1) * qnorm(1 - 0.05 / 2) * sd / sqrt(n)) %>%
      label_percent(accuracy = 0.01)() %>%
      paste(collapse = "-")
  )

# Render the baseline estimate as a styled gt table.
tbl <- (
  average_retention_rate %>%
    select('x_bar', 'n', 'sd', 'ci95') %>%
    gt() %>%
    tab_header('Average Retention Rate') %>%
    fmt_number(
      columns = c('x_bar', 'sd'), decimals = 4
    ) %>%
    cols_label(
      x_bar = "Average Retention Rate",
      sd = "Standard deviation"
    ) %>%
    tab_options(
      table.font.size = px(14)
    ) %>%
    cols_align(align = "center") %>%
    cols_width(n ~ px(100)) %>%
    opt_stylize()
)
display_html(as_raw_html(tbl))
# (Rendered output: gt table titled "Average Retention Rate".)
(Rendered table columns: Average Retention Rate | n | Standard deviation | ci95.)
For retention rate, the effect size is usually small — often much smaller than the conventional 'small' effect size defined for Cohen's h.
# Load the experiment example (one row per experiment arm) and compute each
# arm's observed retention rate.
df_experiment <- read.csv('Data_demo/example_2.csv')

df_experiment <- df_experiment %>%
  mutate(
    # Fixed typo: this column was previously spelled `retentaion_rate`.
    retention_rate = returned_users / cohort_users
  )

# NOTE(review): this re-renders the baseline "Average Retention Rate" table
# from the previous section; it looks like a copy-paste duplicate, but is
# preserved so the rendered output is unchanged.
tbl <- (
  average_retention_rate %>%
    select('x_bar', 'n', 'sd', 'ci95') %>%
    gt() %>%
    tab_header('Average Retention Rate') %>%
    fmt_number(
      columns = c('x_bar', 'sd'), decimals = 4
    ) %>%
    cols_label(
      x_bar = "Average Retention Rate",
      sd = "Standard deviation"
    ) %>%
    tab_options(
      table.font.size = px(14)
    ) %>%
    cols_align(align = "center") %>%
    cols_width(n ~ px(100)) %>%
    opt_stylize()
)
display_html(as_raw_html(tbl))

# Observed retention proportions named by experiment arm, so pairs can be
# looked up by arm name below.
p_hats <- df_experiment$retention_rate |>
  set_names(df_experiment$experiment_assignment)

# Cohen's h (absolute value) for each pairwise comparison of arms.
tbl <- (
  list(
    `Control vs Treatment1` = abs(ES.h(p_hats["Control"], p_hats["Treatment1"])),
    `Control vs Treatment2` = abs(ES.h(p_hats["Control"], p_hats["Treatment2"])),
    `Treatment1 vs Treatment2` = abs(ES.h(p_hats["Treatment1"], p_hats["Treatment2"]))
  ) %>%
    as_tibble() %>%
    gt() %>%
    tab_header("Cohen's h") %>%
    opt_stylize()
)
display_html(as_raw_html(tbl))
# (Rendered output: gt table titled "Cohen's h".)
(Rendered table columns: Control vs Treatment1 | Control vs Treatment2 | Treatment1 vs Treatment2.)
# Sample-size estimate using the smallest Cohen's h from the pairwise table
# above (presumably Control vs Treatment1 — confirm against the table).
# One-sided two-proportion power analysis; sig.level defaults to 0.05.
pwr.2p.test(
  h = 0.009698374,
  power = 0.8,
  alternative = "greater"
)
Difference of proportion power calculation for binomial distribution (arcsine transformation)
h = 0.009698374
n = 131462
sig.level = 0.05
power = 0.8
alternative = greater
NOTE: same sample sizes
If we use the smallest Cohen's h (0.009698374) from the above example to estimate the sample size, we need at least 131,462 units in each group to achieve 80% power (at a 5% significance level).
We explored effect size scenarios of 2 pp (percentage point), 1 pp, and 0.5 pp below.
# Required n per group to detect h = 0.02 with 80% power in a one-sided
# two-proportion test (sig.level defaults to 0.05). NOTE(review): the
# surrounding prose maps these h values to percentage-point differences;
# confirm the conversion for the baseline rate in use.
pwr.2p.test(
  h = 0.02,
  power = 0.8,
  alternative = "greater"
)
Difference of proportion power calculation for binomial distribution (arcsine transformation)
h = 0.02
n = 30912.79
sig.level = 0.05
power = 0.8
alternative = greater
NOTE: same sample sizes
# Required n per group to detect h = 0.01 with 80% power in a one-sided
# two-proportion test (sig.level defaults to 0.05).
pwr.2p.test(
  h = 0.01,
  power = 0.8,
  alternative = "greater"
)
Difference of proportion power calculation for binomial distribution (arcsine transformation)
h = 0.01
n = 123651.1
sig.level = 0.05
power = 0.8
alternative = greater
NOTE: same sample sizes
# Required n per group to detect h = 0.005 with 80% power in a one-sided
# two-proportion test (sig.level defaults to 0.05).
pwr.2p.test(
  h = 0.005,
  power = 0.8,
  alternative = "greater"
)
Difference of proportion power calculation for binomial distribution (arcsine transformation)
h = 0.005
n = 494604.6
sig.level = 0.05
power = 0.8
alternative = greater
NOTE: same sample sizes
# Summarize the effect-size scenarios and their required per-group sample
# sizes (rounded up from the power analyses above) in a gt table.
effect_size <- c(0.02, 0.01, 0.005)
sample_size <- c(30913, 123652, 494605)
df_size <- data.frame(effect_size, sample_size)

tbl <- (
  df_size %>%
    gt() %>%
    tab_header('Summary of Effect Sizes and Sample Sizes') %>%
    cols_label(
      effect_size = "Effect size (h)",
      sample_size = "Sample size"
    ) %>%
    tab_options(
      table.font.size = px(14)
    ) %>%
    cols_align(align = "center") %>%
    opt_stylize()
)
display_html(as_raw_html(tbl))
# (Rendered output: gt table titled "Summary of Effect Sizes and Sample Sizes".)
(Rendered table columns: Effect size (h) | Sample size.)
df_expriment <- read.csv('Data_demo/example_2.csv')

First, we test whether at least one group differs from the others.
# Omnibus k-sample test for equality of proportions (chi-squared):
# H0: the retention rate is the same in all three experiment arms.
# (Note: `df_expriment` is a misspelled but consistently used name.)
prop.test(
  x = df_expriment$returned_users,
  n = df_expriment$cohort_users
)
3-sample test for equality of proportions without continuity correction
data: df_expriment$returned_users out of df_expriment$cohort_users
X-squared = 410.8, df = 2, p-value < 2.2e-16
alternative hypothesis: two.sided
sample estimates:
prop 1 prop 2 prop 3
0.4649941 0.4698330 0.4800379
We reject the null hypothesis: at least one of the groups differs from the others.
Next, compare each pair of the three groups.
# Attach the arm names to the count vectors so the pairwise output rows
# and columns are labelled by experiment arm.
returned_users <- setNames(
  df_expriment$returned_users,
  df_expriment$experiment_assignment
)
cohort_users <- setNames(
  df_expriment$cohort_users,
  df_expriment$experiment_assignment
)

# Every pairwise two-proportion comparison, with Bonferroni-adjusted
# p-values to control the family-wise error rate.
pairwise.prop.test(
  x = returned_users,
  n = cohort_users,
  p.adjust.method = "bonferroni"
)
Pairwise comparisons using Pairwise comparison of proportions
data: returned_users out of cohort_users
Control Treatment1
Treatment1 4.7e-10 -
Treatment2 < 2e-16 < 2e-16
P value adjustment method: bonferroni