Code
library(tidyverse)
library(dplyr)
library(here)
library(scales)
library(pwr) # Power analysis
library(effsize) # Effect sizes (e.g. Cohen's d)
library(magrittr)
# For tables
library(gt)
library(gtsummary)
library(IRdisplay)

This report presents examples of statistical analysis for retention rate, intended to complement the Retention Rate specification document, which is only accessible to Wikimedia Foundation staff. The code used was adapted from Mikhail Popov's work on clickthrough rate,[1] given that both metrics share similar data characteristics.

[1] Popov: https://people.wikimedia.org/~bearloga/notes/clickthrough-rates/analysis/
library(tidyverse)
library(dplyr)
library(here)
library(scales)
library(pwr) # Power analysis
library(effsize) # Effect sizes (e.g. Cohen's d)
library(magrittr)
# For tables
library(gt)
library(gtsummary)
library(IRdisplay)
library(effsize)

Calculate the binomial proportion confidence interval with a 95% CI (α = 0.05).
# Load the example cohort data and compute the retention rate for a single
# cohort, with a 95% binomial-proportion confidence interval
# (normal approximation / Wald interval).
df <- read.csv('Data_demo/example_1.csv')

retention_rate <- df %>%
  filter(cohort_date == '2024-07-01') %>%
  group_by(cohort_date) %>%
  summarize(
    # Observed proportion of retained users in the cohort.
    p_hat = returned_users / cohort_users,
    n = cohort_users,
    # Standard deviation of a Bernoulli variable with probability p_hat.
    sd = sqrt(p_hat * (1 - p_hat)),
    # Wald 95% CI: p_hat +/- z_{0.975} * sqrt(p_hat * (1 - p_hat) / n),
    # formatted as a "lower-upper" percentage string.
    ci95 = (p_hat + c(-1, 1) * qnorm(1 - 0.05 / 2) * sd / sqrt(n)) %>%
      label_percent(accuracy = 0.01)() %>%
      paste(collapse = "-"),
    .groups = 'drop'
  )
# Render the single-cohort retention rate as a styled gt table.
tbl <- retention_rate %>%
  gt() %>%
  tab_header('Retention Rate') %>%
  fmt_number(columns = c('p_hat', 'sd'), decimals = 4) %>%
  cols_label(
    cohort_date = "Cohort",
    p_hat = "Proportion of retention",
    sd = "Standard deviation"
  ) %>%
  tab_options(table.font.size = px(14)) %>%
  cols_align(align = "center") %>%
  cols_width(n ~ px(90)) %>%
  opt_stylize()

display_html(as_raw_html(tbl))
# (Rendered output: gt table titled "Retention Rate".)
(Rendered table columns: Cohort | Proportion of retention | n | Standard deviation | ci95.)
The average of multiple cohorts is often used for baseline estimation and daily monitoring.
Calculate the 95% confidence interval (α = 0.05) of the sample mean below.
# Baseline estimate: the unweighted mean retention rate across all cohorts,
# with a 95% confidence interval (normal approximation) for the sample mean.
average_retention_rate <- df %>%
  mutate(
    retention_rate = returned_users / cohort_users,
  ) %>%
  summarize(
    sd = sd(retention_rate),
    x_bar = mean(retention_rate),
    n = n(),
    # 95% CI for the mean: x_bar +/- z_{0.975} * sd / sqrt(n),
    # formatted as a "lower-upper" percentage string.
    ci95 = (x_bar + c(-1, 1) * qnorm(1 - 0.05 / 2) * sd / sqrt(n)) %>%
      label_percent(accuracy = 0.01)() %>%
      paste(collapse = "-")
  )

# Render the baseline estimate as a styled gt table.
tbl <- (
  average_retention_rate %>%
    select('x_bar', 'n', 'sd', 'ci95') %>%
    gt() %>%
    tab_header('Average Retention Rate') %>%
    fmt_number(
      columns = c('x_bar', 'sd'), decimals = 4
    ) %>%
    cols_label(
      x_bar = "Average Retention Rate",
      sd = "Standard deviation"
    ) %>%
    tab_options(
      table.font.size = px(14)
    ) %>%
    cols_align(align = "center") %>%
    cols_width(n ~ px(100)) %>%
    opt_stylize()
)
display_html(as_raw_html(tbl))
# (Rendered output: gt table titled "Average Retention Rate".)
(Rendered table columns: Average Retention Rate | n | Standard deviation | ci95.)
For retention rate, the effect size is usually small — often much smaller than the conventional 'small' effect size defined for Cohen's h.
# Load the experiment example (one row per experiment arm) and compute each
# arm's observed retention rate.
df_experiment <- read.csv('Data_demo/example_2.csv')

df_experiment <- df_experiment %>%
  mutate(
    # Fixed typo: this column was previously spelled `retentaion_rate`.
    retention_rate = returned_users / cohort_users
  )

# NOTE(review): this re-renders the baseline "Average Retention Rate" table
# from the previous section; it looks like a copy-paste duplicate, but is
# preserved so the rendered output is unchanged.
tbl <- (
  average_retention_rate %>%
    select('x_bar', 'n', 'sd', 'ci95') %>%
    gt() %>%
    tab_header('Average Retention Rate') %>%
    fmt_number(
      columns = c('x_bar', 'sd'), decimals = 4
    ) %>%
    cols_label(
      x_bar = "Average Retention Rate",
      sd = "Standard deviation"
    ) %>%
    tab_options(
      table.font.size = px(14)
    ) %>%
    cols_align(align = "center") %>%
    cols_width(n ~ px(100)) %>%
    opt_stylize()
)
display_html(as_raw_html(tbl))

# Observed retention proportions named by experiment arm, so pairs can be
# looked up by arm name below.
p_hats <- df_experiment$retention_rate |>
  set_names(df_experiment$experiment_assignment)

# Cohen's h (absolute value) for each pairwise comparison of arms.
tbl <- (
  list(
    `Control vs Treatment1` = abs(ES.h(p_hats["Control"], p_hats["Treatment1"])),
    `Control vs Treatment2` = abs(ES.h(p_hats["Control"], p_hats["Treatment2"])),
    `Treatment1 vs Treatment2` = abs(ES.h(p_hats["Treatment1"], p_hats["Treatment2"]))
  ) %>%
    as_tibble() %>%
    gt() %>%
    tab_header("Cohen's h") %>%
    opt_stylize()
)
display_html(as_raw_html(tbl))
# (Rendered output: gt table titled "Cohen's h".)
(Rendered table columns: Control vs Treatment1 | Control vs Treatment2 | Treatment1 vs Treatment2.)
# Sample-size estimate using the smallest Cohen's h from the pairwise table
# above (presumably Control vs Treatment1 — confirm against the table).
# One-sided two-proportion power analysis; sig.level defaults to 0.05.
pwr.2p.test(
  h = 0.009698374,
  power = 0.8,
  alternative = "greater"
)
Difference of proportion power calculation for binomial distribution (arcsine transformation)
h = 0.009698374
n = 131462
sig.level = 0.05
power = 0.8
alternative = greater
NOTE: same sample sizes
If we use the smallest Cohen's h (0.009698374) from the above example to estimate the sample size, we need at least 131,462 units in each group to achieve 80% power (at a 5% significance level).
We explored effect size scenarios of 2 pp (percentage point), 1 pp, and 0.5 pp below.
# Required n per group to detect h = 0.02 with 80% power in a one-sided
# two-proportion test (sig.level defaults to 0.05). NOTE(review): the
# surrounding prose maps these h values to percentage-point differences;
# confirm the conversion for the baseline rate in use.
pwr.2p.test(
  h = 0.02,
  power = 0.8,
  alternative = "greater"
)
Difference of proportion power calculation for binomial distribution (arcsine transformation)
h = 0.02
n = 30912.79
sig.level = 0.05
power = 0.8
alternative = greater
NOTE: same sample sizes
# Required n per group to detect h = 0.01 with 80% power in a one-sided
# two-proportion test (sig.level defaults to 0.05).
pwr.2p.test(
  h = 0.01,
  power = 0.8,
  alternative = "greater"
)
Difference of proportion power calculation for binomial distribution (arcsine transformation)
h = 0.01
n = 123651.1
sig.level = 0.05
power = 0.8
alternative = greater
NOTE: same sample sizes
# Required n per group to detect h = 0.005 with 80% power in a one-sided
# two-proportion test (sig.level defaults to 0.05).
pwr.2p.test(
  h = 0.005,
  power = 0.8,
  alternative = "greater"
)
Difference of proportion power calculation for binomial distribution (arcsine transformation)
h = 0.005
n = 494604.6
sig.level = 0.05
power = 0.8
alternative = greater
NOTE: same sample sizes
# Summarize the effect-size scenarios and their required per-group sample
# sizes (rounded up from the power analyses above) in a gt table.
effect_size <- c(0.02, 0.01, 0.005)
sample_size <- c(30913, 123652, 494605)
df_size <- data.frame(effect_size, sample_size)

tbl <- (
  df_size %>%
    gt() %>%
    tab_header('Summary of Effect Sizes and Sample Sizes') %>%
    cols_label(
      effect_size = "Effect size (h)",
      sample_size = "Sample size"
    ) %>%
    tab_options(
      table.font.size = px(14)
    ) %>%
    cols_align(align = "center") %>%
    opt_stylize()
)
display_html(as_raw_html(tbl))
# (Rendered output: gt table titled "Summary of Effect Sizes and Sample Sizes".)
(Rendered table columns: Effect size (h) | Sample size.)
df_expriment <- read.csv('Data_demo/example_2.csv')

First, we test whether at least one group differs from the others.
# Omnibus k-sample test for equality of proportions (chi-squared):
# H0: the retention rate is the same in all three experiment arms.
# (Note: `df_expriment` is a misspelled but consistently used name.)
prop.test(
  x = df_expriment$returned_users,
  n = df_expriment$cohort_users
)
3-sample test for equality of proportions without continuity correction
data: df_expriment$returned_users out of df_expriment$cohort_users
X-squared = 410.8, df = 2, p-value < 2.2e-16
alternative hypothesis: two.sided
sample estimates:
prop 1 prop 2 prop 3
0.4649941 0.4698330 0.4800379
We reject the null hypothesis: at least one of the groups differs from the others.
Next, compare each pair of the three groups.
# Attach the arm names to the count vectors so the pairwise output rows
# and columns are labelled by experiment arm.
returned_users <- setNames(
  df_expriment$returned_users,
  df_expriment$experiment_assignment
)
cohort_users <- setNames(
  df_expriment$cohort_users,
  df_expriment$experiment_assignment
)

# Every pairwise two-proportion comparison, with Bonferroni-adjusted
# p-values to control the family-wise error rate.
pairwise.prop.test(
  x = returned_users,
  n = cohort_users,
  p.adjust.method = "bonferroni"
)
Pairwise comparisons using Pairwise comparison of proportions
data: returned_users out of cohort_users
Control Treatment1
Treatment1 4.7e-10 -
Treatment2 < 2e-16 < 2e-16
P value adjustment method: bonferroni