Making of Content Interactions Trend Chart

Mikhail Popov

Final Product

Data

library(here)
library(tidyverse)

metrics <- here("data/metrics.csv") |>
  read_csv(show_col_types = FALSE) |>
  janitor::clean_names() |>
  arrange(month) |>
  filter(month >= "2018-05-01")

current_month <- max(metrics$month)
month total_pageview pageview_multiplier total_pageview_corrected previews_seen interactions interactions_corrected
2018-05-01 16081901671 1 16081901671 1965116275 18047017946 18047017946
2018-11-01 15838868761 1 15838868761 1818341997 17657210758 17657210758
2019-02-01 14994451563 1 14994451563 1738551430 16733002993 16733002993
2019-10-01 16722401874 1 16722401874 1936366629 18658768503 18658768503
2020-04-01 19825763318 1 19825763318 2419940921 22245704239 22245704239
2020-05-01 19295670656 1 19295670656 2399548068 21695218724 21695218724
2021-04-01 17594157293 1 17594157293 2129806204 19723963497 19723963497
2021-05-01 17783793360 1 17783793360 2054246112 19838039472 19838039472
2022-02-01 16040523442 1 16040523442 1833566161 17874089603 17874089603
2022-05-01 16386936815 1 16386936815 1788004827 18174941642 18174941642
metrics_ext <- metrics |>
  mutate(
    dataloss = (pageview_multiplier > 1.0) |
      month %in% (
        metrics |>
          filter(pageview_multiplier > 1.0) |>
          pull(month) |>
          range() |>
          (\(x) x + months(c(-1, 1)))()
      ),
    period = case_when(
      month < "2021-06-01" ~ "before dataloss",
      month >= "2021-06-01" & month < "2022-02-01" ~ "during dataloss",
      month >= "2022-02-01" ~ "after dataloss"
    )
  )

dataloss is TRUE for months during the data loss and 1 month before/after the data loss

month other columns dataloss period
2018-07-01 ... FALSE before dataloss
2019-12-01 ... FALSE before dataloss
2020-06-01 ... FALSE before dataloss
2021-06-01 ... TRUE during dataloss
2021-08-01 ... TRUE during dataloss
2022-01-01 ... TRUE during dataloss
2022-02-01 ... TRUE after dataloss
2022-03-01 ... FALSE after dataloss
2022-06-01 ... FALSE after dataloss

Annotations

annotations <- metrics |>
  select(month) |>
  mutate(
    curr_offset = (month %in% (current_month - years(0:4))),
    prev_offset = (month %in% ((current_month - months(1)) - years(0:4)))
  ) |>
  filter(curr_offset | prev_offset) |>
  mutate(year = year(month)) |>
  inner_join(metrics, by = "month") |>
  group_by(year) |>
  summarize(
    from = interactions_corrected[prev_offset],
    to = interactions_corrected[curr_offset],
    # The change from 2021-06 to 2021-07 (for example):
    delta = to - from,
    direction = factor(delta > 0, c(TRUE, FALSE), c("up", "down"))
  ) |>
  mutate(
    month = ymd(sprintf("%i-%02.0f-01", year, month(current_month))),
    prev_month = month - months(1)
  )
year from to delta direction month prev_month
2018 16751441350 17019003787 267562437 up 2018-07-01 2018-06-01
2019 16449779883 17021113541 571333658 up 2019-07-01 2019-06-01
2020 18238770491 18099564641 -139205850 down 2020-07-01 2020-06-01
2021 18348885876 18598614698 249728822 up 2021-07-01 2021-06-01
2022 17104956625 17857938700 752982075 up 2022-07-01 2022-06-01

Chart

library(scales)
library(hrbrthemes)

p <- ggplot() +
  theme_ipsum_rc(
    grid = "Yy",
    base_family = "Montserrat"
  )

Highlighting July over the years

p <- p +
  geom_point(
    aes(x = month, y = to),
    data = annotations,
    shape = 21, stroke = 1,
    size = 20, alpha = 0.5,
    color = wmf_colors$Yellow50,
    fill = wmf_colors$Yellow90
  ) +
  geom_point(
    aes(x = month, y = to),
    size = 2, # default 1.5
    color = wmf_colors$Accent30,
    data = annotations
  )

p

Line attempt No. 1

p +
  geom_line(
    aes(
      x = month,
      y = interactions
    ),
    color = wmf_colors$Accent30,
    data = metrics_ext |>
      filter(period != "during dataloss")
  )

Line attempt No. 2

p <- p +
  geom_line(
    aes(
      x = month,
      y = interactions,
      group = period
    ),
    color = wmf_colors$Accent30,
    data = metrics_ext |>
      filter(period != "during dataloss")
  )

p

Dataloss: Undercount

p <- p +
  geom_line(
    aes(
      x = month,
      y = interactions
    ),
    data = metrics_ext |>
      filter(dataloss),
    color = wmf_colors$Accent50,
    linetype = "33"
  )

p

Dataloss: Estimate

p <- p +
  geom_line(
    aes(
      x = month,
      y = interactions_corrected
    ),
    data = metrics_ext |>
      filter(dataloss),
    color = wmf_colors$Accent30,
    linetype = "92",
  )

p

Y Axis

p <- p +
  scale_y_continuous(
    name = NULL,
    labels = label_number(
      scale = 1e-9,
      suffix = " B",
      accuracy = 1
    ),
    breaks = seq(16e9, 23e9, 1e9),
    limits = c(16e9, 23e9)
  )

p

X Axis

p <- p +
  scale_x_date(
    name = NULL,
    breaks = annotations$month,
    date_labels = "%B\n%Y"
  )

p

Final Touches

p <- p +
  theme(
    plot.background =
      element_rect(fill = "white", color = "white"),
    panel.grid.major.y =
      element_line(color = wmf_colors$Base70),
    panel.grid.minor.y =
      element_line(color = wmf_colors$Base80),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14)
  ) +
  ggtitle("Content Interactions")

p

Adding Arrows

p <- p +
  geom_segment(
    aes(
      x = prev_month,
      xend = month,
      y = from, yend = to,
      color = direction
    ),
    size = 1.0, # default 0.5
    arrow = arrow(
      type = "closed",
      length = unit(0.1, "inches")
    ),
    data = annotations
  )

p

Adjusting Arrows

p <- p +
  scale_color_manual(
    values = c(
      "up" = wmf_colors$Green50,
      "down" = wmf_colors$Red50
    ),
    guide = "none"
  )

p