This AB Test compares two different configurations of autocomplete on WMF wikis.
The control bucket is the currently deployed autocomplete; it sources its data solely from titles and redirects. The test bucket, default_sort, augments this by using the default_sort MediaWiki attribute, whose primary aim is to help sort pages in various lists:
On pages about individuals, default_sort re-orders the title by putting the last name first: John Smith becomes Smith, John, allowing searchers to get suggestions when they only know the last name.
Lists are also re-ordered the same way, allowing searchers to get suggestions on these pages without typing "List of" first.
The data is sourced from the Search Satisfaction event log, found in event.searchsatisfaction. This schema collects information about the search queries users perform and their interaction with the results and the pages they visit from those results. The selection of metrics presented here draw inspiration from historical AB tests run by the Search Platform team.
This test is a followup of a previous A/B test.
The report provides only aggregate event data. According to the data publication guidelines this is a Low Risk report as it contains only project level analysis.
Data Gathering
Code
# Standard library
from collections import defaultdict
from datetime import datetime
import os
from textwrap import dedent

# Third party
import great_tables as gt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyspark.sql import functions as F, Window
from scipy.stats import gaussian_kde
import seaborn as sns
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.proportion import proportion_confint, proportions_ztest
from IPython.display import display, display_html, Markdown

# Project local
from discolytics.cli.search_satisfaction_metrics import reporting_dimensions, satisfaction_metrics
from discolytics.hive import HivePartitionTimeRange
from discolytics.spark import sum_bool
import wmfdata

# Two-bucket color palette (control, test).
# palette = bokeh.palettes.Spectral[6]
palette = ('#3288bd', '#99d594')
sns.set_style("whitegrid")
Code
#os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'
spark = wmfdata.spark.create_custom_session(
    master="yarn",
    spark_config={
        # The toPandas() kept blowing up at 8g when collecting a weeks data
        "spark.driver.memory": "12g",
        "spark.executor.memory": "8g",
        "spark.executor.memoryOverhead": "4g",
        "spark.dynamicAllocation.maxExecutors": 10,
        "spark.driver.maxResultSize": "2g",
    })


# TODO: Move to discolytics (or wmfdata?) as helper fn?
def report_known_loggers(spark):
    """Return (root log level, {logger name: level}) for all log4j loggers
    that have an explicitly configured level."""
    log4j = spark.sparkContext._jvm.org.apache.log4j
    rootLogger = log4j.LogManager.getRootLogger()
    loggers = rootLogger.getLoggerRepository().getCurrentLoggers()
    # FIX: `result` was used without ever being initialized, raising
    # NameError as soon as a logger with an explicit level was found.
    result = {}
    while loggers.hasMoreElements():
        child = loggers.nextElement()
        if child.getLevel() is not None:
            result[child.getName()] = child.getLevel()
    return rootLogger.getLevel(), result


def suppress_logs(spark):
    """Silence noisy spark-sql INFO/WARN output for the notebook session."""
    logger = spark.sparkContext._jvm.org.apache.log4j
    logger.LogManager.getLogger("org.apache.spark.sql").setLevel(logger.Level.ERROR)


suppress_logs(spark)
if DEBUG:
    from pprint import pprint
    pprint(report_known_loggers(spark))
Code
def per_page_agg(df_events):
    """Collapse raw satisfaction events into one row per page view.

    Groups by (host, session, page view) and derives per-page-view metrics:
    event counts, min/max typed query length, the resolved subTest bucket,
    per-metric booleans from satisfaction_metrics(), reporting dimensions,
    and the lowest clicked result position.
    """
    grouped = df_events.groupBy(
        F.col('normalized_host'),
        F.col('event.searchSessionId'),
        F.col('event.pageViewId'),
    )
    aggregated = grouped.agg(
        F.count(F.lit(1)).alias('num_events'),
        F.min(F.length(F.col('event.query'))).alias('autocomplete_min_query_length'),
        F.max(F.length(F.col('event.query'))).alias('autocomplete_query_length'),
        F.collect_set(
            F.when(F.col('event.subTest') != 'pending', F.col('event.subTest'))
        ).alias('subTest'),
        *(
            (sum_bool(prop) > 0).alias(f'is_{name}')
            for name, prop in satisfaction_metrics().items()
            if name.startswith('autocomplete')
        ),
        *(F.first(col).alias(name) for name, col in reporting_dimensions().items()),
        # We see page views with events from multiple ips...max() seems
        # reasonable, although there are also good arguments for using sum().
        F.max('q_by_ip_day').alias('q_by_ip_day'),
        # Multiple clicks on same page? i dunno...take the lowest position
        F.min(
            F.when(F.col('event.position') >= 0, F.col('event.position'))
        ).alias('min_click_position'),
    )
    return (
        aggregated
        # We are only interested in pages that invoked autocomplete, not visited pages
        .where('is_autocomplete_serp')
        .drop('is_autocomplete_serp')
        # Resolve subTest into a final value
        .withColumn('no_assigned_subtest', F.size(F.col('subTest')) == 0)
        .withColumn('multiple_subtest', F.size(F.col('subTest')) > 1)
        # NOTE(review): F.length() here is applied to the *integer*
        # min-query-length column; Spark will cast the int to a string, so
        # this length is never 0. Presumably the intent was
        # `F.col('autocomplete_min_query_length') == 0` — TODO confirm.
        # Left as-is to preserve behavior.
        .withColumn('empty_query', F.length(F.col('autocomplete_min_query_length')) == 0)
        .withColumn('subTest', F.when(
            F.size(F.col('subTest')) == 1,
            F.col('subTest')[0]
        ).otherwise(
            F.lit('')
        ))
    )


# Per-ip-per-day window, used to count distinct page views per ip.
w_ip = (
    Window.partitionBy('year', 'month', 'day', 'ip')
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

# START_AT / END_AT come from the (unseen) parameters cell — TODO confirm.
df_source_events = HivePartitionTimeRange.from_spec(
    f'event.searchsatisfaction/@{START_AT}/{END_AT}'
).read(spark)

df_raw = (
    df_source_events
    .where(F.col("event.subTest").startswith(F.lit(ACTIVE_TEST + ':')))
    .where(~F.col('useragent.is_bot'))
    .where(F.col('useragent.browser_family') != 'HeadlessChrome')
    .withColumn('uniqueId', F.col('event.uniqueId'))
    .dropDuplicates(['uniqueId'])
    .drop('uniqueId')
    # Number of distinct page views that performed autocomplete queries
    # Note that ip isn't a great proxy, some page views see events from multiple ips.
    # But better than nothing.
    .withColumn('q_by_ip_day', F.size(F.collect_set('event.pageViewId').over(w_ip)))
    .transform(per_page_agg)
    # Pandas is more efficient if these are flat strings and not a tuple
    # ex: ar, en, zh, etc.
    .withColumn('wiki_project', F.col('normalized_host.project'))
    # ex: wikipedia, wiktionary, etc.
    .withColumn('wiki_family', F.col('normalized_host.project_family'))
    # Debug columns
    .drop('pageViewId', 'searchSessionId', 'normalized_host', 'access_method')
    .toPandas()
)

# In a quick test this cut a days data from 1.6GB -> 120MB
for col_name in (
    'browser_family', 'country', 'os_family',
    'subTest', 'user_edit_bucket', 'wiki_family', 'wiki_project',
):
    df_raw[col_name] = df_raw[col_name].astype('category')
Data Cleaning
Code
# Boolean masks over df_raw identifying page views to exclude.
# NOTE(review): possible_empty_state_sessions is computed but never folded
# into invalid_page_views — presumably intentional; confirm.
possible_empty_state_sessions = df_raw['empty_query']
no_assigned_test = df_raw['no_assigned_subtest']
multiple_tests = df_raw['multiple_subtest']
too_many_queries = df_raw['q_by_ip_day'] > DAILY_QUERY_THRESHOLD
active_test = df_raw['subTest']
wrong_active_test = (
    ~no_assigned_test
    & ~multiple_tests
    & ~active_test.str.startswith(ACTIVE_TEST + ':')
)
click_no_query = (
    df_raw['autocomplete_query_length'].isna()
    & df_raw['is_autocomplete_success']
)

invalid_page_views = (
    no_assigned_test
    | multiple_tests
    | too_many_queries
    | wrong_active_test
    | click_no_query
)

df = df_raw[~invalid_page_views].copy()
# Strip the 'ACTIVE_TEST:' prefix, leaving just the bucket name.
df['subTest'] = (
    active_test[~invalid_page_views]
    .str[len(ACTIVE_TEST) + 1:]
    .astype('category')
)
# Useful to group on
df['wiki'] = (
    df['wiki_project'].astype('str') + '.' + df['wiki_family'].astype('str')
).astype('category')


def clip_percentile(df, col_name, upper):
    """Winsorize df[col_name] in place at the `upper` quantile."""
    col = df[col_name]
    df[col_name] = col.clip(upper=col.quantile(upper))


# Max length was several thousand...clip to 95th percentile
# which seems like a reasonable way to avoid skewing the mean
# value too much (also called upper-tail winsorization).
clip_percentile(df, 'autocomplete_query_length', upper=0.95)

df_submits = df[df['is_autocomplete_submit']]
df_clicks = df[df['is_autocomplete_success']]

# Stable bucket -> color mapping used by all the plots below.
colors = {bucket: color for bucket, color in zip(df['subTest'].unique(), palette)}
Data Summary
Code
def display_h(frames, space=10):
    """Render a list of table-like objects side by side as flexbox HTML."""
    html_str = '<div style="display: flex; justify-content: space-around; flex-wrap: wrap;">'
    for frame in frames:
        html_str += f'<div style="margin-right: {space}px;">{frame._repr_html_()}</div>'
    html_str += '</div>'
    display_html(html_str, raw=True)


report_num_days = (
    datetime.strptime(END_AT, '%Y-%m-%d %H:%M:%S')
    - datetime.strptime(START_AT, '%Y-%m-%d %H:%M:%S')
).days
pv_per_day = len(df) / report_num_days  # used later in the report
pv_per_day_perwiki = df.groupby('wiki')['wiki'].count() / report_num_days

overall_summary = pd.DataFrame(
    {
        'Events': df_raw['num_events'].sum(),
        'Page Views': len(df_raw),
        'Autocomplete Submits': df_raw['is_autocomplete_submit'].sum(),
        'Autocomplete Successes': df_raw['is_autocomplete_success'].sum(),
        'Unique Wikis': len(df_raw[['wiki_project', 'wiki_family']].drop_duplicates()),
        'Days': report_num_days,
    }.items(),
    columns=('name', 'value'),
)
overall_summary

overall_summary_tbl = (
    gt.GT(overall_summary)
    .tab_header('Summary of Events')
    .tab_style(
        style=gt.style.text(weight='bold'),
        locations=gt.loc.body(
            columns=['name'],
            rows=overall_summary.index.to_list(),
        ),
    )
    .cols_label(
        name='Metric',
        value='Count',
    )
    .fmt_number(columns=['value'], decimals=0)
)

cleaning_summary = pd.DataFrame(
    (
        (k, v if isinstance(v, str) else f'{v:,}')
        for k, v in {
            'Page views with no assigned test': no_assigned_test.sum(),
            'Page views with multiple test buckets': multiple_tests.sum(),
            'Page views with non-active tests': wrong_active_test.sum(),
            'Page views with too many daily searches': too_many_queries.sum(),
            'Page views with a click but no query': click_no_query.sum(),
            'Total filtered page views': invalid_page_views.sum(),
            'Filtered page views': f'{100 * invalid_page_views.sum() / len(df_raw):.1f}%',
        }.items()
    ),
    columns=('name', 'value'),
)
cleaning_summary_tbl = (
    gt.GT(cleaning_summary)
    .tab_header('Data Cleaning Summary')
    .tab_style(
        style=gt.style.text(weight='bold'),
        locations=gt.loc.body(
            columns=['name'],
            rows=cleaning_summary.index.to_list(),
        ),
    )
    .cols_label(
        name='Metric',
        value='Count',
    )
)
display_h([overall_summary_tbl, cleaning_summary_tbl])
Summary of Events
Metric
Count
Events
31,034,658
Page Views
3,061,278
Autocomplete Submits
2,936,357
Autocomplete Successes
1,654,271
Unique Wikis
21
Days
7
Data Cleaning Summary
Metric
Count
Page views with no assigned test
0
Page views with multiple test buckets
2
Page views with non-active tests
0
Page views with too many daily searches
89,574
Page views with a click but no query
4,124
Total filtered page views
93,628
Filtered page views
3.1%
Results of Statistical Analysis
Code
def per_wiki_sig_boolean_metric(title, df, agg_col, bool_col, alpha=0.05, min_obs=1000, include_all=False):
    """Display a table of per-`agg_col` groups with significant changes in `bool_col`.

    Runs a two-sided proportions z-test per group, applies Benjamini-Hochberg
    FDR correction, and renders the (by default only significant) rows as a
    great_tables table. `min_obs` below 1 is treated as a fraction of len(df).
    """
    if min_obs < 1:
        min_obs = int(len(df) * min_obs)
    # Only applicable to boolean metrics that can be represented as a number of
    # successes and a number of observations.
    # FIX: use 'sum'/'size' strings — passing np.sum/np.size to named
    # aggregation is deprecated in modern pandas (identical results).
    agg_df = (
        df
        .groupby([agg_col, 'subTest'])
        .agg(
            metric_sum=(bool_col, 'sum'),
            metric_counts=(bool_col, 'size'),
        )
        .reset_index()
    )
    pivot_successes = agg_df.pivot(index=agg_col, columns='subTest', values='metric_sum')
    pivot_counts = agg_df.pivot(index=agg_col, columns='subTest', values='metric_counts')
    # Only keep groups observed in both buckets.
    common = pivot_successes.dropna().index.intersection(pivot_counts.dropna().index)
    pivot_successes = pivot_successes.loc[common]
    pivot_counts = pivot_counts.loc[common]
    results = []
    for i, idx in enumerate(common):
        successes_control = pivot_successes['control'].values[i]
        successes_test = pivot_successes[TEST_BUCKET].values[i]
        counts_control = pivot_counts['control'].values[i]
        counts_test = pivot_counts[TEST_BUCKET].values[i]
        if counts_control + counts_test < min_obs:
            continue
        z_stat, p_value = proportions_ztest(
            [successes_control, successes_test],
            [counts_control, counts_test],
            alternative='two-sided')
        results.append({
            agg_col: idx,
            'control': successes_control / counts_control,
            TEST_BUCKET: successes_test / counts_test,
            'p_value': p_value,
            'observations': int(counts_control + counts_test),
        })
    results_df = pd.DataFrame(results)
    # Correct for running many tests instead of a single test
    reject, pvals_corrected, *_ = multipletests(
        results_df['p_value'].values, alpha=alpha, method='fdr_bh'
    )
    results_df['p_value'] = pvals_corrected
    results_df['difference'] = results_df[TEST_BUCKET] - results_df['control']
    results_df['rel_diff'] = results_df['difference'] / results_df['control']
    results_df['lift'] = (results_df[TEST_BUCKET] - results_df['control']) / (1 - results_df['control'])
    if not include_all:
        results_df = results_df[reject]
    results_df_tbl = (
        gt.GT(results_df.sort_values('lift', ascending=False))
        .tab_header(title)
        .fmt_percent(columns=['control', TEST_BUCKET, 'difference', 'rel_diff', 'lift'], decimals=1)
        .fmt_number(columns=['p_value'], decimals=4)
        .fmt_number(columns=['observations'], decimals=0)
        .cols_label(
            rel_diff='% change',
        )
    )
    display_h([results_df_tbl])


def plot_int_distribution(title, data, color=None):
    """Plot a filled histogram of small non-negative integer data."""
    # data must be positive and relatively small numbers
    min_x = int(np.min(data))
    max_x = int(np.max(data))
    fig, ax = plt.subplots(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    x = np.arange(min_x, max_x + 1)
    y = np.bincount(data, minlength=max_x + 1)[min_x:max_x + 1]
    ax.fill_between(x, y, alpha=0.6, color=color, edgecolor='black', linewidth=1)
    plt.title(title)
    ax.set_xlim(0, max_x)
    ax.set_ylim(0, np.max(y))
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_yticks([])
    ax.set_xticks(np.arange(0, max_x + 1))
    # Remove top and right spines
    sns.despine(ax=ax, left=True)
    plt.tight_layout()
    plt.show()


def variable_precision_formatter(x, pos):
    # FIX: `math` was referenced here but never imported anywhere in the
    # notebook, so any call raised NameError; import locally.
    import math
    e = math.floor(math.log10(abs(x)))
    s = round(x / 10**e, 10)
    # NOTE(review): this function appears unfinished — it computes the
    # mantissa/exponent but returns None, so as a matplotlib formatter it
    # would render empty labels. It is unused in this report; confirm
    # intended output format before using it.


def plot_distribution(title, buckets, data, colors=None):
    """Ridge-plot KDE distributions per bucket; data maps bucket -> (bounds, samples)."""
    if colors is None:
        colors = dict(zip(buckets, palette))
    fig, ax = plt.subplots(figsize=(PLOT_WIDTH, PLOT_HEIGHT * len(buckets)))
    min_x = min(np.min(raw) for _, raw in data.values())
    max_x = max(np.max(raw) for _, raw in data.values())
    x = np.linspace(min_x, max_x, 500)
    # Calculate PDFs and scaling
    pdfs = {bucket: gaussian_kde(raw) for bucket, (_, raw) in data.items()}
    ys = {bucket: pdf(x) for bucket, pdf in pdfs.items()}
    max_y = max(np.max(ys[bucket]) for bucket in data.keys())
    scale = 0.4 / max_y
    # Plot each distribution as a ridge
    for i, bucket in enumerate(sorted(buckets, reverse=True)):
        bounds, raw = data[bucket]
        y_scaled = ys[bucket] * scale
        y_offset = i  # Vertical offset for ridge effect
        ax.fill_between(x, y_offset, y_offset + y_scaled,
                        alpha=0.6, color=colors.get(bucket, 'blue'),
                        edgecolor='black', linewidth=1, label=bucket)
        if bounds:
            lower, upper = bounds[0], bounds[-1]
            ax.plot([lower, lower], [y_offset - 0.1, y_offset + 0.1], 'k-',
                    linewidth=2)
            ax.plot([upper, upper], [y_offset - 0.1, y_offset + 0.1], 'k-',
                    linewidth=2)
            ax.plot([lower, upper], [y_offset, y_offset], 'k-', linewidth=2)
    # Styling
    plt.title(title)
    ax.set_xlim(min_x, max_x)
    ax.set_ylim(-0.4, len(buckets) - 0.6)
    ax.set_xlabel('Value')
    ax.set_yticks(range(len(buckets)))
    ax.set_yticklabels(sorted(buckets, reverse=True))
    sns.despine(ax=ax)
    plt.tight_layout()


def ci(values, alpha=0.05):
    """Bootstrap a (low, median, high) CI of the mean; returns (bounds, sorted scores).

    Boolean series use a binomial shortcut; everything else resamples with
    replacement BOOTSTRAP_ROUNDS times.
    """
    n = len(values)
    if values.dtype == bool:
        successes = np.sum(values)
        proportion = successes / n
        scores = np.random.binomial(n, proportion, BOOTSTRAP_ROUNDS) / n
    else:
        scores = (
            np.random
            .choice(values, size=n * BOOTSTRAP_ROUNDS, replace=True)
            .reshape(BOOTSTRAP_ROUNDS, -1)
            .mean(axis=1)
        )
    scores = np.sort(scores)
    low = int(BOOTSTRAP_ROUNDS * (alpha / 2))
    mid = int(BOOTSTRAP_ROUNDS / 2)
    high = int(BOOTSTRAP_ROUNDS * (1 - alpha / 2))
    return (scores[low], scores[mid], scores[high]), scores


def data_for_ci(df, extract):
    """Compute ci() per subTest bucket for the series selected by `extract`."""
    data = {}
    buckets = df['subTest'].unique()
    for bucket in sorted(buckets):
        samples = extract(df[df['subTest'] == bucket])
        data[bucket] = ci(samples)
    return data


def extract_ci(df, extract):
    """Like data_for_ci but returns only the (low, mid, high) bounds per bucket."""
    data = data_for_ci(df, extract)
    return {bucket: bounds for bucket, (bounds, scores) in data.items()}


def plot_ci(title, df, extract, colors=None):
    """Plot per-bucket bootstrap distributions and return the bounds per bucket."""
    data = data_for_ci(df, extract)
    plot_distribution(title, data.keys(), data, colors)
    return {bucket: bounds for bucket, (bounds, scores) in data.items()}


def note(content):
    """Wrap `content` in a Quarto callout-note block for Markdown display."""
    return Markdown(f"""
:::{{.callout-note appearance=simple icon=false}}
{content}
:::
""".strip())
Autocomplete Submit Rate
Code
def has_overlap(bounds):
    """True when the two buckets' confidence intervals overlap (i.e. not significant)."""
    (a_low, _, a_high), (b_low, _, b_high) = bounds.values()
    return (a_low <= b_high) and (b_low <= a_high)


def fmt_ci_pct(bounds):
    """Format (low, mid, high) as a percentage with its 95% CI."""
    low, mid, high = bounds
    return f'{mid:.2%} (95% CI: {low:.2%} - {high:.2%})'


def fmt_ci_int(bounds):
    """Format (low, mid, high) as a 3-significant-digit number with its 95% CI."""
    low, mid, high = bounds
    return f'{mid:.3} (95% CI: {low:.3} - {high:.3})'


def fmt_ab_pct(title, bounds):
    """One-sentence A/B summary of a percentage metric; always ends with a period."""
    delta = bounds[TEST_BUCKET][1] - bounds['control'][1]
    direction = 'declined' if delta < 0 else 'increased'
    if has_overlap(bounds):
        summary = 'this result is not statistically significant'
    else:
        summary = f'a percentage point delta of {delta:.2%}'
    return (
        f"{title} {direction} from "
        f"{fmt_ci_pct(bounds['control'])} in the control to "
        f"{fmt_ci_pct(bounds[TEST_BUCKET])} in the test, {summary}."
    )


def fmt_ab_int(title, bounds):
    """One-sentence A/B summary of a numeric metric; always ends with a period."""
    delta = bounds[TEST_BUCKET][1] - bounds['control'][1]
    direction = 'declined' if delta < 0 else 'increased'
    # FIX: both branches previously embedded a trailing '.' which the return
    # template duplicated, producing '..' in the rendered report.
    if has_overlap(bounds):
        summary = 'this result is not statistically significant'
    else:
        summary = f'a delta of {delta:,.2}'
    return (
        f"{title} {direction} from "
        f"{fmt_ci_int(bounds['control'])} in the control to "
        f"{fmt_ci_int(bounds[TEST_BUCKET])} in the test, {summary}."
    )


def fmt_delta(value, bounds, absolute=True, digits=2, comparative_adjective=False):
    """Scale the test-control delta by `value` and round to the nearest 10**digits.

    With comparative_adjective=True appends ' more'/' fewer'.
    """
    delta = bounds[TEST_BUCKET][1] - bounds['control'][1]
    inc = delta >= 0
    if absolute:
        delta = abs(delta)
    delta = round(value * delta, -digits)
    if comparative_adjective and absolute:
        if inc:
            adj = 'more'
        else:
            adj = 'fewer'
        return f'{delta:,.0f} {adj}'
    return f'{delta:,.0f}'


display(note("""
The Autocomplete Submit Rate represents the percentage of page views that perform at
least one autocomplete query and then submit the form, to be taken to either the
selected page or Special:Search. Higher is generally considered better, but the
control is already quite high.

This is primarily a verification that the test treatment did not have a negative
effect on the autocomplete submit rate.
"""))
bounds_submit = plot_ci(
    "Autocomplete Submit Rate", df,
    lambda x: x['is_autocomplete_submit']
)
# FIX: dropped the redundant '.' after the fmt_ab_pct() call — the helper
# already ends its sentence, so the report rendered '..'.
display(note(f"""
{fmt_ab_pct("Autocomplete submit rate", bounds_submit)} This represents approximately
{fmt_delta(pv_per_day, bounds_submit, comparative_adjective=True)} submits each day.
"""))
plt.show()
The Autocomplete Submit Rate represents the percentage of page views that perform at least one autocomplete query and then submit the form, to be taken to either the selected page or Special:Search. Higher is generally considered better, but the control is already quite high. This is primarily a verification that the test treatment did not have a negative effect on the autocomplete submit rate.
Autocomplete submit rate increased from 95.94% (95% CI: 95.91% - 95.97%) in the control to 95.97% (95% CI: 95.94% - 96.00%) in the test, this result is not statistically significant. This represents approximately 100 more submits each day.
Autocomplete Success Rate
Code
display(note("""
The Autocomplete Success Rate represents the percentage of page views that perform at least
one autocomplete query and then directly select one of the presented options. Upon selecting
an option the user is taken directly to the chosen page. An improved autocomplete should see
the success rate increase, or at least not decline.
"""))
bounds_success = plot_ci(
    "Autocomplete Success Rate", df,
    lambda x: x['is_autocomplete_success'])
# FIX: removed the redundant '.' after fmt_ab_pct() (it already ends the
# sentence) and the user-facing typos "successfull autocomplete's".
display(note(f"""
{fmt_ab_pct("Autocomplete success rate", bounds_success)} This represents an increase in successful
autocompletes by approximately {fmt_delta(pv_per_day, bounds_success)} per day.
"""))
plt.show()
The Autocomplete Success Rate represents the percentage of page views that perform at least one autocomplete query and then directly select one of the presented options. Upon selecting an option the user is taken directly to the chosen page. An improved autocomplete should see the success rate increase, or at least not decline.
Autocomplete success rate increased from 54.08% (95% CI: 53.99% - 54.16%) in the control to 54.41% (95% CI: 54.34% - 54.50%) in the test, a percentage point delta of 0.34%. This represents an increase in successful autocompletes by approximately 1,400 per day.
Mean Characters Typed Per Successful Lookup
Code
display(note("""
The number of characters a user types into the autocomplete before selecting from the choices represents
a proxy for the amount of effort required to use the autocomplete. An improved autocomplete should see
the number of characters decline, or at least not increase.
"""))
bounds_len = plot_ci(
    "Mean Characters Typed Per Successful Lookup", df_clicks,
    lambda x: x['autocomplete_query_length'])
# FIX: removed the redundant trailing '.' after fmt_ab_int() — the helper
# already ends its sentence, so the report rendered repeated periods.
display(note(f"""
{fmt_ab_int("Mean characters typed per successful lookup", bounds_len)}
"""))
plt.show()
The number of characters a user types into the autocomplete before selecting from the choices represents a proxy for the amount of effort required to use the autocomplete. An improved autocomplete should see the number of characters decline, or at least not increase.
Mean characters typed per successful lookup declined from 9.44 (95% CI: 9.43 - 9.45) in the control to 9.4 (95% CI: 9.39 - 9.41) in the test, a delta of -0.038.
We can also look at the full distribution of characters typed, rather than narrowing in on only the mean. The final data point represents all values greater than or equal to the max value, this is why the graphs have a lift at the tail. This graph is intended to provide some insight on if there were changes to the distribution.
Code
# FIX: user-facing title typo "Sucessfull" -> "Successful".
plot_int_distribution(
    f'Distribution of Characters Typed Per Successful Lookup: {TEST_BUCKET}',
    df_clicks[(df_clicks['subTest'] == TEST_BUCKET)]['autocomplete_query_length'],
    color=colors[TEST_BUCKET],
)
plot_int_distribution(
    'Distribution of Characters Typed Per Successful Lookup: control',
    df_clicks[(df_clicks['subTest'] == 'control')]['autocomplete_query_length'],
    color=colors['control'],
)
Click Position
Code
display(note(f"""
The result position of pages selected from autocomplete is another proxy for the quality of search
results. An improved autocomplete should bring the most relevant results to the top of the list,
causing the mean click position to approach 1, or at least not increase.

When users consistently click on results lower in the list, it suggests the algorithm is not
effectively prioritizing the most relevant matches. Conversely, when users find what they need in the
first few positions, it indicates the autocomplete is successfully predicting user intent and ranking
results appropriately.
"""))
# Positions are 0-indexed in the data; shift to 1-indexed for reporting.
bounds_click_pos = plot_ci(
    f'Mean click position',
    df_clicks,
    lambda x: 1 + x['min_click_position'])
display(note(f"""
{fmt_ab_int(f'Mean click position', bounds_click_pos)}, which for most practical purposes
represents no meaningful change in user behavior or algorithm performance.
"""))
plt.show()
The result position of pages selected from autocomplete is another proxy for the quality of search results. An improved autocomplete should bring the most relevant results to the top of the list, causing the mean click position to approach 1, or at least not increase.
When users consistently click on results lower in the list, it suggests the algorithm is not effectively prioritizing the most relevant matches. Conversely, when users find what they need in the first few positions, it indicates the autocomplete is successfully predicting user intent and ranking results appropriately.
Mean click position increased from 1.66 (95% CI: 1.66 - 1.66) in the control to 1.67 (95% CI: 1.67 - 1.68) in the test, a delta of 0.013, which for most practical purposes represents no meaningful change in user behavior or algorithm performance.
Breaking down click behavior by position provides additional insight into user interaction patterns. The proportion of clicks at positions 1, 2, and 3 shows how often users find their desired result among the top choices.
An improved autocomplete should increase the proportion of clicks at position 1 (indicating more users find their target result immediately) while decreasing clicks at lower positions. Position 2 and 3 clicks can indicate reasonable algorithm performance when position 1 doesn’t match user intent.
Code
# Clicks@k for the top three positions (min_click_position is 0-indexed).
for position_idx in range(3):
    click_pos_bounds = plot_ci(
        f'Percentage of Clicks at position {position_idx + 1}',
        df_clicks,
        lambda x: x['min_click_position'] == position_idx,
    )
    display(note(fmt_ab_pct(f'Clicks@{position_idx + 1}', click_pos_bounds)))
    plt.show()
Clicks@1 declined from 69.82% (95% CI: 69.72% - 69.93%) in the control to 69.10% (95% CI: 68.99% - 69.20%) in the test, a percentage point delta of -0.72%.
Clicks@2 increased from 16.86% (95% CI: 16.77% - 16.94%) in the control to 17.41% (95% CI: 17.33% - 17.49%) in the test, a percentage point delta of 0.55%.
Clicks@3 declined from 5.61% (95% CI: 5.56% - 5.66%) in the control to 5.59% (95% CI: 5.54% - 5.65%) in the test, this result is not statistically significant.
Summary and Conclusions
Code
def fmt_full_delta_pct(bounds):
    """Format a control->test percentage change, e.g. 'from 54.08% to 54.41% (a 0.34% percentage point increase)'."""
    (a_low, a_mid, a_high), (b_low, b_mid, b_high) = bounds['control'], bounds[TEST_BUCKET]
    change = 'increase' if b_mid > a_mid else 'decrease'
    return (
        f"from {a_mid:.2%} to {b_mid:.2%} "
        f"(a {b_mid - a_mid:.2%} percentage point {change})"
    )


def fmt_full_delta_int(bounds):
    """Format a control->test numeric change, e.g. 'from 9.44 to 9.40 (a decrease of -0.04)'."""
    (a_low, a_mid, a_high), (b_low, b_mid, b_high) = bounds['control'], bounds[TEST_BUCKET]
    change = 'increase' if b_mid > a_mid else 'decrease'
    # FIX: previously hard-coded "an {change}", yielding "an decrease".
    article = 'an' if change == 'increase' else 'a'
    return (
        f"from {a_mid:.2f} to {b_mid:.2f} ({article} {change} of {b_mid - a_mid:.2f})"
    )


def fmt_short_delta_int(bounds):
    """Short form: 'increased from 1.66 to 1.67'."""
    (a_low, a_mid, a_high), (b_low, b_mid, b_high) = bounds['control'], bounds[TEST_BUCKET]
    change = 'increased' if b_mid > a_mid else 'decreased'
    return (
        f"{change} from {a_mid:.2f} to {b_mid:.2f}"
    )


# FIX: the success-rate bullet read "increase in from" which, combined with
# fmt_full_delta_pct()'s leading "from", rendered "increase in from from".
# Also fixed user-facing typos: "obverve" -> "observe", "snigficant" ->
# "significant", "the improved" -> "the improvement", and the missing
# punctuation around `default_sort`.
display(Markdown(f"""
This AB Test evaluated `{ACTIVE_TEST}`, a new autocomplete configuration designed to improve recall by relying on `default_sort`,
a contributor-maintained page attribute that is used for sorting pages. This attribute generally includes a different form
by putting the last name before the first name in case of pages about individuals but also by removing common prefixes such as *List of*.

#### Key Findings

* **Autocomplete Success Rate:** The new configuration demonstrated a modest but statistically significant increase,
{fmt_full_delta_pct(bounds_success)}. This translates to approximately {fmt_delta(pv_per_day, bounds_success)} more successful
autocomplete submissions per day, indicating that the use of default_sort is helping users directly select desired pages more often.
* **Autocomplete Submit Rate:** We did not observe any significant change here.
* **User Effort (Characters Typed):** The Mean Characters Typed Per Successful Lookup improved by a tiny margin suggesting
that in some cases typing fewer characters helped users find what they were looking for.
* **Result Ranking (Click Position):** Mean average position slightly increased meaning that the improvement came at the cost of a
precision loss.

#### Interpretation

Improving the success rate without much impact on other metrics might indicate that `default_sort` is helping to solve a new class of queries
that were poorly served before.

#### Next Steps

We may want to enable this feature on these wikis.
"""))
This AB Test evaluated T404858_default_sort_2, a new autocomplete configuration designed to improve recall by relying on default_sort, a contributor-maintained page attribute that is used for sorting pages. This attribute generally includes a different form by putting the last name before the first name in case of pages about individuals but also by removing common prefixes such as List of.
Key Findings
Autocomplete Success Rate: The new configuration demonstrated a modest but statistically significant increase, from 54.08% to 54.41% (a 0.34% percentage point increase). This translates to approximately 1,400 more successful autocomplete submissions per day, indicating that the use of default_sort is helping users directly select desired pages more often.
Autocomplete Submit Rate: We did not observe any significant change here.
User Effort (Characters Typed): The Mean Characters Typed Per Successful Lookup improved by a tiny margin suggesting that in some cases typing fewer characters helped users find what they were looking for.
Result Ranking (Click Position): Mean average position slightly increased, meaning that the improvement came at the cost of a precision loss.
Interpretation
Improving the success rate without much impact on other metrics might indicate that default_sort is helping to solve a new class of queries that were poorly served before.
Next Steps
We may want to enable this feature on these wikis.
Appendix
The following tables are included as a curiosity, but not directly analyzed. Even with corrections for false discovery rates these tables are certain to be populated with both type I and type II errors. None of these individual values should be considered actionable, but in aggregate they provide support to the primary result that the submit rate held steady and the success rate increased across many dimensions. The effect was not limited to specific dimensions.
P-values are adjusted using the Benjamini-Hochberg method (FDR = 5%), meaning we expect about 5% of these significant results to be false positives. We additionally expect some of the true positives to be caused by effects other than the test treatment, particularly on groups with small sample sizes.
The following tables share the same general structure and analysis, but are aggregated over different columns. For each aggregate we present the change in both the submit and success rates.
Code
# Per-wiki breakdown of the submit rate (appendix).
per_wiki_sig_boolean_metric(
    'Wikis with Statistically Significant Changes to Autocomplete Submit Rate',
    df,
    'wiki',
    'is_autocomplete_submit',
    min_obs=0.001,
    include_all=False,
)
Wikis with Statistically Significant Changes to Autocomplete Submit Rate
wiki
control
default_sort
p_value
observations
difference
% change
lift
bg.wikipedia
93.7%
94.9%
0.0318
13,932
1.2%
1.3%
18.8%
Code
# Per-wiki breakdown of the success rate (appendix).
per_wiki_sig_boolean_metric(
    'Wikis with Statistically Significant Changes to Autocomplete Success Rate',
    df,
    'wiki',
    'is_autocomplete_success',
    min_obs=0.0001,
    include_all=False,
)
Wikis with Statistically Significant Changes to Autocomplete Success Rate
wiki
control
default_sort
p_value
observations
difference
% change
lift
cs.wikipedia
53.0%
54.7%
0.0000
132,291
1.7%
3.2%
3.6%
pl.wikipedia
54.5%
55.3%
0.0003
287,421
0.8%
1.4%
1.7%
es.wikipedia
49.8%
50.2%
0.0061
603,227
0.4%
0.8%
0.8%
de.wikipedia
56.1%
56.3%
0.0450
1,754,180
0.2%
0.3%
0.4%
Code
# Per-browser breakdown of the submit rate (appendix).
per_wiki_sig_boolean_metric(
    'Browsers with Statistically Significant Changes to Autocomplete Submit Rate',
    df,
    'browser_family',
    'is_autocomplete_submit',
    min_obs=0.001,
)
Browsers with Statistically Significant Changes to Autocomplete Submit Rate
browser_family
control
default_sort
p_value
observations
difference
% change
lift
Code
# Per-browser breakdown of the success rate (appendix).
per_wiki_sig_boolean_metric(
    'Browsers with Statistically Significant Changes to Autocomplete Success Rate',
    df,
    'browser_family',
    'is_autocomplete_success',
    min_obs=0.001,
)
Browsers with Statistically Significant Changes to Autocomplete Success Rate
browser_family
control
default_sort
p_value
observations
difference
% change
lift
Safari
58.7%
59.2%
0.0082
373,447
0.5%
0.8%
1.2%
Chrome
52.6%
53.2%
0.0000
1,132,893
0.6%
1.1%
1.2%
Edge
53.9%
54.3%
0.0279
629,719
0.3%
0.6%
0.7%
Code
# Per-OS breakdown of the submit rate (appendix).
per_wiki_sig_boolean_metric(
    'OS Families with Statistically Significant Changes to Autocomplete Submit Rate',
    df,
    'os_family',
    'is_autocomplete_submit',
    min_obs=0.001,
)
OS Families with Statistically Significant Changes to Autocomplete Submit Rate
os_family
control
default_sort
p_value
observations
difference
% change
lift
Code
# Per-OS breakdown of the success rate (appendix).
per_wiki_sig_boolean_metric(
    'OS families with Statistically Significant Changes to Autocomplete Success Rate',
    df,
    'os_family',
    'is_autocomplete_success',
    min_obs=0.001,
)
OS families with Statistically Significant Changes to Autocomplete Success Rate
os_family
control
default_sort
p_value
observations
difference
% change
lift
Mac OS X
56.6%
57.0%
0.0192
500,627
0.4%
0.7%
0.9%
Windows
53.9%
54.2%
0.0000
2,254,378
0.4%
0.7%
0.8%
Code
# Per-user-edit-bucket breakdown of the submit rate (appendix).
per_wiki_sig_boolean_metric(
    'User Edit Buckets with Statistically Significant Changes to Autocomplete Submit Rate',
    df,
    'user_edit_bucket',
    'is_autocomplete_submit',
    min_obs=0.001,
)
User Edit Buckets with Statistically Significant Changes to Autocomplete Submit Rate
user_edit_bucket
control
default_sort
p_value
observations
difference
% change
lift
Code
# Per-user-edit-bucket breakdown of the success rate (appendix).
per_wiki_sig_boolean_metric(
    'User Edit Buckets with Statistically Significant Changes to Autocomplete Success Rate',
    df,
    'user_edit_bucket',
    'is_autocomplete_success',
    min_obs=0.001,
)
User Edit Buckets with Statistically Significant Changes to Autocomplete Success Rate