In [1]:
from pyspark.sql import functions as F, types as T

dates = F.lit(False)
for day in range(1, 8):
    dates |= F.col('day') == day
    
PARTITION_COND = (
    (F.col('year') == 2020)
    & (F.col('month') == 8)
    & dates
)
In [2]:
df = (
    spark.read.table('event.mediawiki_cirrussearch_request')
    .where(PARTITION_COND)
    # Full text only defaults to "full_text", we end up excluding some keyword usage.
    # could use elasticsearch_requests.syntax instead
    .where(F.expr('array_contains(elasticsearch_requests.query_type, "full_text")'))
    .select('database', 'identity', 'source',
            F.explode('elasticsearch_requests').alias('req'))
    .where(F.col('req.query_type') == 'full_text')
    .withColumn('queried_commonswiki',
                F.array_contains(F.col('req.indices'), "commonswiki")
                | F.array_contains(F.col('req.indices'), "commonswiki_file"))
    .withColumn('is_api_search',
                F.col('queried_commonswiki') & (F.col('source') == 'api'))
    .withColumn('is_web_search',
                F.col('queried_commonswiki') & (F.col('source') == 'web'))
)
In [3]:
def sum_bool(col_name):
    return F.sum(F.col(col_name).cast(T.IntegerType())).alias(col_name)

df_stats_by_wiki = (
    df
    # Transform into per-identity booleans indicating if that identity performed
    # a kind of search
    .groupBy('database', 'identity')
    .agg(
        F.max(F.col('is_api_search')).alias('used_api_search'),
        F.max(F.col('is_web_search')).alias('used_web_search')
    )
    # counts identities per class per wiki
    .groupBy('database')
    .agg(
        F.sum(F.lit(1)).alias('total_identities'),
        sum_bool('used_api_search'),
        sum_bool('used_web_search')
    )
)
In [4]:
pd = df_stats_by_wiki.toPandas()
In [5]:
pd.sort_values('used_api_search', ascending=False).iloc[:20]
Out[5]:
database total_identities used_api_search used_web_search
489 dewiki 2180187 452551 1872
499 eswiki 999195 356520 1811
437 frwiki 922926 237080 1585
37 ruwiki 631525 203108 861
250 jawiki 577600 200526 556
446 zhwiki 418526 188563 1204
799 commonswiki 354951 147206 204496
289 itwiki 499815 129828 325
395 ptwiki 240944 94205 486
505 arwiki 242108 88745 567
775 plwiki 155574 64164 245
821 kowiki 80838 46795 315
147 nlwiki 216449 43201 292
173 idwiki 216110 42368 665
16 cswiki 98405 39623 130
418 fawiki 95532 36433 306
464 viwiki 60649 24612 103
615 trwiki 113171 22467 142
806 hewiki 65063 21828 2533
460 svwiki 121317 21586 78
In [6]:
pd.sort_values('used_web_search', ascending=False).iloc[:20]
Out[6]:
database total_identities used_api_search used_web_search
799 commonswiki 354951 147206 204496
125 enwiki 10276043 1765 5997
239 enwikivoyage 10275 3 4145
552 simplewiki 33660 8205 2762
806 hewiki 65063 21828 2533
495 wikidatawiki 16983 127 2170
489 dewiki 2180187 452551 1872
499 eswiki 999195 356520 1811
437 frwiki 922926 237080 1585
446 zhwiki 418526 188563 1204
514 specieswiki 5347 994 1037
37 ruwiki 631525 203108 861
173 idwiki 216110 42368 665
490 dewikivoyage 833 3 651
505 arwiki 242108 88745 567
250 jawiki 577600 200526 556
395 ptwiki 240944 94205 486
289 itwiki 499815 129828 325
821 kowiki 80838 46795 315
418 fawiki 95532 36433 306
In [ ]: