Import statements

In [67]:
#!pip install --upgrade git+https://github.com/neilpquinn/wmfdata.git@release
In [68]:
import pandasql
from pandasql import sqldf

import datetime as dt

import pandas as pd

from collections import defaultdict

from wmfdata import spark, mariadb

Configuration

In [69]:
USER_GROUP_FIRST = 'checkuser'
USER_GROUP_SECOND = 'sysop'

Helper functions

In [70]:
def get_number_of_total_members(dbname, ug_group):
    query = '''
    SELECT
        COUNT(*)
    FROM user_groups
    WHERE
        ug_expiry IS NULL AND
        ug_group="{ug_group}"
    '''
    conn = mariadb.connect(dbname)
    with conn.cursor() as cur:
        cur.execute(query.format(
            ug_group=ug_group
        ))
        return cur.fetchall()[0][0]
In [71]:
def get_number_of_non_overlaping_members(dbname, ug_group_first, ug_group_second):
    query = '''
    SELECT
        COUNT(*)
    FROM user_groups
    WHERE
        ug_expiry IS NULL AND
        ug_group="{ug_group_first}" AND
        ug_user NOT IN (SELECT ug_user FROM user_groups WHERE ug_group='{ug_group_second}')
    '''
    conn = mariadb.connect(dbname)
    with conn.cursor() as cur:
        cur.execute(query.format(
            ug_group_first=ug_group_first,
            ug_group_second=ug_group_second
        ))
        return cur.fetchall()[0][0]
In [72]:
def get_number_of_overlaping_members(dbname, ug_group_first, ug_group_second):
    query = '''
    SELECT
        COUNT(*)
    FROM user_groups
    WHERE
        ug_group="{ug_group_first}" AND
        ug_user IN (SELECT ug_user FROM user_groups WHERE ug_group='{ug_group_second}')
    '''
    conn = mariadb.connect(dbname)
    with conn.cursor() as cur:
        cur.execute(query.format(
            ug_group_first=ug_group_first,
            ug_group_second=ug_group_second
        ))
        return cur.fetchall()[0][0]

Program

In [73]:
private_wikis = open('/srv/mediawiki-config/dblists/private.dblist', 'r').read().split('\n')
closed_wikis = open('/srv/mediawiki-config/dblists/closed.dblist', 'r').read().split('\n')

projects = []
for family in ['wikipedia']:
    projects_raw = open('/srv/mediawiki-config/dblists/%s.dblist' % family, 'r').read().split('\n')
    for project_raw in projects_raw:
        if project_raw in private_wikis or project_raw in closed_wikis:
            continue
        projects.append(project_raw)

# Add non-Wikipedias projects with CUs manually, to speed up processing
projects.append('metawiki')
projects.append('commonswiki')
projects.append('wikidatawiki')
projects.append('specieswiki')
projects.append('enwikibooks')

projects.sort()

dataPanda = {
    'Project': [],
    'Total CUs': [],
    'Non-sysop CUs': [],
    'Sysop CUs': []
}

for project in projects:
    total = get_number_of_total_members(project, USER_GROUP_FIRST)
    if total == 0:
        continue
    
    dataPanda['Project'].append(project)
    dataPanda['Total CUs'].append('{num} (100 %)'.format(num=total))
    num = get_number_of_non_overlaping_members(project, USER_GROUP_FIRST, USER_GROUP_SECOND)
    dataPanda['Non-sysop CUs'].append('{num} ({percentage} %)'.format(
        num=num,
        percentage=round(num/total*100, 1)
    ))
    num = get_number_of_overlaping_members(project, USER_GROUP_FIRST, USER_GROUP_SECOND)
    dataPanda['Sysop CUs'].append('{num} ({percentage} %)'.format(
        num=num,
        percentage=round(num/total*100, 1)
    ))

pd.DataFrame(dataPanda)
Out[73]:
Project Total CUs Non-sysop CUs Sysop CUs
0 arwiki 6 (100 %) 0 (0.0 %) 6 (100.0 %)
1 bnwiki 2 (100 %) 0 (0.0 %) 2 (100.0 %)
2 cawiki 2 (100 %) 1 (50.0 %) 1 (50.0 %)
3 commonswiki 5 (100 %) 0 (0.0 %) 5 (100.0 %)
4 cswiki 3 (100 %) 0 (0.0 %) 3 (100.0 %)
5 dawiki 3 (100 %) 1 (33.3 %) 2 (66.7 %)
6 dewiki 5 (100 %) 0 (0.0 %) 5 (100.0 %)
7 enwiki 47 (100 %) 0 (0.0 %) 47 (100.0 %)
8 enwikibooks 2 (100 %) 0 (0.0 %) 2 (100.0 %)
9 eswiki 4 (100 %) 0 (0.0 %) 4 (100.0 %)
10 fawiki 2 (100 %) 0 (0.0 %) 2 (100.0 %)
11 fiwiki 3 (100 %) 0 (0.0 %) 3 (100.0 %)
12 frwiki 6 (100 %) 2 (33.3 %) 4 (66.7 %)
13 hewiki 5 (100 %) 2 (40.0 %) 3 (60.0 %)
14 huwiki 5 (100 %) 1 (20.0 %) 4 (80.0 %)
15 idwiki 5 (100 %) 0 (0.0 %) 5 (100.0 %)
16 itwiki 9 (100 %) 0 (0.0 %) 9 (100.0 %)
17 jawiki 5 (100 %) 0 (0.0 %) 5 (100.0 %)
18 kowiki 4 (100 %) 0 (0.0 %) 4 (100.0 %)
19 metawiki 5 (100 %) 0 (0.0 %) 5 (100.0 %)
20 mlwiki 2 (100 %) 0 (0.0 %) 2 (100.0 %)
21 nlwiki 5 (100 %) 5 (100.0 %) 0 (0.0 %)
22 plwiki 5 (100 %) 0 (0.0 %) 5 (100.0 %)
23 ptwiki 6 (100 %) 0 (0.0 %) 6 (100.0 %)
24 ruwiki 4 (100 %) 0 (0.0 %) 4 (100.0 %)
25 simplewiki 7 (100 %) 0 (0.0 %) 7 (100.0 %)
26 slwiki 2 (100 %) 0 (0.0 %) 2 (100.0 %)
27 specieswiki 3 (100 %) 0 (0.0 %) 3 (100.0 %)
28 srwiki 3 (100 %) 1 (33.3 %) 2 (66.7 %)
29 svwiki 3 (100 %) 0 (0.0 %) 3 (100.0 %)
30 testwiki 2 (100 %) 1 (50.0 %) 1 (50.0 %)
31 thwiki 2 (100 %) 0 (0.0 %) 2 (100.0 %)
32 trwiki 3 (100 %) 0 (0.0 %) 3 (100.0 %)
33 ukwiki 4 (100 %) 0 (0.0 %) 4 (100.0 %)
34 viwiki 2 (100 %) 0 (0.0 %) 2 (100.0 %)
35 wikidatawiki 5 (100 %) 0 (0.0 %) 5 (100.0 %)