#!pip install --upgrade git+https://github.com/neilpquinn/wmfdata.git@release
import pandasql
from pandasql import sqldf
import datetime as dt
import pandas as pd
from collections import defaultdict
from wmfdata import spark, mariadb
USER_GROUP_FIRST = 'checkuser'
USER_GROUP_SECOND = 'sysop'
def get_number_of_total_members(dbname, ug_group):
query = '''
SELECT
COUNT(*)
FROM user_groups
WHERE
ug_expiry IS NULL AND
ug_group="{ug_group}"
'''
conn = mariadb.connect(dbname)
with conn.cursor() as cur:
cur.execute(query.format(
ug_group=ug_group
))
return cur.fetchall()[0][0]
def get_number_of_non_overlaping_members(dbname, ug_group_first, ug_group_second):
query = '''
SELECT
COUNT(*)
FROM user_groups
WHERE
ug_expiry IS NULL AND
ug_group="{ug_group_first}" AND
ug_user NOT IN (SELECT ug_user FROM user_groups WHERE ug_group='{ug_group_second}')
'''
conn = mariadb.connect(dbname)
with conn.cursor() as cur:
cur.execute(query.format(
ug_group_first=ug_group_first,
ug_group_second=ug_group_second
))
return cur.fetchall()[0][0]
def get_number_of_overlaping_members(dbname, ug_group_first, ug_group_second):
query = '''
SELECT
COUNT(*)
FROM user_groups
WHERE
ug_group="{ug_group_first}" AND
ug_user IN (SELECT ug_user FROM user_groups WHERE ug_group='{ug_group_second}')
'''
conn = mariadb.connect(dbname)
with conn.cursor() as cur:
cur.execute(query.format(
ug_group_first=ug_group_first,
ug_group_second=ug_group_second
))
return cur.fetchall()[0][0]
private_wikis = open('/srv/mediawiki-config/dblists/private.dblist', 'r').read().split('\n')
closed_wikis = open('/srv/mediawiki-config/dblists/closed.dblist', 'r').read().split('\n')
projects = []
for family in ['wikipedia']:
projects_raw = open('/srv/mediawiki-config/dblists/%s.dblist' % family, 'r').read().split('\n')
for project_raw in projects_raw:
if project_raw in private_wikis or project_raw in closed_wikis:
continue
projects.append(project_raw)
# Add non-Wikipedias projects with CUs manually, to speed up processing
projects.append('metawiki')
projects.append('commonswiki')
projects.append('wikidatawiki')
projects.append('specieswiki')
projects.append('enwikibooks')
projects.sort()
dataPanda = {
'Project': [],
'Total CUs': [],
'Non-sysop CUs': [],
'Sysop CUs': []
}
for project in projects:
total = get_number_of_total_members(project, USER_GROUP_FIRST)
if total == 0:
continue
dataPanda['Project'].append(project)
dataPanda['Total CUs'].append('{num} (100 %)'.format(num=total))
num = get_number_of_non_overlaping_members(project, USER_GROUP_FIRST, USER_GROUP_SECOND)
dataPanda['Non-sysop CUs'].append('{num} ({percentage} %)'.format(
num=num,
percentage=round(num/total*100, 1)
))
num = get_number_of_overlaping_members(project, USER_GROUP_FIRST, USER_GROUP_SECOND)
dataPanda['Sysop CUs'].append('{num} ({percentage} %)'.format(
num=num,
percentage=round(num/total*100, 1)
))
pd.DataFrame(dataPanda)