from wmfdata import mariadb, utils
import pandas as pd
import json
WIKI = 'testwiki'
def get_mentee_data_by_id(mentee_id):
filteredDf = menteeDataDf.loc[menteeDataDf.mentee_id == mentee_id]
if filteredDf.count().mentee_id != 1:
utils.print_err('ERROR: Duplicate user ID: %s' % mentee_id)
return {}
return json.loads(list(filteredDf.mentee_data)[0])
growthexperiments_mentee_data
table¶menteeDataDf = mariadb.run('''
SELECT *
FROM growthexperiments_mentee_data
''', WIKI, use_x1=True)
# just to show some data as an example of format
get_mentee_data_by_id(menteeDataDf.mentee_id[0])
{'username': 'Tgr', 'reverted': 0, 'questions': 0, 'editcount': 28, 'registration': '20060713135958', 'last_active': '20210711111411', 'blocks': 0}
get_mentee_data_by_id(254)
{'username': 'Tgr', 'reverted': 0, 'questions': 0, 'editcount': 28, 'registration': '20060713135958', 'last_active': '20210711111411', 'blocks': 0}
# store mismatched IDs, format: id: fields
mismatched_ids = {}
def validate_fields_internal(query, fields, fill_na=None):
"""
Validate fields according to data in analytics replicas
"""
df = mariadb.run(query, WIKI)
processed_ids = []
mismatched_total = 0
for mentee_id in menteeDataDf.mentee_id:
mentee = get_mentee_data_by_id(mentee_id)
dfFiltered = df.loc[df.user_id == mentee_id]
# prepare error data, in case it's needed
mentee_ok = True
mentee_mismatched_fields = []
mentee_data_available = dfFiltered.count().user_id != 0
for field in fields:
if mentee_data_available:
value = list(dfFiltered[field])[0]
else:
if fill_na is not None:
value = fill_na
else:
mentee_ok = False
mentee_mismatched_fields.append('MISSING-%s' % field)
continue
if value != mentee.get(field):
mentee_ok = False
mentee_mismatched_fields.append(field)
if not mentee_ok:
mismatched_total += 1
if mentee_id not in mismatched_ids:
mismatched_ids[mentee_id] = mentee_mismatched_fields
else:
mismatched_ids[mentee_id] += mentee_mismatched_fields
processed_ids.append(mentee_id)
return mismatched_total
def validate_fields(query, fields, fill_na=None):
num_of_errors = validate_fields_internal(query.format(
user_ids=', '.join([str(x) for x in menteeDataDf.mentee_id])
), fields, fill_na)
if num_of_errors == 0:
utils.print_err('All OK!')
else:
utils.print_err('ERROR: Found %s mismatched IDs' % num_of_errors)
user_name
, user_registration
, user_editcount
¶validate_fields('''
SELECT
user_id,
user_name AS username,
user_registration AS registration,
user_editcount AS editcount
FROM user
WHERE
user_id IN ({user_ids})
''', ['username', 'registration', 'editcount'])
All OK!
reverted
¶validate_fields('''
SELECT
actor_user AS user_id,
COUNT(*) AS reverted
FROM change_tag
JOIN revision ON ct_rev_id=rev_id
JOIN revision_actor_temp ON rev_id=revactor_rev
JOIN actor ON revactor_actor=actor_id
WHERE
ct_tag_id IN (SELECT ctd_id FROM change_tag_def WHERE ctd_name IN ('mw-reverted'))
AND actor_user IN ({user_ids})
GROUP BY user_id
''', ['reverted'], fill_na=0)
All OK!
questions
¶validate_fields('''
SELECT
actor_user AS user_id,
COUNT(*) AS questions
FROM change_tag
JOIN revision ON ct_rev_id=rev_id
JOIN revision_actor_temp ON rev_id=revactor_rev
JOIN actor ON revactor_actor=actor_id
WHERE
ct_tag_id IN (SELECT ctd_id FROM change_tag_def WHERE ctd_name IN ('mentorship module question', 'mentorship panel question'))
AND actor_user IN ({user_ids})
GROUP BY user_id
''', ['questions'], fill_na=0)
All OK!
last_active
¶validate_fields('''
SELECT
actor_user AS user_id,
MAX(rev_timestamp) AS last_active
FROM revision
JOIN revision_actor_temp ON rev_id=revactor_rev
JOIN actor ON revactor_actor=actor_id
WHERE
actor_user IN ({user_ids})
GROUP BY user_id
''', ['last_active'])
All OK!
blocks
¶validate_fields('''
SELECT
user_id,
COUNT(*) AS blocks
FROM logging
JOIN user ON log_title=REPLACE(user_name, ' ', '_')
WHERE
log_type='block'
AND log_action='block'
AND user_id IN ({user_ids})
GROUP BY user_id
''', ['blocks'], fill_na=0)
All OK!
if len(mismatched_ids) == 0:
utils.print_err('All OK! ')
else:
utils.print_err('ERROR: Found %s mismatched IDs' % len(mismatched_ids))
utils.print_err(mismatched_ids)
All OK!