Douglas Bagnall on Wed, 4 Sep 2019 11:49:17 +0200 (CEST) |
[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]
Re: <nettime> The effect of "Nettime is in bad shape" on user agent ratios |
# On 4/09/19 4:41 am, John Preston wrote:
# > Thanks Douglas. I like this. I would like to play with this on a
# > wider scale (listiverse). Do you have a script to scrape out the
# > headers from the archived messages or something?
#
# Not if you mean web archives. If you mean mbox files, then yes -- last
# night's script is below. If you wish to compare time periods you will
# need to cut up the mbox file yourself.
#
# Douglas
#
# ----------------8<------countmuas.py-----------------------------------
#!/usr/bin/python3
"""Count user agent headers in mbox files

USAGE: python3 countmuas.py MBOX [MBOX [...]]

The order in which the results are presented depends on the overall
counts, thus the output of

    countmuas.py A B C

is likely to look different from

    countmuas.py A; countmuas.py B; countmuas.py C

User-agent and X-Mailer headers are used where available; further
heuristics attempt to distinguish webmail providers.
"""
import mailbox
import sys
from collections import Counter
import re

# Header-name prefixes that betray the provider even when no explicit
# User-Agent / X-Mailer header is present.
is_google = re.compile('^(x-gm-|x-google)', re.I).match
is_microsoft = re.compile('^(x-ms-|x-microsoft)', re.I).match

# Substrings (after punctuation stripping and lower-casing) that identify
# Yahoo/AOL webmail.
_YAHOO_AOL_MARKERS = ('ymailnorrin', 'aolwebmail', 'yahoomail')

# Operating-system keywords looked for in Thunderbird agent strings, in
# priority order (the first one found wins).
_THUNDERBIRD_PLATFORMS = ('linux', 'macintosh', 'windows')


def _raw_label(msg):
    """Return the un-normalised agent label for a single message.

    The label is the '|'-joined list of: 'microsoft' and/or 'gmail' when
    provider-specific header names are present, the User-Agent value and
    the X-Mailer value, the latter two with version numbers removed.
    Messages with none of these yield "unknown".
    """
    header_names = msg.keys()
    parts = []
    if any(is_microsoft(h) for h in header_names):
        parts.append('microsoft')
    if any(is_google(h) for h in header_names):
        parts.append('gmail')

    ua = msg.get('User-Agent')
    if ua:
        # A header containing undecodable bytes comes back as an
        # email.header.Header object, which re.sub() rejects; str() gives
        # a plain string in both cases.
        ua = str(ua)
        ua = re.sub(r'[\d.]\w$', '', ua)
        ua = re.sub(r'\d\w?[\d.]*', '', ua)
        parts.append(ua)

    xm = msg.get('X-Mailer')
    if xm:
        xm = re.sub(r'\d+[\d.]*\w?[\d.]*', '', str(xm))
        parts.append(xm)

    label = '|'.join(parts) or "unknown"
    return re.sub(r'\s+', ' ', label).strip()


def _canonical_name(label):
    """Map a raw label from _raw_label() to a display name."""
    ua = re.sub(r'[^\w ]+', '', label).lower().strip()

    if any(marker in ua for marker in _YAHOO_AOL_MARKERS):
        return 'yahoo/aol'
    if 'thunderbird' in ua:
        for platform in _THUNDERBIRD_PLATFORMS:
            if platform in ua:
                return 'Thunderbird (%s)' % platform.title()
        # No recognised OS keyword: still group it under Thunderbird
        # rather than leaking the raw lower-cased string into the report.
        return 'Thunderbird'
    if 'mew version on emacs' in ua:
        return 'Mew (Emacs)'
    if 'cyrusjmap' in ua:
        return 'Cyrus webmail'
    if 'jaro mail' in ua:
        return 'Jaro Mail'
    if 'trojita' in ua:
        return 'Trojita'
    # "XS4ALL" loses its "4A" to the version-number stripping above,
    # which is why the marker is spelled 'xsll'.
    if 'xsll' in ua:
        return 'XS4all Webmail'
    if 'claws mail' in ua:
        return 'Claws Mail'
    if ua in ('microsoft', 'microsoftgmail'):
        return 'MS/Outlook.com/Hotmail'
    if ua == 'gmail':
        return 'Gmail'

    # Generic agent: drop the provider tag and packaging noise, then
    # capitalise for display.
    ua = ua.replace('gmail', '')
    ua = re.sub(' ?deb$', '', ua)
    ua = re.sub(r' version\s*$', '', ua)
    return ua.title()


def count_user_agents(mbox):
    """Count the mail user agents seen in one mbox file.

    mbox is the path of an existing mbox file.  Returns a Counter mapping
    display names (e.g. 'Thunderbird (Linux)', 'Gmail', 'Unknown') to the
    number of messages attributed to them.

    Raises mailbox.NoSuchMailboxError if the file does not exist; the
    file is never created as a side effect.
    """
    # create=False: the default (True) would silently create an empty
    # file when the path is mistyped.
    m = mailbox.mbox(mbox, create=False)
    try:
        ua_counts = Counter(_raw_label(msg) for msg in m)
    finally:
        m.close()

    clean = Counter()
    # Walk in most-common order so that ties in the merged result keep a
    # stable, frequency-driven insertion order.
    for label, count in ua_counts.most_common():
        clean[_canonical_name(label)] += count
    return clean


def print_user_agents(counts, names=None):
    """Print a bar chart of agent shares, one line per name.

    counts maps display name to message count.  names fixes which rows
    are printed and in what order (names missing from counts are shown
    as 0); by default the keys of counts are used, sorted.
    """
    if names is None:
        names = sorted(counts)
    total = sum(counts.values())
    for ua in names:
        count = counts.get(ua, 0)
        # An empty mailbox has total == 0; report 0% instead of dividing.
        percent = (count * 100.0 / total) if total else 0.0
        print("|%-30s %4.1f%% %s" % ('#' * int(percent + 0.5),
                                      percent, ua))


def main():
    """Command-line entry point: report agent shares for each mbox given."""
    files = sys.argv[1:]
    if not files or {'-h', '--help'}.intersection(files):
        print(__doc__)
        sys.exit()

    names = Counter()
    mbox_counts = []
    for mbox in files:
        try:
            counts = count_user_agents(mbox)
        except (mailbox.Error, OSError) as e:
            sys.exit('countmuas.py: %s: %s' % (type(e).__name__, e))
        mbox_counts.append(counts)
        names.update(counts)

    # One shared row order (overall most common first) so the per-file
    # charts can be compared line by line.
    names = [x[0] for x in names.most_common()]

    for filename, counts in zip(files, mbox_counts):
        print('----- %s -----' % filename)
        print_user_agents(counts, names)


if __name__ == '__main__':
    main()

# distributed via <nettime>: no commercial use without permission
# <nettime> is a moderated mailing list for net criticism,
# collaborative text filtering and cultural politics of the nets
# more info: http://mx.kein.org/mailman/listinfo/nettime-l
# archive: http://www.nettime.org contact: nettime@kein.org
# @nettime_bot tweets mail w/ sender unless #ANON is in Subject: