# Patch metadata (kept verbatim from the original git patch email):
#   From b0a5bdeeb342ddaa86e9e04cc2e30de2930886ad Mon Sep 17 00:00:00 2001
#   From: Ben Hartshorne
#   Date: Wed, 7 Sep 2011 19:46:18 +0000
#   Subject: [PATCH] adding tools to parse the output of storageTypeStats.php
#   ---
#    maintenance/storage/storageTypeStatsDiff.py | 113 ++++++++++++++++++++
#    maintenance/storage/storageTypeStatsSum.py  | 113 ++++++++++++++++++++
#    2 files changed, 226 insertions(+)
#    create mode 100755 maintenance/storage/storageTypeStatsDiff.py
#    create mode 100755 maintenance/storage/storageTypeStatsSum.py
#
#   diff --git a/maintenance/storage/storageTypeStatsDiff.py b/maintenance/storage/storageTypeStatsDiff.py
#   new file mode 100755
#   index 0000000000..90a9e1cfd9
#   --- /dev/null
#   +++ b/maintenance/storage/storageTypeStatsDiff.py
#   @@ -0,0 +1,113 @@
#!/usr/bin/python


"""Print the per-storage-type difference between two storageTypeStatsSum.py outputs.

    For more detail, see http://wikitech.wikimedia.org/view/Text_storage_data

    reads in two files which should contain the output of storageTypeStatsSum.py
    Parses them both and calculates the difference for each storage type
    prints this to stdout.

    For best results, give the old and new files their dates for names, eg:
    ben@fenari:~/storageStats$ ./storageTypeStatsDiff.py 2010-02-18 2011-08-31

    Example content:

ben@fenari:~/storageStats$ cat 2010-02-18
Results:
 Count Type
------------------------------------------
           9 0,external/simple pointer
         435 0/[none]
     1482941 [none]/[none]
      968957 gzip/[none]
      178234 object,external/simple pointer
        1800 object,utf-8/[none]
    17076928 utf-8,gzip/[none]
        1269 utf-8/[none]
all done!

ben@fenari:~/storageStats$ cat 2011-08-31
Results:
 Count Type
------------------------------------------
           9 0,external/simple pointer
        1435 0/[none]
     1002341 [none]/[none]
     1234212 object,external/simple pointer
         213 object,external/blob
          20 object,utf-8/[none]
      123428 utf-8,gzip/[none]
         123 utf-8/[none]
all done!

"""


import re
import optparse
import sys

# match only the actual value / key lines; ignore everything else
# (headers, separators and the trailing "all done!" contain no leading count)
VALUE_LINE = re.compile(r"^ *(?P<val>\d+) *(?P<desc>.*)$")

# one shared layout for the header row and every data row of the report
ROW_FORMAT = "%12s %12s %12s %s"


def parse_stats(lines):
    """Return a dict mapping storage type -> count parsed from *lines*.

    *lines* is any iterable of text lines (an open file works).  Lines that
    do not start with optional spaces followed by a number are skipped.
    """
    stats = {}
    for line in lines:
        match = VALUE_LINE.match(line)
        if match:
            stats[match.group('desc')] = int(match.group('val'))
    return stats


def diff_stats(old, new):
    """Return a dict mapping every storage type to ``new - old``.

    Types that appear in only one of the two dicts map to the string 'n/a'.
    """
    diff = {}
    # take the union of keys in case the two sets don't match
    for key in set(old) | set(new):
        if key in old and key in new:
            diff[key] = new[key] - old[key]
        else:
            # this happens when a key only exists in one set
            diff[key] = 'n/a'
    return diff


def format_report(old_name, new_name, old, new):
    """Return the report as a list of lines (header, separator, sorted rows)."""
    diff = diff_stats(old, new)
    report = [
        ROW_FORMAT % (old_name, new_name, 'Diff', 'Type'),
        "---------------------------------------------------------------------",
    ]
    for key in sorted(diff):
        report.append(ROW_FORMAT % (old.get(key, 'n/a'),
                                    new.get(key, 'n/a'),
                                    diff[key],
                                    key))
    return report


def read_stats(path):
    """Open *path*, parse it with parse_stats() and close it again.

    Raises IOError if the file cannot be opened or read.
    """
    with open(path, 'r') as handle:
        return parse_stats(handle)


def main(argv=None):
    """Parse the command line, print the report, return the exit status.

    *argv* is the argument list without the program name; None means use
    sys.argv[1:].  Returns 0 on success and 1 on a usage or I/O error.
    """
    ##
    ## set up argument parsing.  Require exactly two positional filenames.
    usage = "usage: %prog <old_file> <new_file>"
    desc = "Calculate the difference between two files containing storageTypeStatsSum.py output"
    parser = optparse.OptionParser(usage=usage, description=desc)
    (opts, args) = parser.parse_args(argv)
    # Require exactly two arguments
    if len(args) != 2:
        print("Two files needed.")
        parser.print_help()
        # a usage error is a failure, so report a non-zero status
        return 1

    old_name, new_name = args
    try:
        old = read_stats(old_name)
        new = read_stats(new_name)
    except IOError as e:
        print("IOError trying to open %s or %s: %s\n" % (old_name, new_name, e))
        return 1

    # print out results
    for line in format_report(old_name, new_name, old, new):
        print(line)
    return 0


if __name__ == "__main__":
    sys.exit(main())

# Patch metadata for the next file in the patch (kept verbatim; the index
# hashes continue on the following line of the original patch text):
#   diff --git a/maintenance/storage/storageTypeStatsSum.py b/maintenance/storage/storageTypeStatsSum.py
#   new file mode 100755
#   index
# Patch metadata (kept verbatim; continues the "index" line of the diff
# header for maintenance/storage/storageTypeStatsSum.py):
#   0000000000..b07265758b
#   --- /dev/null
#   +++ b/maintenance/storage/storageTypeStatsSum.py
#   @@ -0,0 +1,113 @@
#!/usr/bin/python


"""Sum the storage-type counts reported by storageTypeStats.php across all wikis.

    For more detail, see http://wikitech.wikimedia.org/view/Text_storage_data


    reads in a file which should contain the output of
    ben@hume:~$ /home/w/bin/foreachwiki maintenance/storage/storageTypeStats.php > /tmp/storageTypeStats.log
    Parses it and sums up the values for all wikis.
    prints this sum to stdout.

    Example content (NOTE: the column padding below is approximate; the
    parser only needs the class and count columns to be separated by more
    than one space):

ben@fenari:~/storageStats$ cat sample_output.txt
-----------------------------------------------------------------
aawiki
-----------------------------------------------------------------
aawiki:  Using bin size of 100
aawiki:  0^M1000^M2000^M3000^M4000^M5000^M6000^M7000^M8000^M9000^M10000^M
aawiki:
aawiki:  Flags                 Class                 Count   old_id range
aawiki:  ------------------------------------------------------------------------------------------------------------------------
aawiki:  gzip                  [none]                 4568   0 - 4700
aawiki:  [none]                [none]                 1615   4600 - 6300
aawiki:  utf-8,gzip            [none]                 1883   5300 - 8300
aawiki:  external,utf-8        CGZ pointer             626   6200 - 10300
aawiki:  external,utf-8        DHB pointer             368   9100 - 10300
aawiki:  utf-8,gzip,external   simple pointer          975   8200 - 10400
aawiki:  external,utf8         DHB pointer             211   9400 - 10200
-----------------------------------------------------------------
aawikibooks
-----------------------------------------------------------------
aawikibooks:  Using bin size of 100
aawikibooks:  0^M1000^M2000^M3000^M
aawikibooks:
aawikibooks:  Flags                 Class                 Count   old_id range
aawikibooks:  ------------------------------------------------------------------------------------------------------------------------
aawikibooks:  [none]                [none]                  881   0 - 1000
aawikibooks:  external,utf-8        CGZ pointer             187   0 - 3400
aawikibooks:  external,utf-8        DHB pointer              34   3200 - 3400
aawikibooks:  object                historyblobcurstub      898   900 - 1900
aawikibooks:  utf-8,gzip            [none]                  900   1800 - 2900
aawikibooks:  utf-8,gzip,external   simple pointer          431   2800 - 3400
aawikibooks:  external,utf8         DHB pointer              25   3300 - 3400

"""


import re
import optparse
import sys

# create a bunch of regexes to match various sections of the file
# a section starts with nothing on the line but the name of the wiki db
#aawikibooks
START_SECTION = re.compile(r"^(?P<dbname>[a-z0-9_]+)$")
#aawikibooks: external,utf-8 DHB pointer 34 3200 - 3400
# NOTE: the class group requires a space inside it, so a single-word class
# such as "[none]" is captured with one trailing space ("[none] ").  That is
# kept as-is so keys stay comparable with previously generated output.
COUNTER = re.compile(
    r"^[a-z0-9_]*: *(?P<flags>[^ ]+) +(?P<class>[^ ]+ [^ ]*) +(?P<count>\d+) +.*")


def sum_storage_types(lines):
    """Return ``(wiki_count, content_counters)`` accumulated over *lines*.

    *lines* is any iterable of text lines (an open file works).
    ``wiki_count`` is the number of section-start lines seen and
    ``content_counters`` maps "flags/class" to the summed count.
    """
    wiki_count = 0
    content_counters = {}
    for line in lines:
        if START_SECTION.match(line):
            # per-db stats aren't collected yet; only the sections are counted
            wiki_count += 1
            # a bare db name has no ':' so it can never be a counter line
            continue
        match = COUNTER.match(line)
        if match:
            # sum all unique class,flags combinations
            key = "%s/%s" % (match.group('flags'), match.group('class'))
            content_counters[key] = (content_counters.get(key, 0)
                                     + int(match.group('count')))
    return wiki_count, content_counters


def format_results(content_counters):
    """Return the summary as a list of lines, one row per key in sorted order."""
    report = [
        "Results:",
        " Count Type",
        "------------------------------------------",
    ]
    for key in sorted(content_counters):
        report.append("%12d %s" % (content_counters[key], key))
    report.append("all done!")
    return report


def main(argv=None):
    """Parse the command line, print the summed counts, return the exit status.

    *argv* is the argument list without the program name; None means use
    sys.argv[1:].  Returns 0 on success and 1 on a usage or I/O error.
    """
    ##
    ## set up argument parsing.  Require exactly one positional filename.
    usage = "usage: %prog <input_file>"
    desc = """Sum the storage types across all wikis. The input file should
contain the output of:
 foreachwiki maintenance/storage/storageTypeStats.php
"""

    parser = optparse.OptionParser(usage=usage, description=desc)
    (opts, args) = parser.parse_args(argv)
    if len(args) != 1:
        print("I can't do anything without a file to parse. Sorry!")
        parser.print_help()
        return 1

    try:
        # ok, parse the file and collect stats!
        with open(args[0], 'r') as handle:
            wiki_count, content_counters = sum_storage_types(handle)
    except IOError as e:
        # report the failure and exit non-zero, like storageTypeStatsDiff.py
        print("omg io error %s!" % e)
        return 1

    for line in format_results(content_counters):
        print(line)
    return 0


if __name__ == "__main__":
    sys.exit(main())

# Patch trailer (kept verbatim from the original git patch email):
#   --
#   2.20.1