--- /dev/null
+#!/usr/bin/python
+
+
+"""
+
+ For more detail, see http://wikitech.wikimedia.org/view/Text_storage_data
+
+ Reads in two files, each of which should contain the output of
+ storageTypeStatsSum.py, parses them both, calculates the difference for
+ each storage type, and prints the result to stdout.
+
+ For best results, give the old and new files their dates for names, eg:
+ ben@fenari:~/storageStats$ ./storageTypeStatsDiff.py 2010-02-18 2011-08-31
+
+ Example content:
+
+ben@fenari:~/storageStats$ cat 2010-02-18
+Results:
+ Count Type
+------------------------------------------
+ 9 0,external/simple pointer
+ 435 0/[none]
+ 1482941 [none]/[none]
+ 968957 gzip/[none]
+ 178234 object,external/simple pointer
+ 1800 object,utf-8/[none]
+ 17076928 utf-8,gzip/[none]
+ 1269 utf-8/[none]
+all done!
+
+ben@fenari:~/storageStats$ cat 2011-08-31
+Results:
+ Count Type
+------------------------------------------
+ 9 0,external/simple pointer
+ 1435 0/[none]
+ 1002341 [none]/[none]
+ 1234212 object,external/simple pointer
+ 213 object,external/blob
+ 20 object,utf-8/[none]
+ 123428 utf-8,gzip/[none]
+ 123 utf-8/[none]
+all done!
+
+"""
+
+
+import re
+import optparse
+
+##
+## set up argument parsing.
+usage = "usage: %prog <old-stats-file> <new-stats-file>"
+desc = "Calculate the difference between two files containing storageTypeStatsSum.py output"
+parser = optparse.OptionParser(usage=usage, description=desc)
+(opts, args) = parser.parse_args()
+# Require exactly two arguments
+if len(args) != 2:
+ print "Two files needed."
+ parser.print_help()
+ exit()
+
+try:
+ oldfile=open(args[0], 'r')
+ newfile=open(args[1], 'r')
+except IOError, e:
+ print "IOError trying to open %s or %s: %s\n" % (args[0], args[1], e)
+ exit(1)
+
+# match only the actual value / key lines; ignore everything else
+valueline = re.compile("^ *(?P<val>\d+) *(?P<desc>.*)$")
+
+files={}
+# ok, parse the files and collect stats!
+for file in (oldfile, newfile):
+ stats = {}
+ for line in file:
+ match = valueline.match(line)
+ if match:
+ stats[match.group('desc')] = int(match.group('val'))
+ #stats collected for one file, save it to the files dict
+ files[file.name] = stats
+
+# calculate the difference
+diff = {} # contains numbers keyed on storage types
+allkeys = []
+# collect keys from both sets in case they don't match
+for stats in files.keys():
+ # get the union of allkeys and this file's stats keys
+ allkeys = list( set(allkeys) | set(files[stats].keys()) )
+for key in allkeys:
+ try:
+ diff[key] = files[newfile.name][key] - files[oldfile.name][key]
+ except KeyError:
+ # this happens when a key only exists in one set
+ diff[key] = 'n/a'
+
+# print out results
+print "%12s %12s %12s %s" % (oldfile.name, newfile.name, 'Diff', 'Type')
+print "---------------------------------------------------------------------"
+for key in sorted(allkeys):
+ try:
+ oldval = files[oldfile.name][key]
+ except KeyError:
+ oldval = 'n/a'
+ try:
+ newval = files[newfile.name][key]
+ except KeyError:
+ newval = 'n/a'
+ diffnum = diff[key]
+ name = key
+ print "%12s %12s %12s %s" % (oldval, newval, diffnum, name)
+
--- /dev/null
+#!/usr/bin/python
+
+
+"""
+
+ For more detail, see http://wikitech.wikimedia.org/view/Text_storage_data
+
+
+ Reads in a file which should contain the output of
+ ben@hume:~$ /home/w/bin/foreachwiki maintenance/storage/storageTypeStats.php > /tmp/storageTypeStats.log
+ parses it, sums up the values across all wikis, and prints the totals
+ to stdout.
+
+ Example content:
+
+ben@fenari:~/storageStats$ cat sample_output.txt
+-----------------------------------------------------------------
+aawiki
+-----------------------------------------------------------------
+aawiki: Using bin size of 100
+aawiki: 0^M1000^M2000^M3000^M4000^M5000^M6000^M7000^M8000^M9000^M10000^M
+aawiki:
+aawiki: Flags Class Count old_id range
+aawiki: ------------------------------------------------------------------------------------------------------------------------
+aawiki: gzip [none] 4568 0 - 4700
+aawiki: [none] [none] 1615 4600 - 6300
+aawiki: utf-8,gzip [none] 1883 5300 - 8300
+aawiki: external,utf-8 CGZ pointer 626 6200 - 10300
+aawiki: external,utf-8 DHB pointer 368 9100 - 10300
+aawiki: utf-8,gzip,external simple pointer 975 8200 - 10400
+aawiki: external,utf8 DHB pointer 211 9400 - 10200
+-----------------------------------------------------------------
+aawikibooks
+-----------------------------------------------------------------
+aawikibooks: Using bin size of 100
+aawikibooks: 0^M1000^M2000^M3000^M
+aawikibooks:
+aawikibooks: Flags Class Count old_id range
+aawikibooks: ------------------------------------------------------------------------------------------------------------------------
+aawikibooks: [none] [none] 881 0 - 1000
+aawikibooks: external,utf-8 CGZ pointer 187 0 - 3400
+aawikibooks: external,utf-8 DHB pointer 34 3200 - 3400
+aawikibooks: object historyblobcurstub 898 900 - 1900
+aawikibooks: utf-8,gzip [none] 900 1800 - 2900
+aawikibooks: utf-8,gzip,external simple pointer 431 2800 - 3400
+aawikibooks: external,utf8 DHB pointer 25 3300 - 3400
+
+"""
+
+
+import re
+import optparse
+
+##
+## set up argument parsing. Require --input (or -i) and a filename.
+usage = "usage: %prog <input>"
+desc = """Sum the storage types across all wikis. The input file should
+contain the output of:
+ foreachwiki maintenance/storage/storageTypeStats.php
+"""
+
+parser = optparse.OptionParser(usage=usage, description=desc)
+(opts, args) = parser.parse_args()
+if len(args) != 1:
+ print "I can't do anything without a file to parse. Sorry!"
+ parser.print_help()
+ exit(1)
+
+input = args[0]
+
+try:
+ file=open(input, 'r')
+
+ # create a bunch of regexes to match various sections of the file
+ # a section starts with nothing on the line but the name of the wiki db
+ #aawikibooks
+ start_section = re.compile("^(?P<dbname>[a-z0-9_]+)$")
+ #aawikibooks: external,utf-8 DHB pointer 34 3200 - 3400
+ counter = re.compile("^[a-z0-9_]*: *(?P<flags>[^ ]+) +(?P<class>[^ ]+ [^ ]*) +(?P<count>\d+) +.*")
+
+ # create a bunch of counters
+ wiki_count=0
+ content_counters = dict()
+
+ # ok, parse the file and collect stats!
+ for line in file:
+ match = start_section.match(line)
+ if match:
+ # this isn't actually used yet, but is in here for when we
+ # want more interesting stats and collect per-db
+ wiki_count += 1
+ db_name=match.group('dbname')
+ match = counter.match(line)
+ if match:
+ # sum all unique class,flags combinations
+ key = "%s/%s" % (match.group('flags'), match.group('class'))
+ try:
+ content_counters[key] += int(match.group('count'))
+ except KeyError:
+ content_counters[key] = int(match.group('count'))
+
+
+except IOError, e:
+ print "omg io error %s!" % e
+ raise e
+
+print "Results:"
+print " Count Type"
+print "------------------------------------------"
+for key in sorted(content_counters.keys()):
+ print "%12d %s" % (content_counters[key], key)
+print "all done!"
+