| Index: sitescripts/stats/bin/datamerger.py |
| =================================================================== |
| rename from sitescripts/logs/bin/mergeSubscriptionStats.py |
| rename to sitescripts/stats/bin/datamerger.py |
| --- a/sitescripts/logs/bin/mergeSubscriptionStats.py |
| +++ b/sitescripts/stats/bin/datamerger.py |
| @@ -10,63 +10,86 @@ |
| # Adblock Plus is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| # GNU General Public License for more details. |
| # |
| # You should have received a copy of the GNU General Public License |
| # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| -import os, re, subprocess, urllib |
| +import os, sys, re, codecs, subprocess, urllib, simplejson, traceback |
|
Sebastian Noack
2013/08/26 16:05:22
Since Python 2.6, you can use the built-in json module instead of simplejson.
|
| +import sitescripts.stats.common as common |
| from sitescripts.utils import get_config, setupStderr |
| -from ConfigParser import SafeConfigParser, NoOptionError |
| -from StringIO import StringIO |
| -def readStatsFile(path): |
| -    result = SafeConfigParser() |
| -    match = re.search(r'^ssh://(\w+)@([^/:]+)(?::(\d+))?', path) |
| +def read_stats_file(path): |
| +    """Load a JSON stats document and return the decoded object. |
| + |
| +    path may be ssh://user@host[:port], an http:// or https:// URL, or the |
| +    name of an existing local file. For ssh no remote command is passed; |
| +    whatever the connection prints on stdout is parsed as UTF-8 JSON |
| +    (presumably the server runs a forced command -- confirm). |
| + |
| +    Raises IOError if path matches none of the supported forms. |
| +    """ |
| +    match = re.search(r"^ssh://(\w+)@([^/:]+)(?::(\d+))?", path) |
|      if match: |
| -        command = ['ssh', '-q', '-o' 'NumberOfPasswordPrompts 0', '-T', '-k', '-l', match.group(1), match.group(2)] |
| +        command = ["ssh", "-q", "-o", "NumberOfPasswordPrompts 0", "-T", "-k", "-l", match.group(1), match.group(2)] |
|          if match.group(3): |
| -            command[1:1] = ['-P', match.group(3)] |
| +            # ssh selects the remote port with lowercase -p. |
| +            command[1:1] = ["-p", match.group(3)] |
|          data = subprocess.check_output(command) |
| -        result.readfp(StringIO(data)) |
| -    elif path.startswith('http://') or path.startswith('https://'): |
| -        result.readfp(urllib.urlopen(path)) |
| +        return simplejson.loads(data.decode("utf-8")) |
| +    elif path.startswith("http://") or path.startswith("https://"): |
| +        response = urllib.urlopen(path) |
| +        try: |
| +            # loads(), not load(): the body is already a decoded string, |
| +            # not a file-like object. |
| +            return simplejson.loads(response.read().decode("utf-8")) |
| +        finally: |
| +            response.close() |
|      elif os.path.exists(path): |
| -        result.read(path) |
| -    return result |
| +        with codecs.open(path, "rb", encoding="utf-8") as stats_file: |
| +            return simplejson.load(stats_file) |
| -def getStatsFiles(): |
| +    raise IOError("Path '%s' not recognized" % path) |
| + |
| +def get_stats_files(mirrors): |
| +    """Yield a two-item [first_field, second_field] list per mirror option. |
| + |
| +    mirrors is a sequence of mirror names, each looked up as the option |
| +    "mirror_<name>" in the [stats] config section. When it is empty, every |
| +    option whose name starts with "mirror_" is used instead. |
| + |
| +    An option value must consist of two whitespace-separated fields. |
| +    Missing options and malformed values are reported on stderr and |
| +    skipped. |
| +    """ |
|      config = get_config() |
| -    for option in config.options('subscriptionStats'): |
| -        match = re.search(r'^mirror_(.*)', option, re.I) |
| -        if match: |
| -            yield (match.group(1), config.get('subscriptionStats', option)) |
| +    if mirrors: |
| +        options = ["mirror_" + name for name in mirrors] |
| +    else: |
| +        options = [name for name in config.options("stats") if name.startswith("mirror_")] |
| +    for option in options: |
| +        if not config.has_option("stats", option): |
| +            print >>sys.stderr, "Option '%s' not found in the configuration" % option |
| +            continue |
| +        value = config.get("stats", option) |
| +        # Validate with the same whitespace rule that does the splitting, so |
| +        # a tab-separated value is accepted and an empty field is rejected. |
| +        fields = value.split(None, 1) |
| +        if len(fields) == 2: |
| +            yield fields |
| +        else: |
| +            print >>sys.stderr, "Option '%s' has invalid value: '%s'" % (option, value) |
| -def mergeStatsFile(mirrorName, config1, config2): |
| -    def increaseOption(section, option, increase): |
| -        if config1.has_option(section, option): |
| -            oldval = config1.getint(section, option) |
| -            config1.set(section, option, str(oldval + increase)) |
| +def merge_objects(object1, object2): |
| +    """Recursively add the contents of object2 into object1, in place. |
| + |
| +    Keys missing from object1 are copied over by reference. For keys |
| +    present in both, numeric values are summed; any other value is treated |
| +    as a nested dict and merged recursively. |
| +    """ |
| +    for key, value in object2.iteritems(): |
| +        if key in object1: |
| +            # Include long and float: a JSON integer above sys.maxint decodes |
| +            # to long, and would otherwise be recursed into like a dict. |
| +            if isinstance(value, (int, long, float)): |
| +                object1[key] += value |
| +            else: |
| +                merge_objects(object1[key], value) |
|          else: |
| -            config1.set(section, option, str(increase)) |
| +            object1[key] = value |
| -    for section in config2.sections(): |
| -        if not config1.has_section(section): |
| -            config1.add_section(section) |
| -        for option in config2.options(section): |
| -            increase = config2.getint(section, option) |
| -            increaseOption(section, option, increase) |
| +def merge_stats_file(server_type, data): |
| +    """Merge downloaded stats into the JSON files kept on disk. |
| + |
| +    data is iterated as month -> name -> stats object. Each (month, name) |
| +    pair is stored at <dataDirectory>/<server_type>/<month>/<name>.json, |
| +    with every path component passed through common.filename_encode. An |
| +    existing file is loaded, merged via merge_objects and rewritten; |
| +    missing directories are created. |
| +    """ |
| +    root = os.path.join(get_config().get("stats", "dataDirectory"), common.filename_encode(server_type)) |
| +    for month, entries in data.iteritems(): |
| +        for name, incoming in entries.iteritems(): |
| +            target = os.path.join(root, common.filename_encode(month), common.filename_encode(name + ".json")) |
| +            # Start from an empty object unless a file already exists. |
| +            merged = {} |
| +            if os.path.exists(target): |
| +                with codecs.open(target, "rb", encoding="utf-8") as source: |
| +                    merged = simplejson.load(source) |
| -            match = re.search(r'^(\S+) (hits|bandwidth)$', option, re.I) |
| -            if match: |
| -                increaseOption(section, '%s %s mirror %s' % (match.group(1), match.group(2), mirrorName), increase) |
| +            merge_objects(merged, incoming) |
| -if __name__ == '__main__': |
| +            parent = os.path.dirname(target) |
| +            if not os.path.exists(parent): |
| +                os.makedirs(parent) |
| + |
| +            with codecs.open(target, "wb", encoding="utf-8") as sink: |
| +                simplejson.dump(merged, sink, indent=2, sort_keys=True) |
| + |
| +def merge_mirror_stats(mirrors): |
| +    """Fetch and merge the stats of each mirror from get_stats_files(mirrors). |
| + |
| +    A failure while reading or merging one mirror is reported on stderr |
| +    together with its traceback, and processing continues with the next |
| +    mirror. |
| +    """ |
| +    for server_type, path in get_stats_files(mirrors): |
| +        try: |
| +            merge_stats_file(server_type, read_stats_file(path)) |
| +        except Exception: |
| +            # Not a bare except: KeyboardInterrupt and SystemExit must still |
| +            # abort the whole run instead of being logged and skipped. |
| +            print >>sys.stderr, "Unable to merge stats for '%s'" % path |
| +            traceback.print_exc() |
| + |
| +if __name__ == "__main__": |
|      setupStderr() |
| - |
| -    result = readStatsFile(get_config().get('subscriptionStats', 'mainFile')) |
| -    for (mirrorName, statsFile) in getStatsFiles(): |
| -        mergeStatsFile(mirrorName, result, readStatsFile(statsFile)) |
| -    file = open(get_config().get('subscriptionStats', 'mainFile'), 'wb') |
| -    result.write(file) |
| +    # Command-line arguments are passed on as the list of mirror names. |
| +    requested_mirrors = sys.argv[1:] |
| +    merge_mirror_stats(requested_mirrors) |