# This file is part of the Adblock Plus web scripts,
# Copyright (C) 2006-present eyeo GmbH
#
# Adblock Plus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as
# published by the Free Software Foundation.
#
# Adblock Plus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Adblock Plus.  If not, see <http://www.gnu.org/licenses/>.

import argparse
import codecs
from collections import OrderedDict
from datetime import datetime, timedelta
import errno
import functools
import gzip
import json
import math
import multiprocessing
import numbers
import os
import re
import pygeoip
import socket
import subprocess
import sys
import traceback
import urllib
import urlparse

import sitescripts.stats.common as common
from sitescripts.utils import get_config, setupStderr

log_regexp = None
gecko_apps = None


class StatsFile:
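    """
      File-like wrapper for reading a log file from different sources.

      Supported paths: local file paths, http:// and https:// URLs, and
      ssh://user@host/path URLs (the remote file is streamed through an ssh
      subprocess). Paths ending in .gz are transparently decompressed via an
      external gzip process.
    """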
    def __init__(self, path):
        self._inner_file = None
        self._processes = []

        parseresult = urlparse.urlparse(path)
        if parseresult.scheme == 'ssh' and parseresult.username and parseresult.hostname and parseresult.path:
            command = [
                'ssh', '-q', '-o', 'NumberOfPasswordPrompts 0', '-T', '-k',
                '-l', parseresult.username,
                parseresult.hostname,
                parseresult.path.lstrip('/'),
            ]
            if parseresult.port:
                command[1:1] = ['-p', str(parseresult.port)]
            ssh_process = subprocess.Popen(command, stdout=subprocess.PIPE)
            self._processes.append(ssh_process)
            self._file = ssh_process.stdout
        elif parseresult.scheme in ('http', 'https'):
            self._file = urllib.urlopen(path)
        elif os.path.exists(path):
            self._file = open(path, 'rb')
        else:
            raise IOError("Path '%s' not recognized" % path)

        if path.endswith('.gz'):
            # Built-in gzip module doesn't support streaming (fixed in Python 3.2)
            gzip_process = subprocess.Popen(['gzip', '-cd'], stdin=self._file, stdout=subprocess.PIPE)
            self._processes.append(gzip_process)
            self._file, self._inner_file = gzip_process.stdout, self._file

    def __getattr__(self, name):
        return getattr(self._file, name)

    def close(self):
        self._file.close()
        if self._inner_file:
            self._inner_file.close()
        for process in self._processes:
            process.wait()


def get_stats_files():
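    """
      Yields [mirror_name, server_type, log_file] lists for every mirror_*
      option found in the [stats] section of the configuration.
    """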
    config = get_config()

    prefix = 'mirror_'
    options = filter(lambda o: o.startswith(prefix), config.options('stats'))
    for option in options:
        if config.has_option('stats', option):
            value = config.get('stats', option)
            if ' ' in value:
                yield [option[len(prefix):]] + value.split(None, 1)
            else:
                print >>sys.stderr, "Option '%s' has invalid value: '%s'" % (option, value)
        else:
            print >>sys.stderr, "Option '%s' not found in the configuration" % option


def cache_lru(func):
    """
      Decorator that memoizes the return values of a single-parameter function
      in case it is called again with the same parameter. The 1024 most recent
      results are saved.
    """

    results = OrderedDict()
    results.entries_left = 1024

    def wrapped(arg):
        if arg in results:
            result = results[arg]
            del results[arg]
        else:
            if results.entries_left > 0:
                results.entries_left -= 1
            else:
                results.popitem(last=False)
            try:
                result = func(arg)
            except:
                results.entries_left += 1
                raise
        results[arg] = result
        return result
    return wrapped


def cache_last(func):
    """
      Decorator that memoizes the last return value of a function in case it is
      called again with the same parameters.
    """
    result = {'args': None, 'result': None}

    def wrapped(*args):
        if args != result['args']:
            result['result'] = func(*args)
            result['args'] = args
        return result['result']
    return wrapped


@cache_lru
def parse_ua(ua):
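    """
      Maps a raw User-Agent string to a (browser, version) tuple, e.g.
      'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
      yields ('Firefox', '45.0'). Unrecognized agents yield ('Other', '').
    """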
    # Opera might disguise itself as another browser so it needs to go first
    match = re.search(r'\bOpera/([\d\.]+)', ua)
    if match:
        # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA
        match2 = re.search(r'\bVersion/([\d\.]+)', ua)
        if match2:
            return 'Opera', match2.group(1)
        else:
            return 'Opera', match.group(1)

    # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it
    match = re.search(r'\bOPR/(\d+\.\d+)', ua)
    if match:
        return 'Opera', match.group(1)

    # Have to check for these before Firefox; they will usually have a Firefox identifier as well
    match = re.search(r'\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)', ua)
    if match:
        if match.group(1) == 'Fennec':
            return 'Firefox Mobile', match.group(2)
        else:
            return match.group(1), match.group(2)

    match = re.search(r'\bFirefox/(\d+\.\d+)', ua)
    if match:
        if re.search(r'\bMobile;', ua):
            return 'Firefox Mobile', match.group(1)
        elif re.search(r'\bTablet;', ua):
            return 'Firefox Tablet', match.group(1)
        else:
            return 'Firefox', match.group(1)

    match = re.search(r'\brv:(\d+)\.(\d+)(?:\.(\d+))?', ua)
    if match and re.search(r'\bGecko/', ua):
        if match.group(3) and int(match.group(1)) < 2:
            return 'Gecko', '%s.%s.%s' % (match.group(1), match.group(2), match.group(3))
        else:
            return 'Gecko', '%s.%s' % (match.group(1), match.group(2))

    match = re.search(r'\bCoolNovo/(\d+\.\d+\.\d+)', ua)
    if match:
        return 'CoolNovo', match.group(1)

    match = re.search(r'\bEdge/(\d+)\.\d+', ua)
    if match:
        return 'Edge', match.group(1)

    match = re.search(r'\bChrome/(\d+\.\d+)', ua)
    if match:
        return 'Chrome', match.group(1)

    match = re.search(r'\bVersion/(\d+\.\d+)', ua)
    if match and re.search(r'\bMobile Safari/', ua):
        return 'Mobile Safari', match.group(1)
    if match and re.search(r'\bSafari/', ua):
        return 'Safari', match.group(1)

    if re.search(r'\bAppleWebKit/', ua):
        return 'WebKit', ''

    match = re.search(r'\bMSIE (\d+\.\d+)', ua)
    if match:
        return 'MSIE', match.group(1)

    match = re.search(r'\bTrident/(\d+\.\d+)', ua)
    if match:
        match2 = re.search(r'\brv:(\d+\.\d+)', ua)
        if match2:
            return 'MSIE', match2.group(1)
        else:
            return 'Trident', match.group(1)

    match = re.search(r'\bAndroidDownloadManager(?:/(\d+\.\d+))?', ua)
    if match:
        return 'Android', match.group(1) or ''

    match = re.search(r'\bDalvik/.*\bAndroid (\d+\.\d+)', ua)
    if match:
        return 'Android', match.group(1)

    # ABP/Android downloads use this user agent
    if ua.startswith('Apache-HttpClient/UNAVAILABLE'):
        return 'Android', ''

    # ABP/IE downloads use this user agent
    if ua == 'Adblock Plus':
        return 'ABP', ''

    return 'Other', ''


def process_ip(ip, geo, geov6):
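    """
      Normalizes an IP address (strips the ::ffff: IPv4-mapped prefix) and
      resolves it to a lower-case country code via GeoIP, falling back to
      'unknown'. Returns an (ip, country) tuple.
    """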
    match = re.search(r'^::ffff:(\d+\.\d+\.\d+\.\d+)$', ip)
    if match:
        ip = match.group(1)

    try:
        if ':' in ip:
            country = geov6.country_code_by_addr(ip)
        else:
            country = geo.country_code_by_addr(ip)
    except:
        traceback.print_exc()
        country = ''

    if country in (None, '', '--'):
        country = 'unknown'
    country = country.lower()

    return ip, country


@cache_last
def parse_time(timestr, tz_hours, tz_minutes):
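    """
      Parses an Apache log timestamp like '25/Mar/2016:12:34:56' and converts
      it to UTC by subtracting the timezone offset (the minutes take the sign
      of the hours). Returns (datetime, month string, day, weekday, hour).
    """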
    result = datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S')
    result -= timedelta(hours=tz_hours, minutes=math.copysign(tz_minutes, tz_hours))
    return result, result.strftime('%Y%m'), result.day, result.weekday(), result.hour


@cache_lru
def parse_path(path):
    urlparts = urlparse.urlparse(path)
    try:
        path = urllib.unquote(urlparts.path).decode('utf-8')
    except:
        path = urlparts.path
    return path[1:], urlparts.query


@cache_lru
def parse_query(query):
    return urlparse.parse_qs(query)


@cache_lru
def parse_lastversion(last_version):
    if '-' in last_version:
        last_version = last_version.split('-', 1)[0]
    return datetime.strptime(last_version, '%Y%m%d%H%M')


@cache_lru
def get_week(date):
    return date.isocalendar()[0:2]


def parse_downloader_query(info):
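    """
      Extracts downloader URL parameters (addonName, addonVersion, application
      etc.) into info and derives download statistics from the lastVersion
      parameter: downloadInterval, previousDownload and the firstDownload/
      firstInDay/firstInWeek/firstInMonth flags.
    """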
    params = parse_query(info['query'])
    for param in ('addonName', 'addonVersion', 'application', 'applicationVersion', 'platform', 'platformVersion'):
        info[param] = params.get(param, ['unknown'])[0]

    # Only leave the major and minor release number for application and platform
    info['applicationVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['applicationVersion'])
    info['platformVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['platformVersion'])

    # Chrome Adblock sends an X-Client-ID header instead of URL parameters
    match = re.match(r'^adblock/([\d\.]+)$', info['clientid'], re.I) if info['clientid'] else None
    if match:
        info['addonName'] = 'chromeadblock'
        info['addonVersion'] = match.group(1)

    last_version = params.get('lastVersion', ['unknown'])[0]
    if info['file'] == 'notification.json' and last_version == '0' and (
        (info['addonName'] == 'adblockplus' and info['addonVersion'] == '2.3.1') or
        (info['addonName'] in ('adblockpluschrome', 'adblockplusopera') and info['addonVersion'] == '1.5.2')
    ):
        # Broken notification version number in these releases, treat like unknown
        last_version = 'unknown'

    if last_version == 'unknown':
        info['downloadInterval'] = 'unknown'
        info['previousDownload'] = 'unknown'
    elif last_version == '0':
        info['downloadInterval'] = 'unknown'
        info['previousDownload'] = 'unknown'
        info['firstDownload'] = True
    else:
        try:
            last_update = parse_lastversion(last_version)
            diff = info['time'] - last_update
            if diff.days >= 365:
                info['downloadInterval'] = '%i year(s)' % (diff.days / 365)
            elif diff.days >= 30:
                info['downloadInterval'] = '%i month(s)' % (diff.days / 30)
            elif diff.days >= 1:
                info['downloadInterval'] = '%i day(s)' % diff.days
            else:
                info['downloadInterval'] = '%i hour(s)' % (diff.seconds / 3600)

            if info['addonName'].startswith('adblockplus'):
                diffdays = (info['time'].date() - last_update.date()).days
                if diffdays == 0:
                    info['previousDownload'] = 'same day'
                elif diffdays < 30:
                    info['previousDownload'] = '%i day(s)' % diffdays
                elif diffdays < 365:
                    info['previousDownload'] = '%i month(s)' % (diffdays / 30)
                else:
                    info['previousDownload'] = '%i year(s)' % (diffdays / 365)
            else:
                info['previousDownload'] = 'unknown'

            if last_update.year != info['time'].year or last_update.month != info['time'].month:
                info['firstInMonth'] = info['firstInDay'] = True
            elif last_update.day != info['time'].day:
                info['firstInDay'] = True

            if get_week(last_update) != get_week(info['time']):
                info['firstInWeek'] = True
        except ValueError:
            info['downloadInterval'] = 'unknown'
            info['previousDownload'] = 'unknown'


def parse_addon_name(file):
    if '/' in file:
        return file.split('/')[-2]
    else:
        return None


def parse_gecko_query(query):
    params = urlparse.parse_qs(query)

    version = params.get('version', ['unknown'])[0]

    global gecko_apps
    if gecko_apps is None:
        from buildtools.packagerGecko import KNOWN_APPS
        gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()}
    appID = params.get('appID', ['unknown'])[0]

    application = gecko_apps.get(appID, 'unknown')
    applicationVersion = params.get('appVersion', ['unknown'])[0]

    # Only leave the major and minor release number for application
    applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion)

    return version, application, applicationVersion


def parse_chrome_query(query):
    params = urlparse.parse_qs(query)

    if params.get('prod', ['unknown'])[0] in ('chromecrx', 'chromiumcrx'):
        application = 'chrome'
    else:
        application = 'unknown'
    applicationVersion = params.get('prodversion', ['unknown'])[0]

    params2 = urlparse.parse_qs(params.get('x', [''])[0])
    version = params2.get('v', ['unknown'])[0]

    # Only leave the major and minor release number for application
    applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion)

    return version, application, applicationVersion


def parse_update_flag(query):
    return 'update' if query == 'update' else 'install'


def parse_record(line, ignored, geo, geov6):
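    """
      Parses a single access log line into an info dictionary, or returns None
      for lines that should be skipped: unparseable lines, responses with a
      status other than 200/301/302, and files we don't collect statistics for
      (the latter are recorded in the ignored set).
    """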
    global log_regexp
    if log_regexp is None:
        log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "([^"]*)" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?')

    match = re.search(log_regexp, line)
    if not match:
        return None

    status = int(match.group(6))
    if status not in (200, 301, 302):
        return None

    info = {
        'status': status,
        'size': int(match.group(7)),
    }

    info['ip'], info['country'] = process_ip(match.group(1), geo, geov6)
    info['time'], info['month'], info['day'], info['weekday'], info['hour'] = parse_time(match.group(2), int(match.group(3)), int(match.group(4)))
    info['file'], info['query'] = parse_path(match.group(5))
    info['referrer'] = match.group(8)
    info['ua'], info['uaversion'] = parse_ua(match.group(9))
    info['fullua'] = '%s %s' % (info['ua'], info['uaversion'])
    info['clientid'] = match.group(10)

    # Additional metadata depends on file type
    filename = os.path.basename(info['file'])
    ext = os.path.splitext(filename)[1]
    if ext == '.txt' or filename == 'update.json' or filename == 'notification.json':
        # Subscription downloads, libadblockplus update checks and notification
        # checks are performed by the downloader
        parse_downloader_query(info)
    elif ext == '.tpl':
        # MSIE TPL download, no additional data here
        pass
    elif ext in ('.xpi', '.crx', '.apk', '.msi', '.exe', '.safariextz'):
        # Package download, might be an update
        info['installType'] = parse_update_flag(info['query'])
    elif filename == 'update.rdf':
        # Gecko update check or a legacy Android update check. The latter doesn't
        # have usable data anyway so trying the Gecko route won't do any harm.
        info['addonName'] = parse_addon_name(info['file'])
        info['addonVersion'], info['application'], info['applicationVersion'] = parse_gecko_query(info['query'])
    elif filename == 'updates.xml':
        # Chrome update check
        info['addonName'] = parse_addon_name(info['file'])
        info['addonVersion'], info['application'], info['applicationVersion'] = parse_chrome_query(info['query'])
    elif filename == 'updates.plist':
        # Safari update check, no additional data
        pass
    else:
        ignored.add(info['file'])
        return None

    if 'addonName' in info:
        info['fullAddon'] = '%s %s' % (info['addonName'], info['addonVersion'])
    if 'application' in info:
        info['fullApplication'] = '%s %s' % (info['application'], info['applicationVersion'])
    if 'platform' in info:
        info['fullPlatform'] = '%s %s' % (info['platform'], info['platformVersion'])
    return info


def add_record(info, section, ignore_fields=()):
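    """
      Adds the hits and bandwidth of a request to a section of the statistics
      data, then recurses into per-field subsections (at most two fields deep)
      so the data can later be broken down by any combination of up to two
      fields.
    """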
    section['hits'] = section.get('hits', 0) + 1
    section['bandwidth'] = section.get('bandwidth', 0) + info['size']

    if len(ignore_fields) < 2:
        for field in map(lambda f: f['name'], common.fields):
            if field in ignore_fields or field not in info:
                continue

            value = info[field]
            if field not in section:
                section[field] = {}
            if value not in section[field]:
                section[field][value] = {}

            add_record(info, section[field][value], ignore_fields + (field,))


def parse_fileobj(mirror_name, fileobj, geo, geov6, ignored):
    data = {}
    for line in fileobj:
        info = parse_record(line, ignored, geo, geov6)
        if info is None:
            continue

        info['mirror'] = mirror_name
        if info['month'] not in data:
            data[info['month']] = {}
        section = data[info['month']]

        if info['file'] not in section:
            section[info['file']] = {}
        section = section[info['file']]

        add_record(info, section)
    return data


def merge_objects(object1, object2, factor=1):
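    """
      Recursively merges object2 into object1: numeric leaves are added after
      being multiplied by factor (factor=-1 subtracts, reverting previously
      merged data), nested dictionaries are merged key by key.
    """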
    for key, value in object2.iteritems():
        try:
            key = unicode(key)
        except UnicodeDecodeError:
            key = unicode(key, encoding='latin-1')
        if isinstance(value, numbers.Number):
            object1[key] = object1.get(key, 0) + factor * value
        else:
            merge_objects(object1.setdefault(key, {}), value, factor)


def save_stats(server_type, data, factor=1):
    base_dir = os.path.join(get_config().get('stats', 'dataDirectory'), common.filename_encode(server_type))
    for month, month_data in data.iteritems():
        for name, file_data in month_data.iteritems():
            path = os.path.join(base_dir, common.filename_encode(month), common.filename_encode(name + '.json'))
            if os.path.exists(path):
                with codecs.open(path, 'rb', encoding='utf-8') as fileobj:
                    existing = json.load(fileobj)
            else:
                existing = {}

            merge_objects(existing, file_data, factor)

            dir = os.path.dirname(path)
            try:
                os.makedirs(dir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

            with codecs.open(path, 'wb', encoding='utf-8') as fileobj:
                json.dump(existing, fileobj, indent=2, sort_keys=True)


def parse_source(factor, lock, (mirror_name, server_type, log_file)):
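    """
      Worker function run in the multiprocessing pool: parses one log file and
      merges the result into the on-disk statistics (writes are serialized via
      lock). Returns (log_file, ignored file set), or (None, None) on failure.
    """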
    try:
        geo = pygeoip.GeoIP(get_config().get('stats', 'geoip_db'), pygeoip.MEMORY_CACHE)
        geov6 = pygeoip.GeoIP(get_config().get('stats', 'geoipv6_db'), pygeoip.MEMORY_CACHE)

        ignored = set()
        fileobj = StatsFile(log_file)
        try:
            data = parse_fileobj(mirror_name, fileobj, geo, geov6, ignored)
        finally:
            fileobj.close()

        lock.acquire()
        try:
            save_stats(server_type, data, factor)
        finally:
            lock.release()
        return log_file, ignored
    except:
        print >>sys.stderr, "Unable to process log file '%s'" % log_file
        traceback.print_exc()
        return None, None


def parse_sources(sources, factor=1, verbose=False):
    pool = multiprocessing.Pool()
    lock = multiprocessing.Manager().Lock()
    callback = functools.partial(parse_source, factor, lock)
    try:
        for log_file, ignored in pool.imap_unordered(callback, sources, chunksize=1):
            if verbose and ignored:
                print 'Ignored files for %s' % log_file
                print '============================================================'
                print '\n'.join(sorted(ignored))
    finally:
        pool.close()


if __name__ == '__main__':
    setupStderr()

    parser = argparse.ArgumentParser(description='Processes log files and merges them into the stats database')
    parser.add_argument('--verbose', dest='verbose', action='store_const', const=True, default=False, help='Verbose mode, ignored requests will be listed')
    parser.add_argument('--revert', dest='factor', action='store_const', const=-1, default=1, help='Remove log data from the database')
    parser.add_argument('mirror_name', nargs='?', help='Name of the mirror server that the file belongs to')
    parser.add_argument('server_type', nargs='?', help='Server type like download, update or subscription')
    parser.add_argument('log_file', nargs='?', help='Log file path, can be a local file path, http:// or ssh:// URL')
    args = parser.parse_args()

    if args.mirror_name and args.server_type and args.log_file:
        sources = [(args.mirror_name, args.server_type, args.log_file)]
    else:
        sources = get_stats_files()
    parse_sources(sources, args.factor, args.verbose)