# This file is part of the Adblock Plus web scripts,
# Copyright (C) 2006-present eyeo GmbH
#
# Adblock Plus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as
# published by the Free Software Foundation.
#
# Adblock Plus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

import argparse
import codecs
from collections import OrderedDict
from datetime import datetime, timedelta
import errno
import functools
import gzip
import json
import math
import multiprocessing
import numbers
import os
import re
import pygeoip
import socket
import subprocess
import sys
import traceback
import urllib
import urlparse

import sitescripts.stats.common as common
from sitescripts.utils import get_config, setupStderr

log_regexp = None
gecko_apps = None


class StatsFile:
    def __init__(self, path):
        self._inner_file = None
        self._processes = []

        parseresult = urlparse.urlparse(path)
        if parseresult.scheme == 'ssh' and parseresult.username and parseresult.hostname and parseresult.path:
            command = [
                'ssh', '-q', '-o', 'NumberOfPasswordPrompts 0', '-T', '-k',
                '-l', parseresult.username,
                parseresult.hostname,
                parseresult.path.lstrip('/'),
            ]
            if parseresult.port:
                command[1:1] = ['-P', str(parseresult.port)]
            ssh_process = subprocess.Popen(command, stdout=subprocess.PIPE)
            self._processes.append(ssh_process)
            self._file = ssh_process.stdout
        elif parseresult.scheme in ('http', 'https'):
            self._file = urllib.urlopen(path)
        elif os.path.exists(path):
            self._file = open(path, 'rb')
        else:
            raise IOError("Path '%s' not recognized" % path)

        if path.endswith('.gz'):
            # Built-in gzip module doesn't support streaming (fixed in Python 3.2)
            gzip_process = subprocess.Popen(['gzip', '-cd'], stdin=self._file, stdout=subprocess.PIPE)
            self._processes.append(gzip_process)
            self._file, self._inner_file = gzip_process.stdout, self._file

    def __getattr__(self, name):
        return getattr(self._file, name)

    def close(self):
        self._file.close()
        if self._inner_file:
            self._inner_file.close()
        for process in self._processes:
            process.wait()
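
# Illustrative path forms accepted above (hypothetical hosts and paths):
#   StatsFile('/var/log/nginx/access.log_20150601.gz')      # local file, gunzipped via a pipe
#   StatsFile('https://mirror.example.com/logs/access.log')
#   StatsFile('ssh://stats@mirror.example.com/print_log')   # runs the remote command, reads its stdout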


def get_stats_files():
    config = get_config()

    prefix = 'mirror_'
    options = filter(lambda o: o.startswith(prefix), config.options('stats'))
    for option in options:
        if config.has_option('stats', option):
            value = config.get('stats', option)
            if ' ' in value:
                yield [option[len(prefix):]] + value.split(None, 1)
            else:
                print >>sys.stderr, "Option '%s' has invalid value: '%s'" % (option, value)
        else:
            print >>sys.stderr, "Option '%s' not found in the configuration" % option
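
# The [stats] section is expected to hold one option per log source, named
# 'mirror_<name>' with the value '<server_type> <log_file>'. A hypothetical
# example: 'mirror_main = download ssh://stats@main.example.com/print_log'
# yields ['main', 'download', 'ssh://stats@main.example.com/print_log'].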


def cache_lru(func):
    """
    Decorator that memoizes the return values of a single-parameter function
    in case it is called again with the same parameter. The 1024 most recent
    results are saved.
    """

    results = OrderedDict()
    results.entries_left = 1024

    def wrapped(arg):
        if arg in results:
            result = results[arg]
            del results[arg]
        else:
            if results.entries_left > 0:
                results.entries_left -= 1
            else:
                results.popitem(last=False)
            try:
                result = func(arg)
            except:
                results.entries_left += 1
                raise
        results[arg] = result
        return result
    return wrapped
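
# Sketch of the intended use (hypothetical function): deleting and re-inserting
# the entry on every hit moves it to the end of the OrderedDict, which keeps
# the eviction order LRU rather than FIFO:
#
#   @cache_lru
#   def lookup(key):
#       return expensive_computation(key)
#
#   lookup('a'); lookup('a')   # second call is served from the cache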


def cache_last(func):
    """
    Decorator that memoizes the last return value of a function in case it is
    called again with the same parameters.
    """
    result = {'args': None, 'result': None}

    def wrapped(*args):
        if args != result['args']:
            result['result'] = func(*args)
            result['args'] = args
        return result['result']
    return wrapped
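
# A cache depth of one is enough for parse_time() below: consecutive log
# lines usually carry the same timestamp string, so most calls return the
# cached tuple without re-running strptime().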


@cache_lru
def parse_ua(ua):
    # Opera might disguise itself as another browser, so it needs to go first
    match = re.search(r'\bOpera/([\d\.]+)', ua)
    if match:
        # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA
        match2 = re.search(r'\bVersion/([\d\.]+)', ua)
        if match2:
            return 'Opera', match2.group(1)
        else:
            return 'Opera', match.group(1)

    # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it
    match = re.search(r'\bOPR/(\d+\.\d+)', ua)
    if match:
        return 'Opera', match.group(1)

    # Have to check for these before Firefox, they will usually have a Firefox identifier as well
    match = re.search(r'\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)', ua)
    if match:
        if match.group(1) == 'Fennec':
            return 'Firefox Mobile', match.group(2)
        else:
            return match.group(1), match.group(2)

    match = re.search(r'\bFirefox/(\d+\.\d+)', ua)
    if match:
        if re.search(r'\bMobile;', ua):
            return 'Firefox Mobile', match.group(1)
        elif re.search(r'\bTablet;', ua):
            return 'Firefox Tablet', match.group(1)
        else:
            return 'Firefox', match.group(1)

    match = re.search(r'\brv:(\d+)\.(\d+)(?:\.(\d+))?', ua)
    if match and re.search(r'\bGecko/', ua):
        if match.group(3) and int(match.group(1)) < 2:
            return 'Gecko', '%s.%s.%s' % (match.group(1), match.group(2), match.group(3))
        else:
            return 'Gecko', '%s.%s' % (match.group(1), match.group(2))

    match = re.search(r'\bCoolNovo/(\d+\.\d+\.\d+)', ua)
    if match:
        return 'CoolNovo', match.group(1)

    match = re.search(r'\bEdge/(\d+)\.\d+', ua)
    if match:
        return 'Edge', match.group(1)

    match = re.search(r'\bChrome/(\d+\.\d+)', ua)
    if match:
        return 'Chrome', match.group(1)

    match = re.search(r'\bVersion/(\d+\.\d+)', ua)
    if match and re.search(r'\bMobile Safari/', ua):
        return 'Mobile Safari', match.group(1)
    if match and re.search(r'\bSafari/', ua):
        return 'Safari', match.group(1)

    if re.search(r'\bAppleWebKit/', ua):
        return 'WebKit', ''

    match = re.search(r'\bMSIE (\d+\.\d+)', ua)
    if match:
        return 'MSIE', match.group(1)

    match = re.search(r'\bTrident/(\d+\.\d+)', ua)
    if match:
        match2 = re.search(r'\brv:(\d+\.\d+)', ua)
        if match2:
            return 'MSIE', match2.group(1)
        else:
            return 'Trident', match.group(1)

    match = re.search(r'\bAndroidDownloadManager(?:/(\d+\.\d+))?', ua)
    if match:
        return 'Android', match.group(1) or ''

    match = re.search(r'\bDalvik/.*\bAndroid (\d+\.\d+)', ua)
    if match:
        return 'Android', match.group(1)

    # ABP/Android downloads use this user agent
    if ua.startswith('Apache-HttpClient/UNAVAILABLE'):
        return 'Android', ''

    # ABP/IE downloads use this user agent
    if ua == 'Adblock Plus':
        return 'ABP', ''

    return 'Other', ''
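
# A few mappings traced through the checks above (sample user-agent strings):
#   'Opera/9.80 (Windows NT 6.1) Presto/2.12.388 Version/12.16' -> ('Opera', '12.16')
#   'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0' -> ('Firefox', '45.0')
#   'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36' -> ('Chrome', '49.0')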


def process_ip(ip, geo, geov6):
    match = re.search(r'^::ffff:(\d+\.\d+\.\d+\.\d+)$', ip)
    if match:
        ip = match.group(1)

    try:
        if ':' in ip:
            country = geov6.country_code_by_addr(ip)
        else:
            country = geo.country_code_by_addr(ip)
    except:
        traceback.print_exc()
        country = ''

    if country in (None, '', '--'):
        country = 'unknown'
    country = country.lower()

    return ip, country
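
# Example: an IPv4-mapped IPv6 address such as '::ffff:192.0.2.1' is first
# unwrapped to '192.0.2.1' and then looked up in the IPv4 database; the
# country code comes back lower-cased, or as 'unknown' if the lookup fails.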


@cache_last
def parse_time(timestr, tz_hours, tz_minutes):
    result = datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S')
    result -= timedelta(hours=tz_hours, minutes=math.copysign(tz_minutes, tz_hours))
    return result, result.strftime('%Y%m'), result.day, result.weekday(), result.hour
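
# Subtracting the timezone offset normalizes log timestamps to UTC, e.g.
# parse_time('01/Jan/2016:00:30:00', 1, 0), an entry logged at 00:30 in a
# UTC+0100 zone, yields 2015-12-31 23:30 UTC, so the hit is counted under
# month '201512'.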


@cache_lru
def parse_path(path):
    urlparts = urlparse.urlparse(path)
    try:
        path = urllib.unquote(urlparts.path).decode('utf-8')
    except:
        path = urlparts.path
    return path[1:], urlparts.query


@cache_lru
def parse_query(query):
    return urlparse.parse_qs(query)


@cache_lru
def parse_lastversion(last_version):
    if '-' in last_version:
        last_version = last_version.split('-', 1)[0]
    return datetime.strptime(last_version, '%Y%m%d%H%M')


@cache_lru
def get_week(date):
    return date.isocalendar()[0:2]
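
# Example: parse_lastversion('201512311830') gives 2015-12-31 18:30, and a
# suffixed value like '201512311830-beta' is truncated at the dash first.
# get_week() returns an ISO (year, week) pair, so get_week(datetime(2016, 1, 1))
# is (2015, 53); ISO week boundaries don't align with calendar years.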


def parse_downloader_query(info):
    params = parse_query(info['query'])
    for param in ('addonName', 'addonVersion', 'application', 'applicationVersion', 'platform', 'platformVersion'):
        info[param] = params.get(param, ['unknown'])[0]

    # Only leave the major and minor release number for application and platform
    info['applicationVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['applicationVersion'])
    info['platformVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['platformVersion'])

    # Chrome Adblock sends an X-Client-ID header instead of URL parameters
    match = re.match(r'^adblock/([\d\.]+)$', info['clientid'], re.I) if info['clientid'] else None
    if match:
        info['addonName'] = 'chromeadblock'
        info['addonVersion'] = match.group(1)

    last_version = params.get('lastVersion', ['unknown'])[0]
    if info['file'] == 'notification.json' and last_version == '0' and (
        (info['addonName'] == 'adblockplus' and info['addonVersion'] == '2.3.1') or
        (info['addonName'] in ('adblockpluschrome', 'adblockplusopera') and info['addonVersion'] == '1.5.2')
    ):
        # Broken notification version number in these releases, treat as unknown
        last_version = 'unknown'

    if last_version == 'unknown':
        info['downloadInterval'] = 'unknown'
        info['previousDownload'] = 'unknown'
    elif last_version == '0':
        info['downloadInterval'] = 'unknown'
        info['previousDownload'] = 'unknown'
        info['firstDownload'] = True
    else:
        try:
            last_update = parse_lastversion(last_version)
            diff = info['time'] - last_update
            if diff.days >= 365:
                info['downloadInterval'] = '%i year(s)' % (diff.days / 365)
            elif diff.days >= 30:
                info['downloadInterval'] = '%i month(s)' % (diff.days / 30)
            elif diff.days >= 1:
                info['downloadInterval'] = '%i day(s)' % diff.days
            else:
                info['downloadInterval'] = '%i hour(s)' % (diff.seconds / 3600)

            if info['addonName'].startswith('adblockplus'):
                diffdays = (info['time'].date() - last_update.date()).days
                if diffdays == 0:
                    info['previousDownload'] = 'same day'
                elif diffdays < 30:
                    info['previousDownload'] = '%i day(s)' % diffdays
                elif diffdays < 365:
                    info['previousDownload'] = '%i month(s)' % (diffdays / 30)
                else:
                    info['previousDownload'] = '%i year(s)' % (diffdays / 365)
            else:
                info['previousDownload'] = 'unknown'

            if last_update.year != info['time'].year or last_update.month != info['time'].month:
                info['firstInMonth'] = info['firstInDay'] = True
            elif last_update.day != info['time'].day:
                info['firstInDay'] = True

            if get_week(last_update) != get_week(info['time']):
                info['firstInWeek'] = True
        except ValueError:
            info['downloadInterval'] = 'unknown'
            info['previousDownload'] = 'unknown'


def parse_addon_name(file):
    if '/' in file:
        return file.split('/')[-2]
    else:
        return None


def parse_gecko_query(query):
    params = urlparse.parse_qs(query)

    version = params.get('version', ['unknown'])[0]

    global gecko_apps
    if gecko_apps is None:
        from buildtools.packagerGecko import KNOWN_APPS
        gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()}
    appID = params.get('appID', ['unknown'])[0]

    application = gecko_apps.get(appID, 'unknown')
    applicationVersion = params.get('appVersion', ['unknown'])[0]

    # Only leave the major and minor release number for application
    applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion)

    return version, application, applicationVersion


def parse_chrome_query(query):
    params = urlparse.parse_qs(query)

    if params.get('prod', ['unknown'])[0] in ('chromecrx', 'chromiumcrx'):
        application = 'chrome'
    else:
        application = 'unknown'
    applicationVersion = params.get('prodversion', ['unknown'])[0]

    params2 = urlparse.parse_qs(params.get('x', [''])[0])
    version = params2.get('v', ['unknown'])[0]

    # Only leave the major and minor release number for application
    applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion)

    return version, application, applicationVersion
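
# Chrome update URLs nest the extension's own parameters inside the 'x'
# parameter, hence the second parse_qs() pass. A hypothetical query
# 'prod=chromecrx&prodversion=48.0.2564.116&x=id%3Dabcdef%26v%3D1.9.4'
# therefore parses to ('1.9.4', 'chrome', '48.0').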


def parse_update_flag(query):
    return 'update' if query == 'update' else 'install'


def parse_record(line, ignored, geo, geov6):
    global log_regexp
    if log_regexp is None:
        log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "([^"]*)" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?')

    match = re.search(log_regexp, line)
    if not match:
        return None

    status = int(match.group(6))
    if status not in (200, 301, 302):
        return None

    info = {
        'status': status,
        'size': int(match.group(7)),
    }

    info['ip'], info['country'] = process_ip(match.group(1), geo, geov6)
    info['time'], info['month'], info['day'], info['weekday'], info['hour'] = parse_time(match.group(2), int(match.group(3)), int(match.group(4)))
    info['file'], info['query'] = parse_path(match.group(5))
    info['referrer'] = match.group(8)
    info['ua'], info['uaversion'] = parse_ua(match.group(9))
    info['fullua'] = '%s %s' % (info['ua'], info['uaversion'])
    info['clientid'] = match.group(10)

    # Additional metadata depends on the file type
    filename = os.path.basename(info['file'])
    ext = os.path.splitext(filename)[1]
    if ext == '.txt' or filename == 'update.json' or filename == 'notification.json':
        # Subscription downloads, libadblockplus update checks and notification
        # checks are performed by the downloader
        parse_downloader_query(info)
    elif ext == '.tpl':
        # MSIE TPL download, no additional data here
        pass
    elif ext in ('.xpi', '.crx', '.apk', '.msi', '.exe', '.safariextz'):
        # Package download, might be an update
        info['installType'] = parse_update_flag(info['query'])
    elif filename == 'update.rdf':
        # Gecko update check or a legacy Android update check. The latter doesn't
        # have usable data anyway so trying the Gecko route won't do any harm.
        info['addonName'] = parse_addon_name(info['file'])
        info['addonVersion'], info['application'], info['applicationVersion'] = parse_gecko_query(info['query'])
    elif filename == 'updates.xml':
        # Chrome update check
        info['addonName'] = parse_addon_name(info['file'])
        info['addonVersion'], info['application'], info['applicationVersion'] = parse_chrome_query(info['query'])
    elif filename == 'updates.plist':
        # Safari update check, no additional data
        pass
    else:
        ignored.add(info['file'])
        return None

    if 'addonName' in info:
        info['fullAddon'] = '%s %s' % (info['addonName'], info['addonVersion'])
    if 'application' in info:
        info['fullApplication'] = '%s %s' % (info['application'], info['applicationVersion'])
    if 'platform' in info:
        info['fullPlatform'] = '%s %s' % (info['platform'], info['platformVersion'])
    return info
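
# The pattern above matches an extended Apache/nginx combined log format. A
# hypothetical line that parse_record() accepts:
#   1.2.3.4 - - [01/Jan/2016:00:30:00 +0100] "GET /exceptionrules.txt?addonName=adblockplus&addonVersion=2.7&lastVersion=201512311830 HTTP/1.1" 200 12345 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:43.0) Gecko/20100101 Firefox/43.0"
# The trailing optional group captures the X-Client-ID value when the log
# format includes those extra fields.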


def add_record(info, section, ignore_fields=()):
    section['hits'] = section.get('hits', 0) + 1
    section['bandwidth'] = section.get('bandwidth', 0) + info['size']

    if len(ignore_fields) < 2:
        for field in map(lambda f: f['name'], common.fields):
            if field in ignore_fields or field not in info:
                continue

            value = info[field]
            if field not in section:
                section[field] = {}
            if value not in section[field]:
                section[field][value] = {}

            add_record(info, section[field][value], ignore_fields + (field,))
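
# add_record() recurses with the current field appended to ignore_fields, so
# counters are nested at most two field levels deep, e.g. (field names taken
# from common.fields) section['country']['de']['ua']['Firefox'] gets 'hits'
# and 'bandwidth' but no further breakdown below it.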


def parse_fileobj(mirror_name, fileobj, geo, geov6, ignored):
    data = {}
    for line in fileobj:
        info = parse_record(line, ignored, geo, geov6)
        if info is None:
            continue

        info['mirror'] = mirror_name
        if info['month'] not in data:
            data[info['month']] = {}
        section = data[info['month']]

        if info['file'] not in section:
            section[info['file']] = {}
        section = section[info['file']]

        add_record(info, section)
    return data


def merge_objects(object1, object2, factor=1):
    for key, value in object2.iteritems():
        try:
            key = unicode(key)
        except UnicodeDecodeError:
            key = unicode(key, encoding='latin-1')
        if isinstance(value, numbers.Number):
            object1[key] = object1.get(key, 0) + factor * value
        else:
            merge_objects(object1.setdefault(key, {}), value, factor)
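
# Numeric leaves are added with a sign factor, so the same data can be merged
# in and backed out again: merging {'hits': 2} into {'hits': 3} gives
# {'hits': 5}, while factor=-1 (the --revert option) gives {'hits': 1}.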


def save_stats(server_type, data, factor=1):
    base_dir = os.path.join(get_config().get('stats', 'dataDirectory'), common.filename_encode(server_type))
    for month, month_data in data.iteritems():
        for name, file_data in month_data.iteritems():
            path = os.path.join(base_dir, common.filename_encode(month), common.filename_encode(name + '.json'))
            if os.path.exists(path):
                with codecs.open(path, 'rb', encoding='utf-8') as fileobj:
                    existing = json.load(fileobj)
            else:
                existing = {}

            merge_objects(existing, file_data, factor)

            dir = os.path.dirname(path)
            try:
                os.makedirs(dir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

            with codecs.open(path, 'wb', encoding='utf-8') as fileobj:
                json.dump(existing, fileobj, indent=2, sort_keys=True)


def parse_source(factor, lock, (mirror_name, server_type, log_file)):
    try:
        geo = pygeoip.GeoIP(get_config().get('stats', 'geoip_db'), pygeoip.MEMORY_CACHE)
        geov6 = pygeoip.GeoIP(get_config().get('stats', 'geoipv6_db'), pygeoip.MEMORY_CACHE)

        ignored = set()
        fileobj = StatsFile(log_file)
        try:
            data = parse_fileobj(mirror_name, fileobj, geo, geov6, ignored)
        finally:
            fileobj.close()

        lock.acquire()
        try:
            save_stats(server_type, data, factor)
        finally:
            lock.release()
        return log_file, ignored
    except:
        print >>sys.stderr, "Unable to process log file '%s'" % log_file
        traceback.print_exc()
        return None, None


def parse_sources(sources, factor=1, verbose=False):
    pool = multiprocessing.Pool()
    lock = multiprocessing.Manager().Lock()
    callback = functools.partial(parse_source, factor, lock)
    try:
        for log_file, ignored in pool.imap_unordered(callback, sources, chunksize=1):
            if verbose and ignored:
                print 'Ignored files for %s' % log_file
                print '============================================================'
                print '\n'.join(sorted(ignored))
    finally:
        pool.close()


if __name__ == '__main__':
    setupStderr()

    parser = argparse.ArgumentParser(description='Processes log files and merges them into the stats database')
    parser.add_argument('--verbose', dest='verbose', action='store_const', const=True, default=False, help='Verbose mode, ignored requests will be listed')
    parser.add_argument('--revert', dest='factor', action='store_const', const=-1, default=1, help='Remove log data from the database')
    parser.add_argument('mirror_name', nargs='?', help='Name of the mirror server that the file belongs to')
    parser.add_argument('server_type', nargs='?', help='Server type like download, update or subscription')
    parser.add_argument('log_file', nargs='?', help='Log file path, can be a local file path, http:// or ssh:// URL')
    args = parser.parse_args()

    if args.mirror_name and args.server_type and args.log_file:
        sources = [(args.mirror_name, args.server_type, args.log_file)]
    else:
        sources = get_stats_files()
    parse_sources(sources, args.factor, args.verbose)