Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: sitescripts/stats/bin/logprocessor.py

Issue 29934561: #1537 - Remove stats processing from sitescripts (Closed) Base URL: https://hg.adblockplus.org/sitescripts
Patch Set: Created Nov. 2, 2018, 12:42 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 # This file is part of the Adblock Plus web scripts,
2 # Copyright (C) 2006-present eyeo GmbH
3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation.
7 #
8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
12 #
13 # You should have received a copy of the GNU General Public License
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
15
16 import argparse
17 import codecs
18 from collections import OrderedDict
19 from datetime import datetime, timedelta
20 import errno
21 import functools
22 import gzip
23 import json
24 import math
25 import multiprocessing
26 import numbers
27 import os
28 import re
29 import pygeoip
30 import socket
31 import subprocess
32 import sys
33 import traceback
34 import urllib
35 import urlparse
36
37 import sitescripts.stats.common as common
38 from sitescripts.utils import get_config, setupStderr
39
# Lazily initialized module-level state: parse_record() compiles the access
# log regular expression into log_regexp on first use, and parse_gecko_query()
# builds the application-ID lookup table in gecko_apps on first use.
log_regexp = None
gecko_apps = None
42
43
class StatsFile:
    """Readable stream over a log file that may be local, HTTP(S) or remote.

    An ssh:// URL streams the file by executing its path as a command on the
    remote host. Gzipped files are decompressed on the fly by piping through
    an external gzip process, because the built-in gzip module cannot stream
    from a pipe (only fixed in Python 3.2).
    """

    def __init__(self, path):
        self._inner_file = None
        self._processes = []

        parts = urlparse.urlparse(path)
        if parts.scheme == 'ssh' and parts.username and parts.hostname and parts.path:
            # Run the (slash-stripped) path as a command on the remote host,
            # with password prompts disabled so we fail fast without a key.
            command = [
                'ssh', '-q', '-o', 'NumberOfPasswordPrompts 0', '-T', '-k',
                '-l', parts.username,
                parts.hostname,
                parts.path.lstrip('/'),
            ]
            if parts.port:
                command[1:1] = ['-P', str(parts.port)]
            remote = subprocess.Popen(command, stdout=subprocess.PIPE)
            self._processes.append(remote)
            self._file = remote.stdout
        elif parts.scheme in ('http', 'https'):
            self._file = urllib.urlopen(path)
        elif os.path.exists(path):
            self._file = open(path, 'rb')
        else:
            raise IOError("Path '%s' not recognized" % path)

        if path.endswith('.gz'):
            # Built-in gzip module doesn't support streaming (fixed in Python 3.2)
            unzip = subprocess.Popen(['gzip', '-cd'], stdin=self._file,
                                     stdout=subprocess.PIPE)
            self._processes.append(unzip)
            self._file, self._inner_file = unzip.stdout, self._file

    def __getattr__(self, name):
        # Delegate everything else (read, readline, iteration, ...) to the
        # underlying stream object.
        return getattr(self._file, name)

    def close(self):
        """Close the stream(s) and wait for any helper processes to exit."""
        self._file.close()
        if self._inner_file:
            self._inner_file.close()
        for process in self._processes:
            process.wait()
84
85
def get_stats_files():
    """Yield one [mirror_name, server_type, log_file] entry for every
    mirror_* option in the [stats] configuration section.

    Malformed or missing options are reported on stderr and skipped.
    """
    config = get_config()

    prefix = 'mirror_'
    for option in config.options('stats'):
        if not option.startswith(prefix):
            continue
        if not config.has_option('stats', option):
            print >>sys.stderr, "Option '%s' not found in the configuration" % option
            continue
        value = config.get('stats', option)
        if ' ' not in value:
            print >>sys.stderr, "Option '%s' has invalid value: '%s'" % (option, value)
            continue
        # The option value is "<server_type> <log_file>".
        yield [option[len(prefix):]] + value.split(None, 1)
100
101
def cache_lru(func):
    """
    Decorator memoizing a single-argument function. The 1024 most recently
    used results are kept; the least recently used entry is evicted when the
    cache is full.
    """
    cache = OrderedDict()
    # Mutable one-element list so the nested function can update the counter
    # (no `nonlocal` in Python 2).
    free_slots = [1024]

    def wrapped(arg):
        try:
            value = cache.pop(arg)
        except KeyError:
            # Cache miss: make room, then compute.
            if free_slots[0] > 0:
                free_slots[0] -= 1
            else:
                cache.popitem(last=False)
            try:
                value = func(arg)
            except:
                # Undo the reservation so the failed call doesn't leak a slot.
                free_slots[0] += 1
                raise
        # (Re-)insert at the most-recently-used end.
        cache[arg] = value
        return value
    return wrapped
129
130
def cache_last(func):
    """
    Decorator memoizing only the most recent call: invoking the function
    again with the same positional arguments returns the cached result.
    """
    state = {'args': None, 'result': None}

    def wrapped(*args):
        if state['args'] != args:
            # Compute first, record the arguments only on success.
            state['result'] = func(*args)
            state['args'] = args
        return state['result']
    return wrapped
144
145
@cache_lru
def parse_ua(ua):
    """Extract a (browser name, version) pair from a User-Agent string.

    Checks are ordered so that browsers disguising themselves as others are
    recognized first; returns ('Other', '') when nothing matches.
    """
    # Opera might disguise itself as another browser so it needs to go first
    opera = re.search(r'\bOpera/([\d\.]+)', ua)
    if opera:
        # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA
        real = re.search(r'\bVersion/([\d\.]+)', ua)
        return ('Opera', real.group(1)) if real else ('Opera', opera.group(1))

    # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it
    opera = re.search(r'\bOPR/(\d+\.\d+)', ua)
    if opera:
        return 'Opera', opera.group(1)

    # These have to be checked before Firefox - they usually carry a Firefox
    # identifier as well.
    gecko_app = re.search(r'\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)', ua)
    if gecko_app:
        name = gecko_app.group(1)
        if name == 'Fennec':
            name = 'Firefox Mobile'
        return name, gecko_app.group(2)

    firefox = re.search(r'\bFirefox/(\d+\.\d+)', ua)
    if firefox:
        if re.search(r'\bMobile;', ua):
            return 'Firefox Mobile', firefox.group(1)
        if re.search(r'\bTablet;', ua):
            return 'Firefox Tablet', firefox.group(1)
        return 'Firefox', firefox.group(1)

    gecko = re.search(r'\brv:(\d+)\.(\d+)(?:\.(\d+))?', ua)
    if gecko and re.search(r'\bGecko/', ua):
        # Gecko 1.x used three-part version numbers.
        if gecko.group(3) and int(gecko.group(1)) < 2:
            return 'Gecko', '%s.%s.%s' % gecko.group(1, 2, 3)
        return 'Gecko', '%s.%s' % gecko.group(1, 2)

    coolnovo = re.search(r'\bCoolNovo/(\d+\.\d+\.\d+)', ua)
    if coolnovo:
        return 'CoolNovo', coolnovo.group(1)

    # Edge pretends to be Chrome, so it has to be checked first.
    edge = re.search(r'\bEdge/(\d+)\.\d+', ua)
    if edge:
        return 'Edge', edge.group(1)

    chrome = re.search(r'\bChrome/(\d+\.\d+)', ua)
    if chrome:
        return 'Chrome', chrome.group(1)

    safari = re.search(r'\bVersion/(\d+\.\d+)', ua)
    if safari:
        # "Mobile Safari/" also contains "Safari/", so test it first.
        if re.search(r'\bMobile Safari/', ua):
            return 'Mobile Safari', safari.group(1)
        if re.search(r'\bSafari/', ua):
            return 'Safari', safari.group(1)

    if re.search(r'\bAppleWebKit/', ua):
        return 'WebKit', ''

    msie = re.search(r'\bMSIE (\d+\.\d+)', ua)
    if msie:
        return 'MSIE', msie.group(1)

    trident = re.search(r'\bTrident/(\d+\.\d+)', ua)
    if trident:
        # IE 11 drops the MSIE token and reports an rv: version instead.
        rv = re.search(r'\brv:(\d+\.\d+)', ua)
        return ('MSIE', rv.group(1)) if rv else ('Trident', trident.group(1))

    android = re.search(r'\bAndroidDownloadManager(?:/(\d+\.\d+))?', ua)
    if android:
        return 'Android', android.group(1) or ''

    dalvik = re.search(r'\bDalvik/.*\bAndroid (\d+\.\d+)', ua)
    if dalvik:
        return 'Android', dalvik.group(1)

    # ABP/Android downloads use that user agent
    if ua.startswith('Apache-HttpClient/UNAVAILABLE'):
        return 'Android', ''

    # ABP/IE downloads use that user agent
    if ua == 'Adblock Plus':
        return 'ABP', ''

    return 'Other', ''
237
238
def process_ip(ip, geo, geov6):
    """Resolve a client address to (ip, lowercase ISO country code).

    IPv4-mapped IPv6 addresses are unwrapped to plain IPv4 first. Lookup
    errors and unknown countries both yield 'unknown'.
    """
    mapped = re.match(r'::ffff:(\d+\.\d+\.\d+\.\d+)$', ip)
    if mapped:
        ip = mapped.group(1)

    try:
        # A colon can only remain for genuine IPv6 addresses.
        database = geov6 if ':' in ip else geo
        country = database.country_code_by_addr(ip)
    except:
        traceback.print_exc()
        country = ''

    if country in (None, '', '--'):
        country = 'unknown'
    return ip, country.lower()
258
259
@cache_last
def parse_time(timestr, tz_hours, tz_minutes):
    """Parse an Apache log timestamp plus numeric timezone offset into UTC.

    Returns (datetime, 'YYYYMM' string, day of month, weekday, hour).
    NOTE(review): math.copysign(tz_minutes, tz_hours) loses the sign for
    offsets like -00:30 because int('-00') == 0 — confirm such offsets never
    appear in practice.
    """
    parsed = datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S')
    parsed -= timedelta(hours=tz_hours,
                        minutes=math.copysign(tz_minutes, tz_hours))
    return (parsed, parsed.strftime('%Y%m'), parsed.day,
            parsed.weekday(), parsed.hour)
265
266
@cache_lru
def parse_path(path):
    """Split a request path into (file path without leading slash, query).

    The path component is percent-decoded and interpreted as UTF-8; if that
    fails it is kept verbatim.
    """
    urlparts = urlparse.urlparse(path)
    try:
        decoded = urllib.unquote(urlparts.path).decode('utf-8')
    except:
        decoded = urlparts.path
    return decoded[1:], urlparts.query
275
276
@cache_lru
def parse_query(query):
    # Parse a URL query string into a dict mapping parameter names to lists
    # of values.
    return urlparse.parse_qs(query)
280
281
@cache_lru
def parse_lastversion(last_version):
    """Convert a lastVersion parameter ('YYYYMMDDHHMM', optionally followed
    by a '-suffix') into a datetime. Raises ValueError on malformed input."""
    timestamp = last_version.partition('-')[0]
    return datetime.strptime(timestamp, '%Y%m%d%H%M')
287
288
@cache_lru
def get_week(date):
    # Return the (ISO year, ISO week number) pair identifying date's week.
    return date.isocalendar()[0:2]
292
293
def parse_downloader_query(info):
    """Annotate a downloader request (filter list, update or notification
    check) with the query parameters sent by the Adblock Plus downloader,
    deriving download-interval and first-download flags from lastVersion."""
    params = parse_query(info['query'])
    for param in ('addonName', 'addonVersion', 'application',
                  'applicationVersion', 'platform', 'platformVersion'):
        info[param] = params.get(param, ['unknown'])[0]

    # Only leave the major and minor release number for application and platform
    for key in ('applicationVersion', 'platformVersion'):
        info[key] = re.sub(r'^(\d+\.\d+).*', r'\1', info[key])

    # Chrome Adblock sends an X-Client-ID header instead of URL parameters
    clientid = info['clientid']
    match = re.match(r'^adblock/([\d\.]+)$', clientid, re.I) if clientid else None
    if match:
        info['addonName'] = 'chromeadblock'
        info['addonVersion'] = match.group(1)

    last_version = params.get('lastVersion', ['unknown'])[0]
    if info['file'] == 'notification.json' and last_version == '0' and (
        (info['addonName'] == 'adblockplus' and info['addonVersion'] == '2.3.1') or
        (info['addonName'] in ('adblockpluschrome', 'adblockplusopera') and
         info['addonVersion'] == '1.5.2')
    ):
        # Broken notification version number in these releases, treat like unknown
        last_version = 'unknown'

    if last_version == 'unknown':
        info['downloadInterval'] = 'unknown'
        info['previousDownload'] = 'unknown'
    elif last_version == '0':
        # '0' is sent on the very first download after installation.
        info['downloadInterval'] = 'unknown'
        info['previousDownload'] = 'unknown'
        info['firstDownload'] = True
    else:
        try:
            last_update = parse_lastversion(last_version)
            diff = info['time'] - last_update
            if diff.days >= 365:
                info['downloadInterval'] = '%i year(s)' % (diff.days / 365)
            elif diff.days >= 30:
                info['downloadInterval'] = '%i month(s)' % (diff.days / 30)
            elif diff.days >= 1:
                info['downloadInterval'] = '%i day(s)' % diff.days
            else:
                info['downloadInterval'] = '%i hour(s)' % (diff.seconds / 3600)

            if info['addonName'].startswith('adblockplus'):
                diffdays = (info['time'].date() - last_update.date()).days
                if diffdays == 0:
                    info['previousDownload'] = 'same day'
                elif diffdays < 30:
                    info['previousDownload'] = '%i day(s)' % diffdays
                elif diffdays < 365:
                    info['previousDownload'] = '%i month(s)' % (diffdays / 30)
                else:
                    info['previousDownload'] = '%i year(s)' % (diffdays / 365)
            else:
                info['previousDownload'] = 'unknown'

            if (last_update.year != info['time'].year or
                    last_update.month != info['time'].month):
                info['firstInMonth'] = info['firstInDay'] = True
            elif last_update.day != info['time'].day:
                info['firstInDay'] = True

            if get_week(last_update) != get_week(info['time']):
                info['firstInWeek'] = True
        except ValueError:
            # Malformed lastVersion parameter
            info['downloadInterval'] = 'unknown'
            info['previousDownload'] = 'unknown'
361
362
def parse_addon_name(file):
    """Return the add-on name encoded in the request path (the second-to-last
    path component), or None when the path has no directory part."""
    parts = file.split('/')
    if len(parts) < 2:
        return None
    return parts[-2]
368
369
def parse_gecko_query(query):
    """Parse a Gecko update-check query string.

    Returns (addon version, application name, application version); any value
    missing from the query becomes 'unknown'. The application version is
    trimmed to its major.minor part.
    """
    params = urlparse.parse_qs(query)

    version = params.get('version', ['unknown'])[0]

    global gecko_apps
    if gecko_apps is None:
        # Lazily build a reverse map from application ID to application name;
        # the import is deferred because buildtools is only needed here.
        from buildtools.packagerGecko import KNOWN_APPS
        gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()}
    appID = params.get('appID', ['unknown'])[0]

    application = gecko_apps.get(appID, 'unknown')
    applicationVersion = params.get('appVersion', ['unknown'])[0]

    # Only leave the major and minor release number for application
    applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion)

    return version, application, applicationVersion
388
389
def parse_chrome_query(query):
    """Parse a Chrome update-check query string.

    Returns (addon version, application name, application version); values
    missing from the query become 'unknown'. The application version is
    trimmed to its major.minor part.
    """
    params = urlparse.parse_qs(query)

    prod = params.get('prod', ['unknown'])[0]
    application = 'chrome' if prod in ('chromecrx', 'chromiumcrx') else 'unknown'
    applicationVersion = params.get('prodversion', ['unknown'])[0]

    # The extension's own data is nested inside the 'x' parameter.
    inner = urlparse.parse_qs(params.get('x', [''])[0])
    version = inner.get('v', ['unknown'])[0]

    # Only leave the major and minor release number for application
    applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion)

    return version, application, applicationVersion
406
407
def parse_update_flag(query):
    """Classify a package download as an 'update' or a fresh 'install'."""
    if query == 'update':
        return 'update'
    return 'install'
410
411
def parse_record(line, ignored, geo, geov6):
    """Parse one access-log line into an info dictionary.

    Returns None when the line doesn't match the log format, the status
    isn't a success/redirect, or the requested file is of no interest
    (such files are collected in the `ignored` set).
    """
    global log_regexp
    if log_regexp is None:
        log_regexp = re.compile(
            r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] '
            r'"GET ([^"\s]+) [^"]+" (\d+) (\d+) "([^"]*)" "([^"]*)"'
            r'(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?')

    match = log_regexp.search(line)
    if not match:
        return None

    status = int(match.group(6))
    if status not in (200, 301, 302):
        return None

    info = {
        'status': status,
        'size': int(match.group(7)),
    }

    info['ip'], info['country'] = process_ip(match.group(1), geo, geov6)
    info['time'], info['month'], info['day'], info['weekday'], info['hour'] = \
        parse_time(match.group(2), int(match.group(3)), int(match.group(4)))
    info['file'], info['query'] = parse_path(match.group(5))
    info['referrer'] = match.group(8)
    info['ua'], info['uaversion'] = parse_ua(match.group(9))
    info['fullua'] = '%s %s' % (info['ua'], info['uaversion'])
    info['clientid'] = match.group(10)

    # Additional metadata depends on file type
    filename = os.path.basename(info['file'])
    ext = os.path.splitext(filename)[1]
    if ext == '.txt' or filename in ('update.json', 'notification.json'):
        # Subscription downloads, libadblockplus update checks and notification
        # checks are performed by the downloader
        parse_downloader_query(info)
    elif ext == '.tpl':
        # MSIE TPL download, no additional data here
        pass
    elif ext in ('.xpi', '.crx', '.apk', '.msi', '.exe', '.safariextz'):
        # Package download, might be an update
        info['installType'] = parse_update_flag(info['query'])
    elif filename == 'update.rdf':
        # Gecko update check or a legacy Android update check. The latter
        # doesn't have usable data anyway so trying the Chrome route won't
        # do any harm.
        info['addonName'] = parse_addon_name(info['file'])
        info['addonVersion'], info['application'], info['applicationVersion'] = \
            parse_gecko_query(info['query'])
    elif filename == 'updates.xml':
        # Chrome update check
        info['addonName'] = parse_addon_name(info['file'])
        info['addonVersion'], info['application'], info['applicationVersion'] = \
            parse_chrome_query(info['query'])
    elif filename == 'updates.plist':
        # Safari update check, no additional data
        pass
    else:
        ignored.add(info['file'])
        return None

    if 'addonName' in info:
        info['fullAddon'] = '%s %s' % (info['addonName'], info['addonVersion'])
    if 'application' in info:
        info['fullApplication'] = '%s %s' % (info['application'], info['applicationVersion'])
    if 'platform' in info:
        info['fullPlatform'] = '%s %s' % (info['platform'], info['platformVersion'])
    return info
474
475
def add_record(info, section, ignore_fields=()):
    """Accumulate one request into the nested stats structure `section`.

    Hits and bandwidth are counted on the current level; the record is then
    recursively filed under each known field's value. Fields already used on
    the way down are skipped, and nesting stops after two levels.
    """
    section['hits'] = section.get('hits', 0) + 1
    section['bandwidth'] = section.get('bandwidth', 0) + info['size']

    if len(ignore_fields) >= 2:
        return

    for field in (f['name'] for f in common.fields):
        if field in ignore_fields or field not in info:
            continue
        value = info[field]
        subsection = section.setdefault(field, {}).setdefault(value, {})
        add_record(info, subsection, ignore_fields + (field,))
492
493
def parse_fileobj(mirror_name, fileobj, geo, geov6, ignored):
    """Parse every log line read from fileobj into a nested stats dictionary
    keyed first by month ('YYYYMM'), then by requested file."""
    data = {}
    for line in fileobj:
        info = parse_record(line, ignored, geo, geov6)
        if info is None:
            continue

        info['mirror'] = mirror_name
        month_section = data.setdefault(info['month'], {})
        file_section = month_section.setdefault(info['file'], {})
        add_record(info, file_section)
    return data
512
513
def merge_objects(object1, object2, factor=1):
    """Recursively fold the counters of object2 into object1.

    Numeric leaves are added after multiplication with factor (pass
    factor=-1 to subtract, i.e. revert previously merged log data). Keys are
    normalized to unicode, decoding byte strings as latin-1 when they aren't
    valid ASCII.
    """
    for raw_key, value in object2.iteritems():
        try:
            key = unicode(raw_key)
        except UnicodeDecodeError:
            key = unicode(raw_key, encoding='latin-1')
        if isinstance(value, numbers.Number):
            object1[key] = object1.get(key, 0) + factor * value
        else:
            merge_objects(object1.setdefault(key, {}), value, factor)
524
525
def save_stats(server_type, data, factor=1):
    """Merge parsed log data into the on-disk JSON stats database.

    Each (month, file) combination is stored as one JSON file beneath the
    configured data directory; factor=-1 removes previously merged data.
    """
    base_dir = os.path.join(get_config().get('stats', 'dataDirectory'),
                            common.filename_encode(server_type))
    for month, month_data in data.iteritems():
        for name, file_data in month_data.iteritems():
            path = os.path.join(base_dir, common.filename_encode(month),
                                common.filename_encode(name + '.json'))
            if os.path.exists(path):
                with codecs.open(path, 'rb', encoding='utf-8') as fileobj:
                    existing = json.load(fileobj)
            else:
                existing = {}

            merge_objects(existing, file_data, factor)

            try:
                os.makedirs(os.path.dirname(path))
            except OSError as e:
                # An already-existing directory is fine, anything else isn't.
                if e.errno != errno.EEXIST:
                    raise

            with codecs.open(path, 'wb', encoding='utf-8') as fileobj:
                json.dump(existing, fileobj, indent=2, sort_keys=True)
548
549
def parse_source(factor, lock, source):
    """Process one (mirror_name, server_type, log_file) source.

    Parses the log file and merges the result into the stats database while
    holding `lock`. Returns (log_file, set of ignored files), or (None, None)
    when processing fails — errors are reported rather than raised so a pool
    worker never dies.
    """
    mirror_name, server_type, log_file = source
    try:
        geo = pygeoip.GeoIP(get_config().get('stats', 'geoip_db'),
                            pygeoip.MEMORY_CACHE)
        geov6 = pygeoip.GeoIP(get_config().get('stats', 'geoipv6_db'),
                              pygeoip.MEMORY_CACHE)

        ignored = set()
        fileobj = StatsFile(log_file)
        try:
            data = parse_fileobj(mirror_name, fileobj, geo, geov6, ignored)
        finally:
            fileobj.close()

        lock.acquire()
        try:
            save_stats(server_type, data, factor)
        finally:
            lock.release()
        return log_file, ignored
    except:
        print >>sys.stderr, "Unable to process log file '%s'" % log_file
        traceback.print_exc()
        return None, None
572
573
574 def parse_sources(sources, factor=1, verbose=False):
575 pool = multiprocessing.Pool()
576 lock = multiprocessing.Manager().Lock()
577 callback = functools.partial(parse_source, factor, lock)
578 try:
579 for log_file, ignored in pool.imap_unordered(callback, sources, chunksiz e=1):
580 if verbose and ignored:
581 print 'Ignored files for %s' % log_file
582 print '========================================================= ==='
583 print '\n'.join(sorted(ignored))
584 finally:
585 pool.close()
586
587
if __name__ == '__main__':
    setupStderr()

    parser = argparse.ArgumentParser(description='Processes log files and merges them into the stats database')
    parser.add_argument('--verbose', dest='verbose', action='store_const',
                        const=True, default=False,
                        help='Verbose mode, ignored requests will be listed')
    parser.add_argument('--revert', dest='factor', action='store_const',
                        const=-1, default=1,
                        help='Remove log data from the database')
    parser.add_argument('mirror_name', nargs='?',
                        help='Name of the mirror server that the file belongs to')
    parser.add_argument('server_type', nargs='?',
                        help='Server type like download, update or subscription')
    parser.add_argument('log_file', nargs='?',
                        help='Log file path, can be a local file path, http:// or ssh:// URL')
    args = parser.parse_args()

    # A fully specified source on the command line overrides the configured
    # mirror list.
    if args.mirror_name and args.server_type and args.log_file:
        sources = [(args.mirror_name, args.server_type, args.log_file)]
    else:
        sources = get_stats_files()
    parse_sources(sources, args.factor, args.verbose)
OLDNEW

Powered by Google App Engine
This is Rietveld