# This file is part of the Adblock Plus web scripts,
# Copyright (C) 2006-present eyeo GmbH
#
# Adblock Plus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as
# published by the Free Software Foundation.
#
# Adblock Plus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Adblock Plus.  If not, see <http://www.gnu.org/licenses/>.

import argparse
import codecs
from collections import OrderedDict
from datetime import datetime, timedelta
import errno
import functools
import gzip
import json
import math
import multiprocessing
import numbers
import os
import re
import pygeoip
import socket
import subprocess
import sys
import traceback
import urllib
import urlparse

import sitescripts.stats.common as common
from sitescripts.utils import get_config, setupStderr

log_regexp = None
gecko_apps = None


class StatsFile:
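    """
      File-like wrapper for reading a log file from different sources.

      Supported paths: local file paths, http:// and https:// URLs, and
      ssh://user@host/path URLs (the remote file is streamed through an ssh
      subprocess). Paths ending in .gz are transparently decompressed via an
      external gzip process.
    """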
    def __init__(self, path):
        self._inner_file = None
        self._processes = []

        parseresult = urlparse.urlparse(path)
        if parseresult.scheme == 'ssh' and parseresult.username and parseresult.hostname and parseresult.path:
            command = [
                'ssh', '-q', '-o', 'NumberOfPasswordPrompts 0', '-T', '-k',
                '-l', parseresult.username,
                parseresult.hostname,
                parseresult.path.lstrip('/'),
            ]
            if parseresult.port:
                command[1:1] = ['-p', str(parseresult.port)]
            ssh_process = subprocess.Popen(command, stdout=subprocess.PIPE)
            self._processes.append(ssh_process)
            self._file = ssh_process.stdout
        elif parseresult.scheme in ('http', 'https'):
            self._file = urllib.urlopen(path)
        elif os.path.exists(path):
            self._file = open(path, 'rb')
        else:
            raise IOError("Path '%s' not recognized" % path)

        if path.endswith('.gz'):
            # Built-in gzip module doesn't support streaming (fixed in Python 3.2)
            gzip_process = subprocess.Popen(['gzip', '-cd'], stdin=self._file, stdout=subprocess.PIPE)
            self._processes.append(gzip_process)
            self._file, self._inner_file = gzip_process.stdout, self._file

    def __getattr__(self, name):
        return getattr(self._file, name)

    def close(self):
        self._file.close()
        if self._inner_file:
            self._inner_file.close()
        for process in self._processes:
            process.wait()


def get_stats_files():
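    """
      Yields [mirror_name, server_type, log_file] lists for every mirror_*
      option found in the [stats] section of the configuration.
    """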
    config = get_config()

    prefix = 'mirror_'
    options = filter(lambda o: o.startswith(prefix), config.options('stats'))
    for option in options:
        if config.has_option('stats', option):
            value = config.get('stats', option)
            if ' ' in value:
                yield [option[len(prefix):]] + value.split(None, 1)
            else:
                print >>sys.stderr, "Option '%s' has invalid value: '%s'" % (option, value)
        else:
            print >>sys.stderr, "Option '%s' not found in the configuration" % option


def cache_lru(func):
    """
      Decorator that memoizes the return values of a single-parameter function
      in case it is called again with the same parameter. The 1024 most recent
      results are saved.
    """

    results = OrderedDict()
    results.entries_left = 1024

    def wrapped(arg):
        if arg in results:
            result = results[arg]
            del results[arg]
        else:
            if results.entries_left > 0:
                results.entries_left -= 1
            else:
                results.popitem(last=False)
            try:
                result = func(arg)
            except:
                results.entries_left += 1
                raise
        results[arg] = result
        return result
    return wrapped


def cache_last(func):
    """
      Decorator that memoizes the last return value of a function in case it is
      called again with the same parameters.
    """
    result = {'args': None, 'result': None}

    def wrapped(*args):
        if args != result['args']:
            result['result'] = func(*args)
            result['args'] = args
        return result['result']
    return wrapped


@cache_lru
def parse_ua(ua):
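    """
      Maps a raw User-Agent string to a (browser, version) tuple, e.g.
      'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
      yields ('Firefox', '45.0'). Unrecognized agents yield ('Other', '').
    """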
    # Opera might disguise itself as another browser so it needs to go first
    match = re.search(r'\bOpera/([\d\.]+)', ua)
    if match:
        # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA
        match2 = re.search(r'\bVersion/([\d\.]+)', ua)
        if match2:
            return 'Opera', match2.group(1)
        else:
            return 'Opera', match.group(1)

    # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it
    match = re.search(r'\bOPR/(\d+\.\d+)', ua)
    if match:
        return 'Opera', match.group(1)

    # Have to check for these before Firefox; they will usually have a Firefox identifier as well
    match = re.search(r'\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)', ua)
    if match:
        if match.group(1) == 'Fennec':
            return 'Firefox Mobile', match.group(2)
        else:
            return match.group(1), match.group(2)

    match = re.search(r'\bFirefox/(\d+\.\d+)', ua)
    if match:
        if re.search(r'\bMobile;', ua):
            return 'Firefox Mobile', match.group(1)
        elif re.search(r'\bTablet;', ua):
            return 'Firefox Tablet', match.group(1)
        else:
            return 'Firefox', match.group(1)

    match = re.search(r'\brv:(\d+)\.(\d+)(?:\.(\d+))?', ua)
    if match and re.search(r'\bGecko/', ua):
        if match.group(3) and int(match.group(1)) < 2:
            return 'Gecko', '%s.%s.%s' % (match.group(1), match.group(2), match.group(3))
        else:
            return 'Gecko', '%s.%s' % (match.group(1), match.group(2))

    match = re.search(r'\bCoolNovo/(\d+\.\d+\.\d+)', ua)
    if match:
        return 'CoolNovo', match.group(1)

    match = re.search(r'\bEdge/(\d+)\.\d+', ua)
    if match:
        return 'Edge', match.group(1)

    match = re.search(r'\bChrome/(\d+\.\d+)', ua)
    if match:
        return 'Chrome', match.group(1)

    match = re.search(r'\bVersion/(\d+\.\d+)', ua)
    if match and re.search(r'\bMobile Safari/', ua):
        return 'Mobile Safari', match.group(1)
    if match and re.search(r'\bSafari/', ua):
        return 'Safari', match.group(1)

    if re.search(r'\bAppleWebKit/', ua):
        return 'WebKit', ''

    match = re.search(r'\bMSIE (\d+\.\d+)', ua)
    if match:
        return 'MSIE', match.group(1)

    match = re.search(r'\bTrident/(\d+\.\d+)', ua)
    if match:
        match2 = re.search(r'\brv:(\d+\.\d+)', ua)
        if match2:
            return 'MSIE', match2.group(1)
        else:
            return 'Trident', match.group(1)

    match = re.search(r'\bAndroidDownloadManager(?:/(\d+\.\d+))?', ua)
    if match:
        return 'Android', match.group(1) or ''

    match = re.search(r'\bDalvik/.*\bAndroid (\d+\.\d+)', ua)
    if match:
        return 'Android', match.group(1)

    # ABP/Android downloads use this user agent
    if ua.startswith('Apache-HttpClient/UNAVAILABLE'):
        return 'Android', ''

    # ABP/IE downloads use this user agent
    if ua == 'Adblock Plus':
        return 'ABP', ''

    return 'Other', ''


def process_ip(ip, geo, geov6):
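    """
      Normalizes an IP address (strips the ::ffff: IPv4-mapped prefix) and
      resolves it to a lower-case country code via GeoIP, falling back to
      'unknown'. Returns an (ip, country) tuple.
    """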
    match = re.search(r'^::ffff:(\d+\.\d+\.\d+\.\d+)$', ip)
    if match:
        ip = match.group(1)

    try:
        if ':' in ip:
            country = geov6.country_code_by_addr(ip)
        else:
            country = geo.country_code_by_addr(ip)
    except:
        traceback.print_exc()
        country = ''

    if country in (None, '', '--'):
        country = 'unknown'
    country = country.lower()

    return ip, country


@cache_last
def parse_time(timestr, tz_hours, tz_minutes):
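    """
      Parses an Apache log timestamp like '25/Mar/2016:12:34:56' and converts
      it to UTC by subtracting the timezone offset (the minutes take the sign
      of the hours). Returns (datetime, month string, day, weekday, hour).
    """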
    result = datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S')
    result -= timedelta(hours=tz_hours, minutes=math.copysign(tz_minutes, tz_hours))
    return result, result.strftime('%Y%m'), result.day, result.weekday(), result.hour


@cache_lru
def parse_path(path):
    urlparts = urlparse.urlparse(path)
    try:
        path = urllib.unquote(urlparts.path).decode('utf-8')
    except:
        path = urlparts.path
    return path[1:], urlparts.query


@cache_lru
def parse_query(query):
    return urlparse.parse_qs(query)


@cache_lru
def parse_lastversion(last_version):
    if '-' in last_version:
        last_version = last_version.split('-', 1)[0]
    return datetime.strptime(last_version, '%Y%m%d%H%M')


@cache_lru
def get_week(date):
    return date.isocalendar()[0:2]


def parse_downloader_query(info):
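    """
      Extracts downloader URL parameters (addonName, addonVersion, application
      etc.) into info and derives download statistics from the lastVersion
      parameter: downloadInterval, previousDownload and the firstDownload/
      firstInDay/firstInWeek/firstInMonth flags.
    """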
    params = parse_query(info['query'])
    for param in ('addonName', 'addonVersion', 'application', 'applicationVersion', 'platform', 'platformVersion'):
        info[param] = params.get(param, ['unknown'])[0]

    # Only leave the major and minor release number for application and platform
    info['applicationVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['applicationVersion'])
    info['platformVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['platformVersion'])

    # Chrome Adblock sends an X-Client-ID header instead of URL parameters
    match = re.match(r'^adblock/([\d\.]+)$', info['clientid'], re.I) if info['clientid'] else None
    if match:
        info['addonName'] = 'chromeadblock'
        info['addonVersion'] = match.group(1)

    last_version = params.get('lastVersion', ['unknown'])[0]
    if info['file'] == 'notification.json' and last_version == '0' and (
        (info['addonName'] == 'adblockplus' and info['addonVersion'] == '2.3.1') or
        (info['addonName'] in ('adblockpluschrome', 'adblockplusopera') and info['addonVersion'] == '1.5.2')
    ):
        # Broken notification version number in these releases, treat like unknown
        last_version = 'unknown'

    if last_version == 'unknown':
        info['downloadInterval'] = 'unknown'
        info['previousDownload'] = 'unknown'
    elif last_version == '0':
        info['downloadInterval'] = 'unknown'
        info['previousDownload'] = 'unknown'
        info['firstDownload'] = True
    else:
        try:
            last_update = parse_lastversion(last_version)
            diff = info['time'] - last_update
            if diff.days >= 365:
                info['downloadInterval'] = '%i year(s)' % (diff.days / 365)
            elif diff.days >= 30:
                info['downloadInterval'] = '%i month(s)' % (diff.days / 30)
            elif diff.days >= 1:
                info['downloadInterval'] = '%i day(s)' % diff.days
            else:
                info['downloadInterval'] = '%i hour(s)' % (diff.seconds / 3600)

            if info['addonName'].startswith('adblockplus'):
                diffdays = (info['time'].date() - last_update.date()).days
                if diffdays == 0:
                    info['previousDownload'] = 'same day'
                elif diffdays < 30:
                    info['previousDownload'] = '%i day(s)' % diffdays
                elif diffdays < 365:
                    info['previousDownload'] = '%i month(s)' % (diffdays / 30)
                else:
                    info['previousDownload'] = '%i year(s)' % (diffdays / 365)
            else:
                info['previousDownload'] = 'unknown'

            if last_update.year != info['time'].year or last_update.month != info['time'].month:
                info['firstInMonth'] = info['firstInDay'] = True
            elif last_update.day != info['time'].day:
                info['firstInDay'] = True

            if get_week(last_update) != get_week(info['time']):
                info['firstInWeek'] = True
        except ValueError:
            info['downloadInterval'] = 'unknown'
            info['previousDownload'] = 'unknown'


def parse_addon_name(file):
    if '/' in file:
        return file.split('/')[-2]
    else:
        return None


def parse_gecko_query(query):
    params = urlparse.parse_qs(query)

    version = params.get('version', ['unknown'])[0]

    global gecko_apps
    if gecko_apps is None:
        from buildtools.packagerGecko import KNOWN_APPS
        gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()}
    appID = params.get('appID', ['unknown'])[0]

    application = gecko_apps.get(appID, 'unknown')
    applicationVersion = params.get('appVersion', ['unknown'])[0]

    # Only leave the major and minor release number for application
    applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion)

    return version, application, applicationVersion


def parse_chrome_query(query):
    params = urlparse.parse_qs(query)

    if params.get('prod', ['unknown'])[0] in ('chromecrx', 'chromiumcrx'):
        application = 'chrome'
    else:
        application = 'unknown'
    applicationVersion = params.get('prodversion', ['unknown'])[0]

    params2 = urlparse.parse_qs(params.get('x', [''])[0])
    version = params2.get('v', ['unknown'])[0]

    # Only leave the major and minor release number for application
    applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion)

    return version, application, applicationVersion


def parse_update_flag(query):
    return 'update' if query == 'update' else 'install'


def parse_record(line, ignored, geo, geov6):
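    """
      Parses a single access log line into an info dictionary, or returns None
      for lines that should be skipped: unparseable lines, responses with a
      status other than 200/301/302, and files we don't collect statistics for
      (the latter are recorded in the ignored set).
    """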
    global log_regexp
    if log_regexp is None:
        log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "([^"]*)" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?')

    match = re.search(log_regexp, line)
    if not match:
        return None

    status = int(match.group(6))
    if status not in (200, 301, 302):
        return None

    info = {
        'status': status,
        'size': int(match.group(7)),
    }

    info['ip'], info['country'] = process_ip(match.group(1), geo, geov6)
    info['time'], info['month'], info['day'], info['weekday'], info['hour'] = parse_time(match.group(2), int(match.group(3)), int(match.group(4)))
    info['file'], info['query'] = parse_path(match.group(5))
    info['referrer'] = match.group(8)
    info['ua'], info['uaversion'] = parse_ua(match.group(9))
    info['fullua'] = '%s %s' % (info['ua'], info['uaversion'])
    info['clientid'] = match.group(10)

    # Additional metadata depends on file type
    filename = os.path.basename(info['file'])
    ext = os.path.splitext(filename)[1]
    if ext == '.txt' or filename == 'update.json' or filename == 'notification.json':
        # Subscription downloads, libadblockplus update checks and notification
        # checks are performed by the downloader
        parse_downloader_query(info)
    elif ext == '.tpl':
        # MSIE TPL download, no additional data here
        pass
    elif ext in ('.xpi', '.crx', '.apk', '.msi', '.exe', '.safariextz'):
        # Package download, might be an update
        info['installType'] = parse_update_flag(info['query'])
    elif filename == 'update.rdf':
        # Gecko update check or a legacy Android update check. The latter doesn't
        # have usable data anyway so trying the Gecko route won't do any harm.
        info['addonName'] = parse_addon_name(info['file'])
        info['addonVersion'], info['application'], info['applicationVersion'] = parse_gecko_query(info['query'])
    elif filename == 'updates.xml':
        # Chrome update check
        info['addonName'] = parse_addon_name(info['file'])
        info['addonVersion'], info['application'], info['applicationVersion'] = parse_chrome_query(info['query'])
    elif filename == 'updates.plist':
        # Safari update check, no additional data
        pass
    else:
        ignored.add(info['file'])
        return None

    if 'addonName' in info:
        info['fullAddon'] = '%s %s' % (info['addonName'], info['addonVersion'])
    if 'application' in info:
        info['fullApplication'] = '%s %s' % (info['application'], info['applicationVersion'])
    if 'platform' in info:
        info['fullPlatform'] = '%s %s' % (info['platform'], info['platformVersion'])
    return info


def add_record(info, section, ignore_fields=()):
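    """
      Adds the hits and bandwidth of a request to a section of the statistics
      data, then recurses into per-field subsections (at most two fields deep)
      so the data can later be broken down by any combination of up to two
      fields.
    """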
    section['hits'] = section.get('hits', 0) + 1
    section['bandwidth'] = section.get('bandwidth', 0) + info['size']

    if len(ignore_fields) < 2:
        for field in map(lambda f: f['name'], common.fields):
            if field in ignore_fields or field not in info:
                continue

            value = info[field]
            if field not in section:
                section[field] = {}
            if value not in section[field]:
                section[field][value] = {}

            add_record(info, section[field][value], ignore_fields + (field,))


def parse_fileobj(mirror_name, fileobj, geo, geov6, ignored):
    data = {}
    for line in fileobj:
        info = parse_record(line, ignored, geo, geov6)
        if info is None:
            continue

        info['mirror'] = mirror_name
        if info['month'] not in data:
            data[info['month']] = {}
        section = data[info['month']]

        if info['file'] not in section:
            section[info['file']] = {}
        section = section[info['file']]

        add_record(info, section)
    return data


def merge_objects(object1, object2, factor=1):
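    """
      Recursively merges object2 into object1: numeric leaves are added after
      being multiplied by factor (factor=-1 subtracts, reverting previously
      merged data), nested dictionaries are merged key by key.
    """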
    for key, value in object2.iteritems():
        try:
            key = unicode(key)
        except UnicodeDecodeError:
            key = unicode(key, encoding='latin-1')
        if isinstance(value, numbers.Number):
            object1[key] = object1.get(key, 0) + factor * value
        else:
            merge_objects(object1.setdefault(key, {}), value, factor)


def save_stats(server_type, data, factor=1):
    base_dir = os.path.join(get_config().get('stats', 'dataDirectory'), common.filename_encode(server_type))
    for month, month_data in data.iteritems():
        for name, file_data in month_data.iteritems():
            path = os.path.join(base_dir, common.filename_encode(month), common.filename_encode(name + '.json'))
            if os.path.exists(path):
                with codecs.open(path, 'rb', encoding='utf-8') as fileobj:
                    existing = json.load(fileobj)
            else:
                existing = {}

            merge_objects(existing, file_data, factor)

            dir = os.path.dirname(path)
            try:
                os.makedirs(dir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

            with codecs.open(path, 'wb', encoding='utf-8') as fileobj:
                json.dump(existing, fileobj, indent=2, sort_keys=True)


def parse_source(factor, lock, (mirror_name, server_type, log_file)):
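    """
      Worker function run in the multiprocessing pool: parses one log file and
      merges the result into the on-disk statistics (writes are serialized via
      lock). Returns (log_file, ignored file set), or (None, None) on failure.
    """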
    try:
        geo = pygeoip.GeoIP(get_config().get('stats', 'geoip_db'), pygeoip.MEMORY_CACHE)
        geov6 = pygeoip.GeoIP(get_config().get('stats', 'geoipv6_db'), pygeoip.MEMORY_CACHE)

        ignored = set()
        fileobj = StatsFile(log_file)
        try:
            data = parse_fileobj(mirror_name, fileobj, geo, geov6, ignored)
        finally:
            fileobj.close()

        lock.acquire()
        try:
            save_stats(server_type, data, factor)
        finally:
            lock.release()
        return log_file, ignored
    except:
        print >>sys.stderr, "Unable to process log file '%s'" % log_file
        traceback.print_exc()
        return None, None


def parse_sources(sources, factor=1, verbose=False):
    pool = multiprocessing.Pool()
    lock = multiprocessing.Manager().Lock()
    callback = functools.partial(parse_source, factor, lock)
    try:
        for log_file, ignored in pool.imap_unordered(callback, sources, chunksize=1):
            if verbose and ignored:
                print 'Ignored files for %s' % log_file
                print '============================================================'
                print '\n'.join(sorted(ignored))
    finally:
        pool.close()


if __name__ == '__main__':
    setupStderr()

    parser = argparse.ArgumentParser(description='Processes log files and merges them into the stats database')
    parser.add_argument('--verbose', dest='verbose', action='store_const', const=True, default=False, help='Verbose mode, ignored requests will be listed')
    parser.add_argument('--revert', dest='factor', action='store_const', const=-1, default=1, help='Remove log data from the database')
    parser.add_argument('mirror_name', nargs='?', help='Name of the mirror server that the file belongs to')
    parser.add_argument('server_type', nargs='?', help='Server type like download, update or subscription')
    parser.add_argument('log_file', nargs='?', help='Log file path, can be a local file path, http:// or ssh:// URL')
    args = parser.parse_args()

    if args.mirror_name and args.server_type and args.log_file:
        sources = [(args.mirror_name, args.server_type, args.log_file)]
    else:
        sources = get_stats_files()
    parse_sources(sources, args.factor, args.verbose)