| OLD | NEW |
| 1 # This file is part of the Adblock Plus web scripts, | 1 # This file is part of the Adblock Plus web scripts, |
| 2 # Copyright (C) 2006-2016 Eyeo GmbH | 2 # Copyright (C) 2006-2016 Eyeo GmbH |
| 3 # | 3 # |
| 4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
| 5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
| 6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
| 7 # | 7 # |
| 8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
| 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| (...skipping 29 matching lines...) |
| 40 log_regexp = None | 40 log_regexp = None |
| 41 gecko_apps = None | 41 gecko_apps = None |
| 42 | 42 |
| 43 | 43 |
| 44 class StatsFile: | 44 class StatsFile: |
| 45 def __init__(self, path): | 45 def __init__(self, path): |
| 46 self._inner_file = None | 46 self._inner_file = None |
| 47 self._processes = [] | 47 self._processes = [] |
| 48 | 48 |
| 49 parseresult = urlparse.urlparse(path) | 49 parseresult = urlparse.urlparse(path) |
| 50 if parseresult.scheme == "ssh" and parseresult.username and parseresult.hostname and parseresult.path: | 50 if parseresult.scheme == 'ssh' and parseresult.username and parseresult.hostname and parseresult.path: |
| 51 command = [ | 51 command = [ |
| 52 "ssh", "-q", "-o", "NumberOfPasswordPrompts 0", "-T", "-k", | 52 'ssh', '-q', '-o', 'NumberOfPasswordPrompts 0', '-T', '-k', |
| 53 "-l", parseresult.username, | 53 '-l', parseresult.username, |
| 54 parseresult.hostname, | 54 parseresult.hostname, |
| 55 parseresult.path.lstrip("/") | 55 parseresult.path.lstrip('/') |
| 56 ] | 56 ] |
| 57 if parseresult.port: | 57 if parseresult.port: |
| 58 command[1:1] = ["-P", str(parseresult.port)] | 58 command[1:1] = ['-P', str(parseresult.port)] |
| 59 ssh_process = subprocess.Popen(command, stdout=subprocess.PIPE) | 59 ssh_process = subprocess.Popen(command, stdout=subprocess.PIPE) |
| 60 self._processes.append(ssh_process) | 60 self._processes.append(ssh_process) |
| 61 self._file = ssh_process.stdout | 61 self._file = ssh_process.stdout |
| 62 elif parseresult.scheme in ("http", "https"): | 62 elif parseresult.scheme in ('http', 'https'): |
| 63 self._file = urllib.urlopen(path) | 63 self._file = urllib.urlopen(path) |
| 64 elif os.path.exists(path): | 64 elif os.path.exists(path): |
| 65 self._file = open(path, "rb") | 65 self._file = open(path, 'rb') |
| 66 else: | 66 else: |
| 67 raise IOError("Path '%s' not recognized" % path) | 67 raise IOError("Path '%s' not recognized" % path) |
| 68 | 68 |
| 69 if path.endswith(".gz"): | 69 if path.endswith('.gz'): |
| 70 # Built-in gzip module doesn't support streaming (fixed in Python 3.2) | 70 # Built-in gzip module doesn't support streaming (fixed in Python 3.2) |
| 71 gzip_process = subprocess.Popen(["gzip", "-cd"], stdin=self._file, stdout=subprocess.PIPE) | 71 gzip_process = subprocess.Popen(['gzip', '-cd'], stdin=self._file, stdout=subprocess.PIPE) |
| 72 self._processes.append(gzip_process) | 72 self._processes.append(gzip_process) |
| 73 self._file, self._inner_file = gzip_process.stdout, self._file | 73 self._file, self._inner_file = gzip_process.stdout, self._file |
| 74 | 74 |
| 75 def __getattr__(self, name): | 75 def __getattr__(self, name): |
| 76 return getattr(self._file, name) | 76 return getattr(self._file, name) |
| 77 | 77 |
| 78 def close(self): | 78 def close(self): |
| 79 self._file.close() | 79 self._file.close() |
| 80 if self._inner_file: | 80 if self._inner_file: |
| 81 self._inner_file.close() | 81 self._inner_file.close() |
| 82 for process in self._processes: | 82 for process in self._processes: |
| 83 process.wait() | 83 process.wait() |
| 84 | 84 |
| 85 | 85 |
| 86 def get_stats_files(): | 86 def get_stats_files(): |
| 87 config = get_config() | 87 config = get_config() |
| 88 | 88 |
| 89 prefix = "mirror_" | 89 prefix = 'mirror_' |
| 90 options = filter(lambda o: o.startswith(prefix), config.options("stats")) | 90 options = filter(lambda o: o.startswith(prefix), config.options('stats')) |
| 91 for option in options: | 91 for option in options: |
| 92 if config.has_option("stats", option): | 92 if config.has_option('stats', option): |
| 93 value = config.get("stats", option) | 93 value = config.get('stats', option) |
| 94 if " " in value: | 94 if ' ' in value: |
| 95 yield [option[len(prefix):]] + value.split(None, 1) | 95 yield [option[len(prefix):]] + value.split(None, 1) |
| 96 else: | 96 else: |
| 97 print >>sys.stderr, "Option '%s' has invalid value: '%s'" % (option, value) | 97 print >>sys.stderr, "Option '%s' has invalid value: '%s'" % (option, value) |
| 98 else: | 98 else: |
| 99 print >>sys.stderr, "Option '%s' not found in the configuration" % option | 99 print >>sys.stderr, "Option '%s' not found in the configuration" % option |
| 100 | 100 |
| 101 | 101 |
| 102 def cache_lru(func): | 102 def cache_lru(func): |
| 103 """ | 103 """ |
| 104 Decorator that memoizes the return values of a single-parameter function in | 104 Decorator that memoizes the return values of a single-parameter function in |
| (...skipping 21 matching lines...) |
| 126 results[arg] = result | 126 results[arg] = result |
| 127 return result | 127 return result |
| 128 return wrapped | 128 return wrapped |
| 129 | 129 |
| 130 | 130 |
| 131 def cache_last(func): | 131 def cache_last(func): |
| 132 """ | 132 """ |
| 133 Decorator that memoizes the last return value of a function in case it is | 133 Decorator that memoizes the last return value of a function in case it is |
| 134 called again with the same parameters. | 134 called again with the same parameters. |
| 135 """ | 135 """ |
| 136 result = {"args": None, "result": None} | 136 result = {'args': None, 'result': None} |
| 137 | 137 |
| 138 def wrapped(*args): | 138 def wrapped(*args): |
| 139 if args != result["args"]: | 139 if args != result['args']: |
| 140 result["result"] = func(*args) | 140 result['result'] = func(*args) |
| 141 result["args"] = args | 141 result['args'] = args |
| 142 return result["result"] | 142 return result['result'] |
| 143 return wrapped | 143 return wrapped |
| 144 | 144 |
| 145 | 145 |
| 146 @cache_lru | 146 @cache_lru |
| 147 def parse_ua(ua): | 147 def parse_ua(ua): |
| 148 # Opera might disguise itself as another browser, so it needs to go first | 148 # Opera might disguise itself as another browser, so it needs to go first |
| 149 match = re.search(r"\bOpera/([\d\.]+)", ua) | 149 match = re.search(r'\bOpera/([\d\.]+)', ua) |
| 150 if match: | 150 if match: |
| 151 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA | 151 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA |
| 152 match2 = re.search(r"\bVersion/([\d\.]+)", ua) | 152 match2 = re.search(r'\bVersion/([\d\.]+)', ua) |
| 153 if match2: | 153 if match2: |
| 154 return "Opera", match2.group(1) | 154 return 'Opera', match2.group(1) |
| 155 else: | 155 else: |
| 156 return "Opera", match.group(1) | 156 return 'Opera', match.group(1) |
| 157 | 157 |
| 158 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it | 158 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it |
| 159 match = re.search(r"\bOPR/(\d+\.\d+)", ua) | 159 match = re.search(r'\bOPR/(\d+\.\d+)', ua) |
| 160 if match: | 160 if match: |
| 161 return "Opera", match.group(1) | 161 return 'Opera', match.group(1) |
| 162 | 162 |
| 163 # Have to check for these before Firefox, they will usually have a Firefox identifier as well | 163 # Have to check for these before Firefox, they will usually have a Firefox identifier as well |
| 164 match = re.search(r"\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)", ua) | 164 match = re.search(r'\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)', ua) |
| 165 if match: | 165 if match: |
| 166 if match.group(1) == "Fennec": | 166 if match.group(1) == 'Fennec': |
| 167 return "Firefox Mobile", match.group(2) | 167 return 'Firefox Mobile', match.group(2) |
| 168 else: | 168 else: |
| 169 return match.group(1), match.group(2) | 169 return match.group(1), match.group(2) |
| 170 | 170 |
| 171 match = re.search(r"\bFirefox/(\d+\.\d+)", ua) | 171 match = re.search(r'\bFirefox/(\d+\.\d+)', ua) |
| 172 if match: | 172 if match: |
| 173 if re.search(r"\bMobile;", ua): | 173 if re.search(r'\bMobile;', ua): |
| 174 return "Firefox Mobile", match.group(1) | 174 return 'Firefox Mobile', match.group(1) |
| 175 elif re.search(r"\bTablet;", ua): | 175 elif re.search(r'\bTablet;', ua): |
| 176 return "Firefox Tablet", match.group(1) | 176 return 'Firefox Tablet', match.group(1) |
| 177 else: | 177 else: |
| 178 return "Firefox", match.group(1) | 178 return 'Firefox', match.group(1) |
| 179 | 179 |
| 180 match = re.search(r"\brv:(\d+)\.(\d+)(?:\.(\d+))?", ua) | 180 match = re.search(r'\brv:(\d+)\.(\d+)(?:\.(\d+))?', ua) |
| 181 if match and re.search(r"\bGecko/", ua): | 181 if match and re.search(r'\bGecko/', ua): |
| 182 if match.group(3) and int(match.group(1)) < 2: | 182 if match.group(3) and int(match.group(1)) < 2: |
| 183 return "Gecko", "%s.%s.%s" % (match.group(1), match.group(2), match.
group(3)) | 183 return 'Gecko', '%s.%s.%s' % (match.group(1), match.group(2), match.
group(3)) |
| 184 else: | 184 else: |
| 185 return "Gecko", "%s.%s" % (match.group(1), match.group(2)) | 185 return 'Gecko', '%s.%s' % (match.group(1), match.group(2)) |
| 186 | 186 |
| 187 match = re.search(r"\bCoolNovo/(\d+\.\d+\.\d+)", ua) | 187 match = re.search(r'\bCoolNovo/(\d+\.\d+\.\d+)', ua) |
| 188 if match: | 188 if match: |
| 189 return "CoolNovo", match.group(1) | 189 return 'CoolNovo', match.group(1) |
| 190 | 190 |
| 191 match = re.search(r"\bEdge/(\d+)\.\d+", ua) | 191 match = re.search(r'\bEdge/(\d+)\.\d+', ua) |
| 192 if match: | 192 if match: |
| 193 return "Edge", match.group(1) | 193 return 'Edge', match.group(1) |
| 194 | 194 |
| 195 match = re.search(r"\bChrome/(\d+\.\d+)", ua) | 195 match = re.search(r'\bChrome/(\d+\.\d+)', ua) |
| 196 if match: | 196 if match: |
| 197 return "Chrome", match.group(1) | 197 return 'Chrome', match.group(1) |
| 198 | 198 |
| 199 match = re.search(r"\bVersion/(\d+\.\d+)", ua) | 199 match = re.search(r'\bVersion/(\d+\.\d+)', ua) |
| 200 if match and re.search(r"\bMobile Safari/", ua): | 200 if match and re.search(r'\bMobile Safari/', ua): |
| 201 return "Mobile Safari", match.group(1) | 201 return 'Mobile Safari', match.group(1) |
| 202 if match and re.search(r"\bSafari/", ua): | 202 if match and re.search(r'\bSafari/', ua): |
| 203 return "Safari", match.group(1) | 203 return 'Safari', match.group(1) |
| 204 | 204 |
| 205 if re.search(r"\bAppleWebKit/", ua): | 205 if re.search(r'\bAppleWebKit/', ua): |
| 206 return "WebKit", "" | 206 return 'WebKit', '' |
| 207 | 207 |
| 208 match = re.search(r"\bMSIE (\d+\.\d+)", ua) | 208 match = re.search(r'\bMSIE (\d+\.\d+)', ua) |
| 209 if match: | 209 if match: |
| 210 return "MSIE", match.group(1) | 210 return 'MSIE', match.group(1) |
| 211 | 211 |
| 212 match = re.search(r"\bTrident/(\d+\.\d+)", ua) | 212 match = re.search(r'\bTrident/(\d+\.\d+)', ua) |
| 213 if match: | 213 if match: |
| 214 match2 = re.search(r"\brv:(\d+\.\d+)", ua) | 214 match2 = re.search(r'\brv:(\d+\.\d+)', ua) |
| 215 if match2: | 215 if match2: |
| 216 return "MSIE", match2.group(1) | 216 return 'MSIE', match2.group(1) |
| 217 else: | 217 else: |
| 218 return "Trident", match.group(1) | 218 return 'Trident', match.group(1) |
| 219 | 219 |
| 220 match = re.search(r"\bAndroidDownloadManager(?:/(\d+\.\d+))?", ua) | 220 match = re.search(r'\bAndroidDownloadManager(?:/(\d+\.\d+))?', ua) |
| 221 if match: | 221 if match: |
| 222 return "Android", match.group(1) or "" | 222 return 'Android', match.group(1) or '' |
| 223 | 223 |
| 224 match = re.search(r"\bDalvik/.*\bAndroid (\d+\.\d+)", ua) | 224 match = re.search(r'\bDalvik/.*\bAndroid (\d+\.\d+)', ua) |
| 225 if match: | 225 if match: |
| 226 return "Android", match.group(1) | 226 return 'Android', match.group(1) |
| 227 | 227 |
| 228 # ABP/Android downloads use that user agent | 228 # ABP/Android downloads use that user agent |
| 229 if ua.startswith("Apache-HttpClient/UNAVAILABLE"): | 229 if ua.startswith('Apache-HttpClient/UNAVAILABLE'): |
| 230 return "Android", "" | 230 return 'Android', '' |
| 231 | 231 |
| 232 # ABP/IE downloads use that user agent | 232 # ABP/IE downloads use that user agent |
| 233 if ua == "Adblock Plus": | 233 if ua == 'Adblock Plus': |
| 234 return "ABP", "" | 234 return 'ABP', '' |
| 235 | 235 |
| 236 return "Other", "" | 236 return 'Other', '' |
| 237 | 237 |
| 238 | 238 |
| 239 def process_ip(ip, geo, geov6): | 239 def process_ip(ip, geo, geov6): |
| 240 match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip) | 240 match = re.search(r'^::ffff:(\d+\.\d+\.\d+\.\d+)$', ip) |
| 241 if match: | 241 if match: |
| 242 ip = match.group(1) | 242 ip = match.group(1) |
| 243 | 243 |
| 244 try: | 244 try: |
| 245 if ":" in ip: | 245 if ':' in ip: |
| 246 country = geov6.country_code_by_addr(ip) | 246 country = geov6.country_code_by_addr(ip) |
| 247 else: | 247 else: |
| 248 country = geo.country_code_by_addr(ip) | 248 country = geo.country_code_by_addr(ip) |
| 249 except: | 249 except: |
| 250 traceback.print_exc() | 250 traceback.print_exc() |
| 251 country = "" | 251 country = '' |
| 252 | 252 |
| 253 if country in (None, "", "--"): | 253 if country in (None, '', '--'): |
| 254 country = "unknown" | 254 country = 'unknown' |
| 255 country = country.lower() | 255 country = country.lower() |
| 256 | 256 |
| 257 return ip, country | 257 return ip, country |
| 258 | 258 |
| 259 | 259 |
| 260 @cache_last | 260 @cache_last |
| 261 def parse_time(timestr, tz_hours, tz_minutes): | 261 def parse_time(timestr, tz_hours, tz_minutes): |
| 262 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S") | 262 result = datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S') |
| 263 result -= timedelta(hours=tz_hours, minutes=math.copysign(tz_minutes, tz_hours)) | 263 result -= timedelta(hours=tz_hours, minutes=math.copysign(tz_minutes, tz_hours)) |
| 264 return result, result.strftime("%Y%m"), result.day, result.weekday(), result.hour | 264 return result, result.strftime('%Y%m'), result.day, result.weekday(), result.hour |
| 265 | 265 |
| 266 | 266 |
| 267 @cache_lru | 267 @cache_lru |
| 268 def parse_path(path): | 268 def parse_path(path): |
| 269 urlparts = urlparse.urlparse(path) | 269 urlparts = urlparse.urlparse(path) |
| 270 try: | 270 try: |
| 271 path = urllib.unquote(urlparts.path).decode("utf-8") | 271 path = urllib.unquote(urlparts.path).decode('utf-8') |
| 272 except: | 272 except: |
| 273 path = urlparts.path | 273 path = urlparts.path |
| 274 return path[1:], urlparts.query | 274 return path[1:], urlparts.query |
| 275 | 275 |
| 276 | 276 |
| 277 @cache_lru | 277 @cache_lru |
| 278 def parse_query(query): | 278 def parse_query(query): |
| 279 return urlparse.parse_qs(query) | 279 return urlparse.parse_qs(query) |
| 280 | 280 |
| 281 | 281 |
| 282 @cache_lru | 282 @cache_lru |
| 283 def parse_lastversion(last_version): | 283 def parse_lastversion(last_version): |
| 284 if '-' in last_version: | 284 if '-' in last_version: |
| 285 last_version = last_version.split('-', 1)[0] | 285 last_version = last_version.split('-', 1)[0] |
| 286 return datetime.strptime(last_version, "%Y%m%d%H%M") | 286 return datetime.strptime(last_version, '%Y%m%d%H%M') |
| 287 | 287 |
| 288 | 288 |
| 289 @cache_lru | 289 @cache_lru |
| 290 def get_week(date): | 290 def get_week(date): |
| 291 return date.isocalendar()[0:2] | 291 return date.isocalendar()[0:2] |
| 292 | 292 |
| 293 | 293 |
| 294 def parse_downloader_query(info): | 294 def parse_downloader_query(info): |
| 295 params = parse_query(info["query"]) | 295 params = parse_query(info['query']) |
| 296 for param in ("addonName", "addonVersion", "application", "applicationVersion", "platform", "platformVersion"): | 296 for param in ('addonName', 'addonVersion', 'application', 'applicationVersion', 'platform', 'platformVersion'): |
| 297 info[param] = params.get(param, ["unknown"])[0] | 297 info[param] = params.get(param, ['unknown'])[0] |
| 298 | 298 |
| 299 # Only leave the major and minor release number for application and platform | 299 # Only leave the major and minor release number for application and platform |
| 300 info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["applicati
onVersion"]) | 300 info['applicationVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['applicati
onVersion']) |
| 301 info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVers
ion"]) | 301 info['platformVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['platformVers
ion']) |
| 302 | 302 |
| 303 # Chrome Adblock sends an X-Client-ID header instead of URL parameters | 303 # Chrome Adblock sends an X-Client-ID header instead of URL parameters |
| 304 match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clientid"] else None | 304 match = re.match(r'^adblock/([\d\.]+)$', info['clientid'], re.I) if info['clientid'] else None |
| 305 if match: | 305 if match: |
| 306 info["addonName"] = "chromeadblock" | 306 info['addonName'] = 'chromeadblock' |
| 307 info["addonVersion"] = match.group(1) | 307 info['addonVersion'] = match.group(1) |
| 308 | 308 |
| 309 last_version = params.get("lastVersion", ["unknown"])[0] | 309 last_version = params.get('lastVersion', ['unknown'])[0] |
| 310 if info["file"] == "notification.json" and last_version == "0" and ( | 310 if info['file'] == 'notification.json' and last_version == '0' and ( |
| 311 (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1")
or | 311 (info['addonName'] == 'adblockplus' and info['addonVersion'] == '2.3.1')
or |
| 312 (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info
["addonVersion"] == "1.5.2") | 312 (info['addonName'] in ('adblockpluschrome', 'adblockplusopera') and info
['addonVersion'] == '1.5.2') |
| 313 ): | 313 ): |
| 314 # Broken notification version number in these releases, treat like unknown | 314 # Broken notification version number in these releases, treat like unknown |
| 315 last_version = "unknown" | 315 last_version = 'unknown' |
| 316 | 316 |
| 317 if last_version == "unknown": | 317 if last_version == 'unknown': |
| 318 info["downloadInterval"] = "unknown" | 318 info['downloadInterval'] = 'unknown' |
| 319 info["previousDownload"] = "unknown" | 319 info['previousDownload'] = 'unknown' |
| 320 elif last_version == "0": | 320 elif last_version == '0': |
| 321 info["downloadInterval"] = "unknown" | 321 info['downloadInterval'] = 'unknown' |
| 322 info["previousDownload"] = "unknown" | 322 info['previousDownload'] = 'unknown' |
| 323 info["firstDownload"] = True | 323 info['firstDownload'] = True |
| 324 else: | 324 else: |
| 325 try: | 325 try: |
| 326 last_update = parse_lastversion(last_version) | 326 last_update = parse_lastversion(last_version) |
| 327 diff = info["time"] - last_update | 327 diff = info['time'] - last_update |
| 328 if diff.days >= 365: | 328 if diff.days >= 365: |
| 329 info["downloadInterval"] = "%i year(s)" % (diff.days / 365) | 329 info['downloadInterval'] = '%i year(s)' % (diff.days / 365) |
| 330 elif diff.days >= 30: | 330 elif diff.days >= 30: |
| 331 info["downloadInterval"] = "%i month(s)" % (diff.days / 30) | 331 info['downloadInterval'] = '%i month(s)' % (diff.days / 30) |
| 332 elif diff.days >= 1: | 332 elif diff.days >= 1: |
| 333 info["downloadInterval"] = "%i day(s)" % diff.days | 333 info['downloadInterval'] = '%i day(s)' % diff.days |
| 334 else: | 334 else: |
| 335 info["downloadInterval"] = "%i hour(s)" % (diff.seconds / 3600) | 335 info['downloadInterval'] = '%i hour(s)' % (diff.seconds / 3600) |
| 336 | 336 |
| 337 if info["addonName"].startswith("adblockplus"): | 337 if info['addonName'].startswith('adblockplus'): |
| 338 diffdays = (info["time"].date() - last_update.date()).days | 338 diffdays = (info['time'].date() - last_update.date()).days |
| 339 if diffdays == 0: | 339 if diffdays == 0: |
| 340 info["previousDownload"] = "same day" | 340 info['previousDownload'] = 'same day' |
| 341 elif diffdays < 30: | 341 elif diffdays < 30: |
| 342 info["previousDownload"] = "%i day(s)" % diffdays | 342 info['previousDownload'] = '%i day(s)' % diffdays |
| 343 elif diffdays < 365: | 343 elif diffdays < 365: |
| 344 info["previousDownload"] = "%i month(s)" % (diffdays / 30) | 344 info['previousDownload'] = '%i month(s)' % (diffdays / 30) |
| 345 else: | 345 else: |
| 346 info["previousDownload"] = "%i year(s)" % (diffdays / 365) | 346 info['previousDownload'] = '%i year(s)' % (diffdays / 365) |
| 347 else: | 347 else: |
| 348 info["previousDownload"] = "unknown" | 348 info['previousDownload'] = 'unknown' |
| 349 | 349 |
| 350 if last_update.year != info["time"].year or last_update.month != info["time"].month: | 350 if last_update.year != info['time'].year or last_update.month != info['time'].month: |
| 351 info["firstInMonth"] = info["firstInDay"] = True | 351 info['firstInMonth'] = info['firstInDay'] = True |
| 352 elif last_update.day != info["time"].day: | 352 elif last_update.day != info['time'].day: |
| 353 info["firstInDay"] = True | 353 info['firstInDay'] = True |
| 354 | 354 |
| 355 if get_week(last_update) != get_week(info["time"]): | 355 if get_week(last_update) != get_week(info['time']): |
| 356 info["firstInWeek"] = True | 356 info['firstInWeek'] = True |
| 357 except ValueError: | 357 except ValueError: |
| 358 info["downloadInterval"] = "unknown" | 358 info['downloadInterval'] = 'unknown' |
| 359 info["previousDownload"] = "unknown" | 359 info['previousDownload'] = 'unknown' |
| 360 pass | 360 pass |
| 361 | 361 |
| 362 | 362 |
| 363 def parse_addon_name(file): | 363 def parse_addon_name(file): |
| 364 if "/" in file: | 364 if '/' in file: |
| 365 return file.split("/")[-2] | 365 return file.split('/')[-2] |
| 366 else: | 366 else: |
| 367 return None | 367 return None |
| 368 | 368 |
| 369 | 369 |
| 370 def parse_gecko_query(query): | 370 def parse_gecko_query(query): |
| 371 params = urlparse.parse_qs(query) | 371 params = urlparse.parse_qs(query) |
| 372 | 372 |
| 373 version = params.get("version", ["unknown"])[0] | 373 version = params.get('version', ['unknown'])[0] |
| 374 | 374 |
| 375 global gecko_apps | 375 global gecko_apps |
| 376 if gecko_apps == None: | 376 if gecko_apps == None: |
| 377 from buildtools.packagerGecko import KNOWN_APPS | 377 from buildtools.packagerGecko import KNOWN_APPS |
| 378 gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()} | 378 gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()} |
| 379 appID = params.get("appID", ["unknown"])[0] | 379 appID = params.get('appID', ['unknown'])[0] |
| 380 | 380 |
| 381 application = gecko_apps.get(appID, "unknown") | 381 application = gecko_apps.get(appID, 'unknown') |
| 382 applicationVersion = params.get("appVersion", ["unknown"])[0] | 382 applicationVersion = params.get('appVersion', ['unknown'])[0] |
| 383 | 383 |
| 384 # Only leave the major and minor release number for application | 384 # Only leave the major and minor release number for application |
| 385 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) | 385 applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion) |
| 386 | 386 |
| 387 return version, application, applicationVersion | 387 return version, application, applicationVersion |
| 388 | 388 |
| 389 | 389 |
| 390 def parse_chrome_query(query): | 390 def parse_chrome_query(query): |
| 391 params = urlparse.parse_qs(query) | 391 params = urlparse.parse_qs(query) |
| 392 | 392 |
| 393 if params.get("prod", ["unknown"])[0] in ("chromecrx", "chromiumcrx"): | 393 if params.get('prod', ['unknown'])[0] in ('chromecrx', 'chromiumcrx'): |
| 394 application = "chrome" | 394 application = 'chrome' |
| 395 else: | 395 else: |
| 396 application = "unknown" | 396 application = 'unknown' |
| 397 applicationVersion = params.get("prodversion", ["unknown"])[0] | 397 applicationVersion = params.get('prodversion', ['unknown'])[0] |
| 398 | 398 |
| 399 params2 = urlparse.parse_qs(params.get("x", [""])[0]) | 399 params2 = urlparse.parse_qs(params.get('x', [''])[0]) |
| 400 version = params2.get("v", ["unknown"])[0] | 400 version = params2.get('v', ['unknown'])[0] |
| 401 | 401 |
| 402 # Only leave the major and minor release number for application | 402 # Only leave the major and minor release number for application |
| 403 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) | 403 applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion) |
| 404 | 404 |
| 405 return version, application, applicationVersion | 405 return version, application, applicationVersion |
| 406 | 406 |
| 407 | 407 |
| 408 def parse_update_flag(query): | 408 def parse_update_flag(query): |
| 409 return "update" if query == "update" else "install" | 409 return 'update' if query == 'update' else 'install' |
| 410 | 410 |
| 411 | 411 |
| 412 def parse_record(line, ignored, geo, geov6): | 412 def parse_record(line, ignored, geo, geov6): |
| 413 global log_regexp | 413 global log_regexp |
| 414 if log_regexp == None: | 414 if log_regexp == None: |
| 415 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "([^"]*)" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?') | 415 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "([^"]*)" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?') |
| 416 | 416 |
| 417 match = re.search(log_regexp, line) | 417 match = re.search(log_regexp, line) |
| 418 if not match: | 418 if not match: |
| 419 return None | 419 return None |
| 420 | 420 |
| 421 status = int(match.group(6)) | 421 status = int(match.group(6)) |
| 422 if status not in (200, 301, 302): | 422 if status not in (200, 301, 302): |
| 423 return None | 423 return None |
| 424 | 424 |
| 425 info = { | 425 info = { |
| 426 "status": status, | 426 'status': status, |
| 427 "size": int(match.group(7)), | 427 'size': int(match.group(7)), |
| 428 } | 428 } |
| 429 | 429 |
| 430 info["ip"], info["country"] = process_ip(match.group(1), geo, geov6) | 430 info['ip'], info['country'] = process_ip(match.group(1), geo, geov6) |
| 431 info["time"], info["month"], info["day"], info["weekday"], info["hour"] = pa
rse_time(match.group(2), int(match.group(3)), int(match.group(4))) | 431 info['time'], info['month'], info['day'], info['weekday'], info['hour'] = pa
rse_time(match.group(2), int(match.group(3)), int(match.group(4))) |
| 432 info["file"], info["query"] = parse_path(match.group(5)) | 432 info['file'], info['query'] = parse_path(match.group(5)) |
| 433 info["referrer"] = match.group(8) | 433 info['referrer'] = match.group(8) |
| 434 info["ua"], info["uaversion"] = parse_ua(match.group(9)) | 434 info['ua'], info['uaversion'] = parse_ua(match.group(9)) |
| 435 info["fullua"] = "%s %s" % (info["ua"], info["uaversion"]) | 435 info['fullua'] = '%s %s' % (info['ua'], info['uaversion']) |
| 436 info["clientid"] = match.group(10) | 436 info['clientid'] = match.group(10) |
| 437 | 437 |
| 438 # Additional metadata depends on file type | 438 # Additional metadata depends on file type |
| 439 filename = os.path.basename(info["file"]) | 439 filename = os.path.basename(info['file']) |
| 440 ext = os.path.splitext(filename)[1] | 440 ext = os.path.splitext(filename)[1] |
| 441 if ext == ".txt" or filename == "update.json" or filename == "notification.j
son": | 441 if ext == '.txt' or filename == 'update.json' or filename == 'notification.j
son': |
| 442 # Subscription downloads, libadblockplus update checks and notification | 442 # Subscription downloads, libadblockplus update checks and notification |
| 443 # checks are performed by the downloader | 443 # checks are performed by the downloader |
| 444 parse_downloader_query(info) | 444 parse_downloader_query(info) |
| 445 elif ext == ".tpl": | 445 elif ext == '.tpl': |
| 446 # MSIE TPL download, no additional data here | 446 # MSIE TPL download, no additional data here |
| 447 pass | 447 pass |
| 448 elif ext in (".xpi", ".crx", ".apk", ".msi", ".exe", ".safariextz"): | 448 elif ext in ('.xpi', '.crx', '.apk', '.msi', '.exe', '.safariextz'): |
| 449 # Package download, might be an update | 449 # Package download, might be an update |
| 450 info["installType"] = parse_update_flag(info["query"]) | 450 info['installType'] = parse_update_flag(info['query']) |
| 451 elif filename == "update.rdf": | 451 elif filename == 'update.rdf': |
| 452 # Gecko update check or a legacy Android update check. The latter doesn't | 452 # Gecko update check or a legacy Android update check. The latter doesn't |
| 453 # have usable data anyway so trying the Chrome route won't do any harm. | 453 # have usable data anyway so trying the Chrome route won't do any harm. |
| 454 info["addonName"] = parse_addon_name(info["file"]) | 454 info['addonName'] = parse_addon_name(info['file']) |
| 455 info["addonVersion"], info["application"], info["applicationVersion"] =
parse_gecko_query(info["query"]) | 455 info['addonVersion'], info['application'], info['applicationVersion'] =
parse_gecko_query(info['query']) |
| 456 elif filename == "updates.xml": | 456 elif filename == 'updates.xml': |
| 457 # Chrome update check | 457 # Chrome update check |
| 458 info["addonName"] = parse_addon_name(info["file"]) | 458 info['addonName'] = parse_addon_name(info['file']) |
| 459 info["addonVersion"], info["application"], info["applicationVersion"] =
parse_chrome_query(info["query"]) | 459 info['addonVersion'], info['application'], info['applicationVersion'] =
parse_chrome_query(info['query']) |
| 460 elif filename == "updates.plist": | 460 elif filename == 'updates.plist': |
| 461 # Safari update check, no additional data | 461 # Safari update check, no additional data |
| 462 pass | 462 pass |
| 463 else: | 463 else: |
| 464 ignored.add(info["file"]) | 464 ignored.add(info['file']) |
| 465 return None | 465 return None |
| 466 | 466 |
| 467 if "addonName" in info: | 467 if 'addonName' in info: |
| 468 info["fullAddon"] = "%s %s" % (info["addonName"], info["addonVersion"]) | 468 info['fullAddon'] = '%s %s' % (info['addonName'], info['addonVersion']) |
| 469 if "application" in info: | 469 if 'application' in info: |
| 470 info["fullApplication"] = "%s %s" % (info["application"], info["applicat
ionVersion"]) | 470 info['fullApplication'] = '%s %s' % (info['application'], info['applicat
ionVersion']) |
| 471 if "platform" in info: | 471 if 'platform' in info: |
| 472 info["fullPlatform"] = "%s %s" % (info["platform"], info["platformVersio
n"]) | 472 info['fullPlatform'] = '%s %s' % (info['platform'], info['platformVersio
n']) |
| 473 return info | 473 return info |
| 474 | 474 |
| 475 | 475 |
| 476 def add_record(info, section, ignore_fields=()): | 476 def add_record(info, section, ignore_fields=()): |
| 477 section["hits"] = section.get("hits", 0) + 1 | 477 section['hits'] = section.get('hits', 0) + 1 |
| 478 section["bandwidth"] = section.get("bandwidth", 0) + info["size"] | 478 section['bandwidth'] = section.get('bandwidth', 0) + info['size'] |
| 479 | 479 |
| 480 if len(ignore_fields) < 2: | 480 if len(ignore_fields) < 2: |
| 481 for field in map(lambda f: f["name"], common.fields): | 481 for field in map(lambda f: f['name'], common.fields): |
| 482 if field in ignore_fields or field not in info: | 482 if field in ignore_fields or field not in info: |
| 483 continue | 483 continue |
| 484 | 484 |
| 485 value = info[field] | 485 value = info[field] |
| 486 if field not in section: | 486 if field not in section: |
| 487 section[field] = {} | 487 section[field] = {} |
| 488 if value not in section[field]: | 488 if value not in section[field]: |
| 489 section[field][value] = {} | 489 section[field][value] = {} |
| 490 | 490 |
| 491 add_record(info, section[field][value], ignore_fields + (field,)) | 491 add_record(info, section[field][value], ignore_fields + (field,)) |
| 492 | 492 |
| 493 | 493 |
| 494 def parse_fileobj(mirror_name, fileobj, geo, geov6, ignored): | 494 def parse_fileobj(mirror_name, fileobj, geo, geov6, ignored): |
| 495 data = {} | 495 data = {} |
| 496 for line in fileobj: | 496 for line in fileobj: |
| 497 info = parse_record(line, ignored, geo, geov6) | 497 info = parse_record(line, ignored, geo, geov6) |
| 498 if info == None: | 498 if info == None: |
| 499 continue | 499 continue |
| 500 | 500 |
| 501 info["mirror"] = mirror_name | 501 info['mirror'] = mirror_name |
| 502 if info["month"] not in data: | 502 if info['month'] not in data: |
| 503 data[info["month"]] = {} | 503 data[info['month']] = {} |
| 504 section = data[info["month"]] | 504 section = data[info['month']] |
| 505 | 505 |
| 506 if info["file"] not in section: | 506 if info['file'] not in section: |
| 507 section[info["file"]] = {} | 507 section[info['file']] = {} |
| 508 section = section[info["file"]] | 508 section = section[info['file']] |
| 509 | 509 |
| 510 add_record(info, section) | 510 add_record(info, section) |
| 511 return data | 511 return data |
| 512 | 512 |
| 513 | 513 |
| 514 def merge_objects(object1, object2, factor=1): | 514 def merge_objects(object1, object2, factor=1): |
| 515 for key, value in object2.iteritems(): | 515 for key, value in object2.iteritems(): |
| 516 try: | 516 try: |
| 517 key = unicode(key) | 517 key = unicode(key) |
| 518 except UnicodeDecodeError: | 518 except UnicodeDecodeError: |
| 519 key = unicode(key, encoding="latin-1") | 519 key = unicode(key, encoding='latin-1') |
| 520 if isinstance(value, numbers.Number): | 520 if isinstance(value, numbers.Number): |
| 521 object1[key] = object1.get(key, 0) + factor * value | 521 object1[key] = object1.get(key, 0) + factor * value |
| 522 else: | 522 else: |
| 523 merge_objects(object1.setdefault(key, {}), value, factor) | 523 merge_objects(object1.setdefault(key, {}), value, factor) |
| 524 | 524 |
| 525 | 525 |
| 526 def save_stats(server_type, data, factor=1): | 526 def save_stats(server_type, data, factor=1): |
| 527 base_dir = os.path.join(get_config().get("stats", "dataDirectory"), common.filename_encode(server_type)) | 527 base_dir = os.path.join(get_config().get('stats', 'dataDirectory'), common.filename_encode(server_type)) |
| 528 for month, month_data in data.iteritems(): | 528 for month, month_data in data.iteritems(): |
| 529 for name, file_data in month_data.iteritems(): | 529 for name, file_data in month_data.iteritems(): |
| 530 path = os.path.join(base_dir, common.filename_encode(month), common.filename_encode(name + ".json")) | 530 path = os.path.join(base_dir, common.filename_encode(month), common.filename_encode(name + '.json')) |
| 531 if os.path.exists(path): | 531 if os.path.exists(path): |
| 532 with codecs.open(path, "rb", encoding="utf-8") as fileobj: | 532 with codecs.open(path, 'rb', encoding='utf-8') as fileobj: |
| 533 existing = json.load(fileobj) | 533 existing = json.load(fileobj) |
| 534 else: | 534 else: |
| 535 existing = {} | 535 existing = {} |
| 536 | 536 |
| 537 merge_objects(existing, file_data, factor) | 537 merge_objects(existing, file_data, factor) |
| 538 | 538 |
| 539 dir = os.path.dirname(path) | 539 dir = os.path.dirname(path) |
| 540 try: | 540 try: |
| 541 os.makedirs(dir) | 541 os.makedirs(dir) |
| 542 except OSError, e: | 542 except OSError, e: |
| 543 if e.errno != errno.EEXIST: | 543 if e.errno != errno.EEXIST: |
| 544 raise | 544 raise |
| 545 | 545 |
| 546 with codecs.open(path, "wb", encoding="utf-8") as fileobj: | 546 with codecs.open(path, 'wb', encoding='utf-8') as fileobj: |
| 547 json.dump(existing, fileobj, indent=2, sort_keys=True) | 547 json.dump(existing, fileobj, indent=2, sort_keys=True) |
| 548 | 548 |
| 549 | 549 |
| 550 def parse_source(factor, lock, (mirror_name, server_type, log_file)): | 550 def parse_source(factor, lock, (mirror_name, server_type, log_file)): |
| 551 try: | 551 try: |
| 552 geo = pygeoip.GeoIP(get_config().get("stats", "geoip_db"), pygeoip.MEMORY_CACHE) | 552 geo = pygeoip.GeoIP(get_config().get('stats', 'geoip_db'), pygeoip.MEMORY_CACHE) |
| 553 geov6 = pygeoip.GeoIP(get_config().get("stats", "geoipv6_db"), pygeoip.MEMORY_CACHE) | 553 geov6 = pygeoip.GeoIP(get_config().get('stats', 'geoipv6_db'), pygeoip.MEMORY_CACHE) |
| 554 | 554 |
| 555 ignored = set() | 555 ignored = set() |
| 556 fileobj = StatsFile(log_file) | 556 fileobj = StatsFile(log_file) |
| 557 try: | 557 try: |
| 558 data = parse_fileobj(mirror_name, fileobj, geo, geov6, ignored) | 558 data = parse_fileobj(mirror_name, fileobj, geo, geov6, ignored) |
| 559 finally: | 559 finally: |
| 560 fileobj.close() | 560 fileobj.close() |
| 561 | 561 |
| 562 lock.acquire() | 562 lock.acquire() |
| 563 try: | 563 try: |
| 564 save_stats(server_type, data, factor) | 564 save_stats(server_type, data, factor) |
| 565 finally: | 565 finally: |
| 566 lock.release() | 566 lock.release() |
| 567 return log_file, ignored | 567 return log_file, ignored |
| 568 except: | 568 except: |
| 569 print >>sys.stderr, "Unable to process log file '%s'" % log_file | 569 print >>sys.stderr, "Unable to process log file '%s'" % log_file |
| 570 traceback.print_exc() | 570 traceback.print_exc() |
| 571 return None, None | 571 return None, None |
| 572 | 572 |
| 573 | 573 |
| 574 def parse_sources(sources, factor=1, verbose=False): | 574 def parse_sources(sources, factor=1, verbose=False): |
| 575 pool = multiprocessing.Pool() | 575 pool = multiprocessing.Pool() |
| 576 lock = multiprocessing.Manager().Lock() | 576 lock = multiprocessing.Manager().Lock() |
| 577 callback = functools.partial(parse_source, factor, lock) | 577 callback = functools.partial(parse_source, factor, lock) |
| 578 try: | 578 try: |
| 579 for log_file, ignored in pool.imap_unordered(callback, sources, chunksize=1): | 579 for log_file, ignored in pool.imap_unordered(callback, sources, chunksize=1): |
| 580 if verbose and ignored: | 580 if verbose and ignored: |
| 581 print "Ignored files for %s" % log_file | 581 print 'Ignored files for %s' % log_file |
| 582 print "=========================================================
===" | 582 print '=========================================================
===' |
| 583 print "\n".join(sorted(ignored)) | 583 print '\n'.join(sorted(ignored)) |
| 584 finally: | 584 finally: |
| 585 pool.close() | 585 pool.close() |
| 586 | 586 |
| 587 if __name__ == "__main__": | 587 if __name__ == '__main__': |
| 588 setupStderr() | 588 setupStderr() |
| 589 | 589 |
| 590 parser = argparse.ArgumentParser(description="Processes log files and merges them into the stats database") | 590 parser = argparse.ArgumentParser(description='Processes log files and merges them into the stats database') |
| 591 parser.add_argument("--verbose", dest="verbose", action="store_const", const=True, default=False, help="Verbose mode, ignored requests will be listed") | 591 parser.add_argument('--verbose', dest='verbose', action='store_const', const=True, default=False, help='Verbose mode, ignored requests will be listed') |
| 592 parser.add_argument("--revert", dest="factor", action="store_const", const=-1, default=1, help="Remove log data from the database") | 592 parser.add_argument('--revert', dest='factor', action='store_const', const=-1, default=1, help='Remove log data from the database') |
| 593 parser.add_argument("mirror_name", nargs="?", help="Name of the mirror server that the file belongs to") | 593 parser.add_argument('mirror_name', nargs='?', help='Name of the mirror server that the file belongs to') |
| 594 parser.add_argument("server_type", nargs="?", help="Server type like download, update or subscription") | 594 parser.add_argument('server_type', nargs='?', help='Server type like download, update or subscription') |
| 595 parser.add_argument("log_file", nargs="?", help="Log file path, can be a local file path, http:// or ssh:// URL") | 595 parser.add_argument('log_file', nargs='?', help='Log file path, can be a local file path, http:// or ssh:// URL') |
| 596 args = parser.parse_args() | 596 args = parser.parse_args() |
| 597 | 597 |
| 598 if args.mirror_name and args.server_type and args.log_file: | 598 if args.mirror_name and args.server_type and args.log_file: |
| 599 sources = [(args.mirror_name, args.server_type, args.log_file)] | 599 sources = [(args.mirror_name, args.server_type, args.log_file)] |
| 600 else: | 600 else: |
| 601 sources = get_stats_files() | 601 sources = get_stats_files() |
| 602 parse_sources(sources, args.factor, args.verbose) | 602 parse_sources(sources, args.factor, args.verbose) |
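
For context, a usage sketch derived from the argparse definitions above; the script file name and the log paths are assumptions for illustration, not taken from the review:

# Hypothetical invocations (script name and paths assumed):
#   python logprocessor.py --verbose mirror1 download /var/log/httpd/access_log.1.gz
#   python logprocessor.py --revert mirror1 download ssh://stats@mirror1.example.com/logs/access_log.gz
# With no positional arguments, sources fall back to get_stats_files(), i.e. the
# mirror_* options in the [stats] section of the configuration.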