Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: sitescripts/stats/bin/logprocessor.py

Issue 29345242: Noissue - Adapt quotes for compliance with our coding style in sitescripts (Closed)
Patch Set: Fixed raw string Created May 30, 2016, 8:47 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
OLDNEW
1 # This file is part of the Adblock Plus web scripts, 1 # This file is part of the Adblock Plus web scripts,
2 # Copyright (C) 2006-2016 Eyeo GmbH 2 # Copyright (C) 2006-2016 Eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
(...skipping 29 matching lines...) Expand all
# Lazily-initialized module-level caches, filled on first use by
# parse_record() and parse_gecko_query() respectively.
log_regexp = None
gecko_apps = None
43 43
class StatsFile:
    """File-like wrapper around a log file that may live locally, behind
    HTTP(S), or on a remote host reachable via ssh.

    Transparently pipes *.gz files through an external gzip process, since
    the built-in gzip module can't stream (fixed in Python 3.2).
    """

    def __init__(self, path):
        self._inner_file = None
        self._processes = []

        parseresult = urlparse.urlparse(path)
        if parseresult.scheme == 'ssh' and parseresult.username and parseresult.hostname and parseresult.path:
            # Stream the remote file over a non-interactive ssh session.
            command = [
                'ssh', '-q', '-o', 'NumberOfPasswordPrompts 0', '-T', '-k',
                '-l', parseresult.username,
                parseresult.hostname,
                parseresult.path.lstrip('/')
            ]
            if parseresult.port:
                command[1:1] = ['-P', str(parseresult.port)]
            ssh_process = subprocess.Popen(command, stdout=subprocess.PIPE)
            self._processes.append(ssh_process)
            self._file = ssh_process.stdout
        elif parseresult.scheme in ('http', 'https'):
            self._file = urllib.urlopen(path)
        elif os.path.exists(path):
            self._file = open(path, 'rb')
        else:
            raise IOError("Path '%s' not recognized" % path)

        if path.endswith('.gz'):
            # Built-in gzip module doesn't support streaming (fixed in Python 3.2)
            gzip_process = subprocess.Popen(['gzip', '-cd'], stdin=self._file, stdout=subprocess.PIPE)
            self._processes.append(gzip_process)
            self._file, self._inner_file = gzip_process.stdout, self._file

    def __getattr__(self, name):
        # Delegate all other file operations (read, readline, ...) to the
        # wrapped stream.
        return getattr(self._file, name)

    def close(self):
        """Close the stream (and the underlying file if gzip sits in
        between), then reap any helper subprocesses."""
        self._file.close()
        if self._inner_file:
            self._inner_file.close()
        for process in self._processes:
            process.wait()
84 84
85 85
def get_stats_files():
    """Yield [mirror_name, server_type, log_path] lists, one for each
    mirror_* option in the [stats] configuration section.

    Options whose value doesn't contain a space (i.e. lacks the
    "<server_type> <log_path>" shape) are reported to stderr and skipped.
    """
    config = get_config()

    prefix = 'mirror_'
    options = filter(lambda o: o.startswith(prefix), config.options('stats'))
    for option in options:
        if config.has_option('stats', option):
            value = config.get('stats', option)
            if ' ' in value:
                yield [option[len(prefix):]] + value.split(None, 1)
            else:
                print >>sys.stderr, "Option '%s' has invalid value: '%s'" % (option, value)
        else:
            print >>sys.stderr, "Option '%s' not found in the configuration" % option
101 101
102 def cache_lru(func): 102 def cache_lru(func):
103 """ 103 """
104 Decorator that memoizes the return values of a single-parameter function i n 104 Decorator that memoizes the return values of a single-parameter function i n
(...skipping 21 matching lines...) Expand all
126 results[arg] = result 126 results[arg] = result
127 return result 127 return result
128 return wrapped 128 return wrapped
129 129
130 130
def cache_last(func):
    """
    Decorator that memoizes the last return value of a function in case it is
    called again with the same parameters.
    """
    # [last seen args tuple, last result]; args is never None for a real
    # call, so the initial None forces computation on first use.
    last = [None, None]

    def wrapped(*args):
        if last[0] != args:
            last[1] = func(*args)
            last[0] = args
        return last[1]
    return wrapped
144 144
145 145
@cache_lru
def parse_ua(ua):
    """Map a raw User-Agent string to a (browser, version) tuple.

    Returns ('Other', '') when no known browser is recognized; version may
    be '' for products whose UA carries no usable version. Order of the
    checks is significant (see inline comments).
    """
    # Opera might disguise itself as other browser so it needs to go first
    match = re.search(r'\bOpera/([\d\.]+)', ua)
    if match:
        # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA
        match2 = re.search(r'\bVersion/([\d\.]+)', ua)
        if match2:
            return 'Opera', match2.group(1)
        else:
            return 'Opera', match.group(1)

    # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it
    match = re.search(r'\bOPR/(\d+\.\d+)', ua)
    if match:
        return 'Opera', match.group(1)

    # Have to check for these before Firefox, they will usually have a Firefox identifier as well
    match = re.search(r'\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)', ua)
    if match:
        if match.group(1) == 'Fennec':
            return 'Firefox Mobile', match.group(2)
        else:
            return match.group(1), match.group(2)

    match = re.search(r'\bFirefox/(\d+\.\d+)', ua)
    if match:
        if re.search(r'\bMobile;', ua):
            return 'Firefox Mobile', match.group(1)
        elif re.search(r'\bTablet;', ua):
            return 'Firefox Tablet', match.group(1)
        else:
            return 'Firefox', match.group(1)

    match = re.search(r'\brv:(\d+)\.(\d+)(?:\.(\d+))?', ua)
    if match and re.search(r'\bGecko/', ua):
        # Pre-Gecko-2 versions carried a meaningful third version component.
        if match.group(3) and int(match.group(1)) < 2:
            return 'Gecko', '%s.%s.%s' % (match.group(1), match.group(2), match.group(3))
        else:
            return 'Gecko', '%s.%s' % (match.group(1), match.group(2))

    match = re.search(r'\bCoolNovo/(\d+\.\d+\.\d+)', ua)
    if match:
        return 'CoolNovo', match.group(1)

    match = re.search(r'\bEdge/(\d+)\.\d+', ua)
    if match:
        return 'Edge', match.group(1)

    match = re.search(r'\bChrome/(\d+\.\d+)', ua)
    if match:
        return 'Chrome', match.group(1)

    match = re.search(r'\bVersion/(\d+\.\d+)', ua)
    if match and re.search(r'\bMobile Safari/', ua):
        return 'Mobile Safari', match.group(1)
    if match and re.search(r'\bSafari/', ua):
        return 'Safari', match.group(1)

    if re.search(r'\bAppleWebKit/', ua):
        return 'WebKit', ''

    match = re.search(r'\bMSIE (\d+\.\d+)', ua)
    if match:
        return 'MSIE', match.group(1)

    match = re.search(r'\bTrident/(\d+\.\d+)', ua)
    if match:
        # IE 11 drops the MSIE token but exposes its version as rv:xx.x.
        match2 = re.search(r'\brv:(\d+\.\d+)', ua)
        if match2:
            return 'MSIE', match2.group(1)
        else:
            return 'Trident', match.group(1)

    match = re.search(r'\bAndroidDownloadManager(?:/(\d+\.\d+))?', ua)
    if match:
        return 'Android', match.group(1) or ''

    match = re.search(r'\bDalvik/.*\bAndroid (\d+\.\d+)', ua)
    if match:
        return 'Android', match.group(1)

    # ABP/Android downloads use that user agent
    if ua.startswith('Apache-HttpClient/UNAVAILABLE'):
        return 'Android', ''

    # ABP/IE downloads use that user agent
    if ua == 'Adblock Plus':
        return 'ABP', ''

    return 'Other', ''
237 237
238 238
def process_ip(ip, geo, geov6):
    """Normalize an IP address and resolve its country code.

    IPv4-mapped IPv6 addresses (::ffff:a.b.c.d) are unwrapped to plain
    IPv4 first. The GeoIP lookup is best-effort: any resolver failure is
    logged and mapped to 'unknown', as are empty and '--' results.

    Returns (normalized_ip, lowercase_country_code).
    """
    match = re.search(r'^::ffff:(\d+\.\d+\.\d+\.\d+)$', ip)
    if match:
        ip = match.group(1)

    try:
        if ':' in ip:
            country = geov6.country_code_by_addr(ip)
        else:
            country = geo.country_code_by_addr(ip)
    except Exception:
        # Was a bare except; narrowed so KeyboardInterrupt/SystemExit still
        # propagate. A single failed lookup must not abort log processing.
        traceback.print_exc()
        country = ''

    if country in (None, '', '--'):
        country = 'unknown'
    country = country.lower()

    return ip, country
258 258
259 259
@cache_last
def parse_time(timestr, tz_hours, tz_minutes):
    """Parse an Apache log timestamp and shift it by the logged timezone
    offset (to UTC).

    Returns (datetime, 'YYYYMM' month string, day, weekday, hour).
    """
    result = datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S')
    # tz_minutes carries no sign of its own, copy it from tz_hours.
    result -= timedelta(hours=tz_hours, minutes=math.copysign(tz_minutes, tz_hours))
    return result, result.strftime('%Y%m'), result.day, result.weekday(), result.hour
265 265
266 266
@cache_lru
def parse_path(path):
    """Split a request path into (file path without leading slash, query).

    The path is percent-decoded and interpreted as UTF-8 where possible;
    on decode failure the raw path is kept.
    """
    urlparts = urlparse.urlparse(path)
    try:
        path = urllib.unquote(urlparts.path).decode('utf-8')
    except Exception:
        # Was a bare except; narrowed. Fall back to the undecoded path.
        path = urlparts.path
    return path[1:], urlparts.query
275 275
276 276
@cache_lru
def parse_query(query):
    # Memoized because many requests repeat identical query strings.
    return urlparse.parse_qs(query)
280 280
281 281
@cache_lru
def parse_lastversion(last_version):
    """Parse a lastVersion parameter ('YYYYMMDDHHMM', optionally with a
    '-<suffix>' appended) into a datetime. Raises ValueError on malformed
    input (handled by the caller)."""
    if '-' in last_version:
        last_version = last_version.split('-', 1)[0]
    return datetime.strptime(last_version, '%Y%m%d%H%M')
287 287
288 288
@cache_lru
def get_week(date):
    # (ISO year, ISO week number) pair, used to detect week boundaries.
    return date.isocalendar()[0:2]
292 292
293 293
def parse_downloader_query(info):
    """Annotate a parsed log record (in place) with downloader metadata:
    add-on/application/platform identity plus downloadInterval,
    previousDownload and the firstDownload/firstInDay/Week/Month flags
    derived from the lastVersion query parameter."""
    params = parse_query(info['query'])
    for param in ('addonName', 'addonVersion', 'application', 'applicationVersion', 'platform', 'platformVersion'):
        info[param] = params.get(param, ['unknown'])[0]

    # Only leave the major and minor release number for application and platform
    info['applicationVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['applicationVersion'])
    info['platformVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['platformVersion'])

    # Chrome Adblock sends an X-Client-ID header insteads of URL parameters
    match = re.match(r'^adblock/([\d\.]+)$', info['clientid'], re.I) if info['clientid'] else None
    if match:
        info['addonName'] = 'chromeadblock'
        info['addonVersion'] = match.group(1)

    last_version = params.get('lastVersion', ['unknown'])[0]
    if info['file'] == 'notification.json' and last_version == '0' and (
        (info['addonName'] == 'adblockplus' and info['addonVersion'] == '2.3.1') or
        (info['addonName'] in ('adblockpluschrome', 'adblockplusopera') and info['addonVersion'] == '1.5.2')
    ):
        # Broken notification version number in these releases, treat like unknown
        last_version = 'unknown'

    if last_version == 'unknown':
        info['downloadInterval'] = 'unknown'
        info['previousDownload'] = 'unknown'
    elif last_version == '0':
        # lastVersion=0 means this client never downloaded before.
        info['downloadInterval'] = 'unknown'
        info['previousDownload'] = 'unknown'
        info['firstDownload'] = True
    else:
        try:
            last_update = parse_lastversion(last_version)
            diff = info['time'] - last_update
            if diff.days >= 365:
                info['downloadInterval'] = '%i year(s)' % (diff.days / 365)
            elif diff.days >= 30:
                info['downloadInterval'] = '%i month(s)' % (diff.days / 30)
            elif diff.days >= 1:
                info['downloadInterval'] = '%i day(s)' % diff.days
            else:
                info['downloadInterval'] = '%i hour(s)' % (diff.seconds / 3600)

            if info['addonName'].startswith('adblockplus'):
                diffdays = (info['time'].date() - last_update.date()).days
                if diffdays == 0:
                    info['previousDownload'] = 'same day'
                elif diffdays < 30:
                    info['previousDownload'] = '%i day(s)' % diffdays
                elif diffdays < 365:
                    info['previousDownload'] = '%i month(s)' % (diffdays / 30)
                else:
                    info['previousDownload'] = '%i year(s)' % (diffdays / 365)
            else:
                info['previousDownload'] = 'unknown'

            if last_update.year != info['time'].year or last_update.month != info['time'].month:
                info['firstInMonth'] = info['firstInDay'] = True
            elif last_update.day != info['time'].day:
                info['firstInDay'] = True

            if get_week(last_update) != get_week(info['time']):
                info['firstInWeek'] = True
        except ValueError:
            # Malformed lastVersion timestamp, treat like unknown.
            # (Dropped a redundant trailing `pass` that followed these
            # assignments in the original.)
            info['downloadInterval'] = 'unknown'
            info['previousDownload'] = 'unknown'
361 361
362 362
def parse_addon_name(file):
    """Extract the add-on name from a download path: the second-to-last
    path segment, or None for paths without a directory component."""
    if '/' not in file:
        return None
    return file.split('/')[-2]
368 368
369 369
def parse_gecko_query(query):
    """Extract (addonVersion, application, applicationVersion) from a Gecko
    update.rdf query string."""
    params = urlparse.parse_qs(query)

    version = params.get('version', ['unknown'])[0]

    global gecko_apps
    if gecko_apps is None:
        # Lazy initialization: invert KNOWN_APPS to map app GUID -> app name.
        # (Was `== None`; identity comparison is the correct idiom here.)
        from buildtools.packagerGecko import KNOWN_APPS
        gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()}
    appID = params.get('appID', ['unknown'])[0]

    application = gecko_apps.get(appID, 'unknown')
    applicationVersion = params.get('appVersion', ['unknown'])[0]

    # Only leave the major and minor release number for application
    applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion)

    return version, application, applicationVersion
388 388
389 389
def parse_chrome_query(query):
    """Extract (version, application, applicationVersion) from a Chrome
    updates.xml query string."""
    params = urlparse.parse_qs(query)

    if params.get('prod', ['unknown'])[0] in ('chromecrx', 'chromiumcrx'):
        application = 'chrome'
    else:
        application = 'unknown'
    applicationVersion = params.get('prodversion', ['unknown'])[0]

    # The extension-specific data is url-encoded into the 'x' parameter.
    params2 = urlparse.parse_qs(params.get('x', [''])[0])
    version = params2.get('v', ['unknown'])[0]

    # Only leave the major and minor release number for application
    applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion)

    return version, application, applicationVersion
406 406
407 407
def parse_update_flag(query):
    """Classify a package download as an 'update' (query is exactly
    'update') or a fresh 'install' (anything else)."""
    if query == 'update':
        return 'update'
    return 'install'
410 410
411 411
def parse_record(line, ignored, geo, geov6):
    """Parse one access-log line into an info dict.

    Returns None for lines that don't match the log format, non-2xx/3xx
    statuses, or files we don't track (those paths are added to `ignored`).
    Dispatches to a type-specific query parser based on the requested file.
    """
    global log_regexp
    if log_regexp is None:
        # Compile lazily on first use; was `== None`.
        log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "([^"]*)" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?')

    match = re.search(log_regexp, line)
    if not match:
        return None

    status = int(match.group(6))
    if status not in (200, 301, 302):
        return None

    info = {
        'status': status,
        'size': int(match.group(7)),
    }

    info['ip'], info['country'] = process_ip(match.group(1), geo, geov6)
    info['time'], info['month'], info['day'], info['weekday'], info['hour'] = parse_time(match.group(2), int(match.group(3)), int(match.group(4)))
    info['file'], info['query'] = parse_path(match.group(5))
    info['referrer'] = match.group(8)
    info['ua'], info['uaversion'] = parse_ua(match.group(9))
    info['fullua'] = '%s %s' % (info['ua'], info['uaversion'])
    info['clientid'] = match.group(10)

    # Additional metadata depends on file type
    filename = os.path.basename(info['file'])
    ext = os.path.splitext(filename)[1]
    if ext == '.txt' or filename == 'update.json' or filename == 'notification.json':
        # Subscription downloads, libadblockplus update checks and notification
        # checks are performed by the downloader
        parse_downloader_query(info)
    elif ext == '.tpl':
        # MSIE TPL download, no additional data here
        pass
    elif ext in ('.xpi', '.crx', '.apk', '.msi', '.exe', '.safariextz'):
        # Package download, might be an update
        info['installType'] = parse_update_flag(info['query'])
    elif filename == 'update.rdf':
        # Gecko update check or a legacy Android update check. The latter doesn't
        # have usable data anyway so trying the Chrome route won't do any harm.
        info['addonName'] = parse_addon_name(info['file'])
        info['addonVersion'], info['application'], info['applicationVersion'] = parse_gecko_query(info['query'])
    elif filename == 'updates.xml':
        # Chrome update check
        info['addonName'] = parse_addon_name(info['file'])
        info['addonVersion'], info['application'], info['applicationVersion'] = parse_chrome_query(info['query'])
    elif filename == 'updates.plist':
        # Safari update check, no additional data
        pass
    else:
        ignored.add(info['file'])
        return None

    # Pre-compute combined "name version" keys used by the aggregation.
    if 'addonName' in info:
        info['fullAddon'] = '%s %s' % (info['addonName'], info['addonVersion'])
    if 'application' in info:
        info['fullApplication'] = '%s %s' % (info['application'], info['applicationVersion'])
    if 'platform' in info:
        info['fullPlatform'] = '%s %s' % (info['platform'], info['platformVersion'])
    return info
474 474
475 475
def add_record(info, section, ignore_fields=()):
    """Recursively accumulate one record into the nested stats structure.

    Every level counts hits and bandwidth; below that, values are broken
    down by each field from common.fields, at most two fields deep
    (the `len(ignore_fields) < 2` guard limits recursion depth).
    """
    section['hits'] = section.get('hits', 0) + 1
    section['bandwidth'] = section.get('bandwidth', 0) + info['size']

    if len(ignore_fields) < 2:
        for field in map(lambda f: f['name'], common.fields):
            if field in ignore_fields or field not in info:
                continue

            value = info[field]
            if field not in section:
                section[field] = {}
            if value not in section[field]:
                section[field][value] = {}

            add_record(info, section[field][value], ignore_fields + (field,))
492 492
493 493
def parse_fileobj(mirror_name, fileobj, geo, geov6, ignored):
    """Parse every log line from fileobj into a nested stats dict keyed by
    month, then by requested file. Unparseable/uninteresting lines are
    skipped (unknown files accumulate in `ignored`)."""
    data = {}
    for line in fileobj:
        info = parse_record(line, ignored, geo, geov6)
        if info is None:
            # Was `== None`; identity comparison is the correct idiom.
            continue

        info['mirror'] = mirror_name
        if info['month'] not in data:
            data[info['month']] = {}
        section = data[info['month']]

        if info['file'] not in section:
            section[info['file']] = {}
        section = section[info['file']]

        add_record(info, section)
    return data
512 512
513 513
def merge_objects(object1, object2, factor=1):
    """Recursively merge object2 into object1 (in place), multiplying
    numeric leaf values by factor (pass factor=-1 to subtract)."""
    for key, value in object2.iteritems():
        try:
            key = unicode(key)
        except UnicodeDecodeError:
            # Byte-string keys that aren't valid UTF-8 (e.g. odd request
            # paths); latin-1 always decodes.
            key = unicode(key, encoding='latin-1')
        if isinstance(value, numbers.Number):
            object1[key] = object1.get(key, 0) + factor * value
        else:
            merge_objects(object1.setdefault(key, {}), value, factor)
524 524
525 525
def save_stats(server_type, data, factor=1):
    """Merge parsed stats into the on-disk JSON database.

    server_type -- server category (e.g. download, update), used as a
                   directory name under the configured data directory.
    data        -- {month: {file: stats}} as produced by parse_fileobj().
    factor      -- scale applied to the new numbers; -1 reverts a
                   previously merged log file.
    """
    base_dir = os.path.join(get_config().get('stats', 'dataDirectory'),
                            common.filename_encode(server_type))
    for month, month_data in data.iteritems():
        for name, file_data in month_data.iteritems():
            path = os.path.join(base_dir, common.filename_encode(month),
                                common.filename_encode(name + '.json'))
            if os.path.exists(path):
                with codecs.open(path, 'rb', encoding='utf-8') as fileobj:
                    existing = json.load(fileobj)
            else:
                existing = {}

            merge_objects(existing, file_data, factor)

            # Renamed from "dir" to stop shadowing the dir() builtin.
            dir_path = os.path.dirname(path)
            try:
                os.makedirs(dir_path)
            except OSError as e:  # "as" form is valid on Python 2.6+ and 3
                # Directory already existing is fine; anything else is not.
                if e.errno != errno.EEXIST:
                    raise

            with codecs.open(path, 'wb', encoding='utf-8') as fileobj:
                json.dump(existing, fileobj, indent=2, sort_keys=True)
548 548
549 549
def parse_source(factor, lock, source):
    """Pool worker: parse a single log source and merge it into the stats.

    source is a (mirror_name, server_type, log_file) tuple.  The former
    tuple-unpacking parameter syntax was Python-2-only (removed by
    PEP 3113); callers still pass the same tuple positionally, so the
    interface is unchanged.

    Returns (log_file, ignored-paths-set) on success, (None, None) on
    failure -- errors are printed rather than raised so one broken log
    file does not abort the whole batch.
    """
    mirror_name, server_type, log_file = source
    try:
        geo = pygeoip.GeoIP(get_config().get('stats', 'geoip_db'), pygeoip.MEMORY_CACHE)
        geov6 = pygeoip.GeoIP(get_config().get('stats', 'geoipv6_db'), pygeoip.MEMORY_CACHE)

        ignored = set()
        fileobj = StatsFile(log_file)
        try:
            data = parse_fileobj(mirror_name, fileobj, geo, geov6, ignored)
        finally:
            fileobj.close()

        # Serialize database writes across worker processes.
        with lock:
            save_stats(server_type, data, factor)
        return log_file, ignored
    except Exception:
        # Narrowed from a bare "except:" so Ctrl-C / SystemExit still
        # propagate; any real error is reported and skipped.
        sys.stderr.write("Unable to process log file '%s'\n" % log_file)
        traceback.print_exc()
        return None, None
572 572
573 573
574 def parse_sources(sources, factor=1, verbose=False): 574 def parse_sources(sources, factor=1, verbose=False):
575 pool = multiprocessing.Pool() 575 pool = multiprocessing.Pool()
576 lock = multiprocessing.Manager().Lock() 576 lock = multiprocessing.Manager().Lock()
577 callback = functools.partial(parse_source, factor, lock) 577 callback = functools.partial(parse_source, factor, lock)
578 try: 578 try:
579 for log_file, ignored in pool.imap_unordered(callback, sources, chunksiz e=1): 579 for log_file, ignored in pool.imap_unordered(callback, sources, chunksiz e=1):
580 if verbose and ignored: 580 if verbose and ignored:
581 print "Ignored files for %s" % log_file 581 print 'Ignored files for %s' % log_file
582 print "========================================================= ===" 582 print '========================================================= ==='
583 print "\n".join(sorted(ignored)) 583 print '\n'.join(sorted(ignored))
584 finally: 584 finally:
585 pool.close() 585 pool.close()
586 586
if __name__ == '__main__':
    setupStderr()

    # Command line: either process one explicitly named log file, or
    # fall back to every configured stats file.
    parser = argparse.ArgumentParser(description='Processes log files and merges them into the stats database')
    parser.add_argument('--verbose', dest='verbose', action='store_const', const=True, default=False, help='Verbose mode, ignored requests will be listed')
    parser.add_argument('--revert', dest='factor', action='store_const', const=-1, default=1, help='Remove log data from the database')
    for option, description in [
        ('mirror_name', 'Name of the mirror server that the file belongs to'),
        ('server_type', 'Server type like download, update or subscription'),
        ('log_file', 'Log file path, can be a local file path, http:// or ssh:// URL'),
    ]:
        parser.add_argument(option, nargs='?', help=description)
    args = parser.parse_args()

    if not (args.mirror_name and args.server_type and args.log_file):
        sources = get_stats_files()
    else:
        sources = [(args.mirror_name, args.server_type, args.log_file)]
    parse_sources(sources, args.factor, args.verbose)
OLDNEW

Powered by Google App Engine
This is Rietveld