| OLD | NEW |
| 1 # This file is part of the Adblock Plus web scripts, | 1 # This file is part of the Adblock Plus web scripts, |
| 2 # Copyright (C) 2006-2016 Eyeo GmbH | 2 # Copyright (C) 2006-2016 Eyeo GmbH |
| 3 # | 3 # |
| 4 # Adblock Plus is free software: you can redistribute it and/or modify | 4 # Adblock Plus is free software: you can redistribute it and/or modify |
| 5 # it under the terms of the GNU General Public License version 3 as | 5 # it under the terms of the GNU General Public License version 3 as |
| 6 # published by the Free Software Foundation. | 6 # published by the Free Software Foundation. |
| 7 # | 7 # |
| 8 # Adblock Plus is distributed in the hope that it will be useful, | 8 # Adblock Plus is distributed in the hope that it will be useful, |
| 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| (...skipping 29 matching lines...) |
| 40 log_regexp = None | 40 log_regexp = None |
| 41 gecko_apps = None | 41 gecko_apps = None |
| 42 | 42 |
| 43 | 43 |
| 44 class StatsFile: | 44 class StatsFile: |
| 45 def __init__(self, path): | 45 def __init__(self, path): |
| 46 self._inner_file = None | 46 self._inner_file = None |
| 47 self._processes = [] | 47 self._processes = [] |
| 48 | 48 |
| 49 parseresult = urlparse.urlparse(path) | 49 parseresult = urlparse.urlparse(path) |
| 50 if parseresult.scheme == "ssh" and parseresult.username and parseresult.hostname and parseresult.path: | 50 if parseresult.scheme == 'ssh' and parseresult.username and parseresult.hostname and parseresult.path: |
| 51 command = [ | 51 command = [ |
| 52 "ssh", "-q", "-o", "NumberOfPasswordPrompts 0", "-T", "-k", | 52 'ssh', '-q', '-o', 'NumberOfPasswordPrompts 0', '-T', '-k', |
| 53 "-l", parseresult.username, | 53 '-l', parseresult.username, |
| 54 parseresult.hostname, | 54 parseresult.hostname, |
| 55 parseresult.path.lstrip("/") | 55 parseresult.path.lstrip('/') |
| 56 ] | 56 ] |
| 57 if parseresult.port: | 57 if parseresult.port: |
| 58 command[1:1] = ["-P", str(parseresult.port)] | 58 command[1:1] = ['-P', str(parseresult.port)] |
| 59 ssh_process = subprocess.Popen(command, stdout=subprocess.PIPE) | 59 ssh_process = subprocess.Popen(command, stdout=subprocess.PIPE) |
| 60 self._processes.append(ssh_process) | 60 self._processes.append(ssh_process) |
| 61 self._file = ssh_process.stdout | 61 self._file = ssh_process.stdout |
| 62 elif parseresult.scheme in ("http", "https"): | 62 elif parseresult.scheme in ('http', 'https'): |
| 63 self._file = urllib.urlopen(path) | 63 self._file = urllib.urlopen(path) |
| 64 elif os.path.exists(path): | 64 elif os.path.exists(path): |
| 65 self._file = open(path, "rb") | 65 self._file = open(path, 'rb') |
| 66 else: | 66 else: |
| 67 raise IOError("Path '%s' not recognized" % path) | 67 raise IOError("Path '%s' not recognized" % path) |
| 68 | 68 |
| 69 if path.endswith(".gz"): | 69 if path.endswith('.gz'): |
| 70 # Built-in gzip module doesn't support streaming (fixed in Python 3.2) | 70 # Built-in gzip module doesn't support streaming (fixed in Python 3.2) |
| 71 gzip_process = subprocess.Popen(["gzip", "-cd"], stdin=self._file, stdout=subprocess.PIPE) | 71 gzip_process = subprocess.Popen(['gzip', '-cd'], stdin=self._file, stdout=subprocess.PIPE) |
| 72 self._processes.append(gzip_process) | 72 self._processes.append(gzip_process) |
| 73 self._file, self._inner_file = gzip_process.stdout, self._file | 73 self._file, self._inner_file = gzip_process.stdout, self._file |
| 74 | 74 |
| 75 def __getattr__(self, name): | 75 def __getattr__(self, name): |
| 76 return getattr(self._file, name) | 76 return getattr(self._file, name) |
| 77 | 77 |
| 78 def close(self): | 78 def close(self): |
| 79 self._file.close() | 79 self._file.close() |
| 80 if self._inner_file: | 80 if self._inner_file: |
| 81 self._inner_file.close() | 81 self._inner_file.close() |
| 82 for process in self._processes: | 82 for process in self._processes: |
| 83 process.wait() | 83 process.wait() |
| 84 | 84 |
| 85 | 85 |
| 86 def get_stats_files(): | 86 def get_stats_files(): |
| 87 config = get_config() | 87 config = get_config() |
| 88 | 88 |
| 89 prefix = "mirror_" | 89 prefix = 'mirror_' |
| 90 options = filter(lambda o: o.startswith(prefix), config.options("stats")) | 90 options = filter(lambda o: o.startswith(prefix), config.options('stats')) |
| 91 for option in options: | 91 for option in options: |
| 92 if config.has_option("stats", option): | 92 if config.has_option('stats', option): |
| 93 value = config.get("stats", option) | 93 value = config.get('stats', option) |
| 94 if " " in value: | 94 if ' ' in value: |
| 95 yield [option[len(prefix):]] + value.split(None, 1) | 95 yield [option[len(prefix):]] + value.split(None, 1) |
| 96 else: | 96 else: |
| 97 print >>sys.stderr, "Option '%s' has invalid value: '%s'" % (option, value) | 97 print >>sys.stderr, "Option '%s' has invalid value: '%s'" % (option, value) |
| 98 else: | 98 else: |
| 99 print >>sys.stderr, "Option '%s' not found in the configuration" % option | 99 print >>sys.stderr, "Option '%s' not found in the configuration" % option |
| 100 | 100 |
| 101 | 101 |
| 102 def cache_lru(func): | 102 def cache_lru(func): |
| 103 """ | 103 """ |
| 104 Decorator that memoizes the return values of a single-parameter function in | 104 Decorator that memoizes the return values of a single-parameter function in |
| (...skipping 21 matching lines...) |
| 126 results[arg] = result | 126 results[arg] = result |
| 127 return result | 127 return result |
| 128 return wrapped | 128 return wrapped |
| 129 | 129 |
| 130 | 130 |
| 131 def cache_last(func): | 131 def cache_last(func): |
| 132 """ | 132 """ |
| 133 Decorator that memoizes the last return value of a function in case it is | 133 Decorator that memoizes the last return value of a function in case it is |
| 134 called again with the same parameters. | 134 called again with the same parameters. |
| 135 """ | 135 """ |
| 136 result = {"args": None, "result": None} | 136 result = {'args': None, 'result': None} |
| 137 | 137 |
| 138 def wrapped(*args): | 138 def wrapped(*args): |
| 139 if args != result["args"]: | 139 if args != result['args']: |
| 140 result["result"] = func(*args) | 140 result['result'] = func(*args) |
| 141 result["args"] = args | 141 result['args'] = args |
| 142 return result["result"] | 142 return result['result'] |
| 143 return wrapped | 143 return wrapped |
| 144 | 144 |
| 145 | 145 |
| 146 @cache_lru | 146 @cache_lru |
| 147 def parse_ua(ua): | 147 def parse_ua(ua): |
| 148 # Opera might disguise itself as another browser, so it needs to go first | 148 # Opera might disguise itself as another browser, so it needs to go first |
| 149 match = re.search(r"\bOpera/([\d\.]+)", ua) | 149 match = re.search(r'\bOpera/([\d\.]+)', ua) |
| 150 if match: | 150 if match: |
| 151 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA | 151 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA |
| 152 match2 = re.search(r"\bVersion/([\d\.]+)", ua) | 152 match2 = re.search(r'\bVersion/([\d\.]+)', ua) |
| 153 if match2: | 153 if match2: |
| 154 return "Opera", match2.group(1) | 154 return 'Opera', match2.group(1) |
| 155 else: | 155 else: |
| 156 return "Opera", match.group(1) | 156 return 'Opera', match.group(1) |
| 157 | 157 |
| 158 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it | 158 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it |
| 159 match = re.search(r"\bOPR/(\d+\.\d+)", ua) | 159 match = re.search(r'\bOPR/(\d+\.\d+)', ua) |
| 160 if match: | 160 if match: |
| 161 return "Opera", match.group(1) | 161 return 'Opera', match.group(1) |
| 162 | 162 |
| 163 # Have to check for these before Firefox, they will usually have a Firefox identifier as well | 163 # Have to check for these before Firefox, they will usually have a Firefox identifier as well |
| 164 match = re.search(r"\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)", ua) | 164 match = re.search(r'\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)', ua) |
| 165 if match: | 165 if match: |
| 166 if match.group(1) == "Fennec": | 166 if match.group(1) == 'Fennec': |
| 167 return "Firefox Mobile", match.group(2) | 167 return 'Firefox Mobile', match.group(2) |
| 168 else: | 168 else: |
| 169 return match.group(1), match.group(2) | 169 return match.group(1), match.group(2) |
| 170 | 170 |
| 171 match = re.search(r"\bFirefox/(\d+\.\d+)", ua) | 171 match = re.search(r'\bFirefox/(\d+\.\d+)', ua) |
| 172 if match: | 172 if match: |
| 173 if re.search(r"\bMobile;", ua): | 173 if re.search(r'\bMobile;', ua): |
| 174 return "Firefox Mobile", match.group(1) | 174 return 'Firefox Mobile', match.group(1) |
| 175 elif re.search(r"\bTablet;", ua): | 175 elif re.search(r'\bTablet;', ua): |
| 176 return "Firefox Tablet", match.group(1) | 176 return 'Firefox Tablet', match.group(1) |
| 177 else: | 177 else: |
| 178 return "Firefox", match.group(1) | 178 return 'Firefox', match.group(1) |
| 179 | 179 |
| 180 match = re.search(r"\brv:(\d+)\.(\d+)(?:\.(\d+))?", ua) | 180 match = re.search(r'\brv:(\d+)\.(\d+)(?:\.(\d+))?', ua) |
| 181 if match and re.search(r"\bGecko/", ua): | 181 if match and re.search(r'\bGecko/', ua): |
| 182 if match.group(3) and int(match.group(1)) < 2: | 182 if match.group(3) and int(match.group(1)) < 2: |
| 183 return "Gecko", "%s.%s.%s" % (match.group(1), match.group(2), match.
group(3)) | 183 return 'Gecko', '%s.%s.%s' % (match.group(1), match.group(2), match.
group(3)) |
| 184 else: | 184 else: |
| 185 return "Gecko", "%s.%s" % (match.group(1), match.group(2)) | 185 return 'Gecko', '%s.%s' % (match.group(1), match.group(2)) |
| 186 | 186 |
| 187 match = re.search(r"\bCoolNovo/(\d+\.\d+\.\d+)", ua) | 187 match = re.search(r'\bCoolNovo/(\d+\.\d+\.\d+)', ua) |
| 188 if match: | 188 if match: |
| 189 return "CoolNovo", match.group(1) | 189 return 'CoolNovo', match.group(1) |
| 190 | 190 |
| 191 match = re.search(r"\bEdge/(\d+)\.\d+", ua) | 191 match = re.search(r'\bEdge/(\d+)\.\d+', ua) |
| 192 if match: | 192 if match: |
| 193 return "Edge", match.group(1) | 193 return 'Edge', match.group(1) |
| 194 | 194 |
| 195 match = re.search(r"\bChrome/(\d+\.\d+)", ua) | 195 match = re.search(r'\bChrome/(\d+\.\d+)', ua) |
| 196 if match: | 196 if match: |
| 197 return "Chrome", match.group(1) | 197 return 'Chrome', match.group(1) |
| 198 | 198 |
| 199 match = re.search(r"\bVersion/(\d+\.\d+)", ua) | 199 match = re.search(r'\bVersion/(\d+\.\d+)', ua) |
| 200 if match and re.search(r"\bMobile Safari/", ua): | 200 if match and re.search(r'\bMobile Safari/', ua): |
| 201 return "Mobile Safari", match.group(1) | 201 return 'Mobile Safari', match.group(1) |
| 202 if match and re.search(r"\bSafari/", ua): | 202 if match and re.search(r'\bSafari/', ua): |
| 203 return "Safari", match.group(1) | 203 return 'Safari', match.group(1) |
| 204 | 204 |
| 205 if re.search(r"\bAppleWebKit/", ua): | 205 if re.search(r'\bAppleWebKit/', ua): |
| 206 return "WebKit", "" | 206 return 'WebKit', '' |
| 207 | 207 |
| 208 match = re.search(r"\bMSIE (\d+\.\d+)", ua) | 208 match = re.search(r'\bMSIE (\d+\.\d+)', ua) |
| 209 if match: | 209 if match: |
| 210 return "MSIE", match.group(1) | 210 return 'MSIE', match.group(1) |
| 211 | 211 |
| 212 match = re.search(r"\bTrident/(\d+\.\d+)", ua) | 212 match = re.search(r'\bTrident/(\d+\.\d+)', ua) |
| 213 if match: | 213 if match: |
| 214 match2 = re.search(r"\brv:(\d+\.\d+)", ua) | 214 match2 = re.search(r'\brv:(\d+\.\d+)', ua) |
| 215 if match2: | 215 if match2: |
| 216 return "MSIE", match2.group(1) | 216 return 'MSIE', match2.group(1) |
| 217 else: | 217 else: |
| 218 return "Trident", match.group(1) | 218 return 'Trident', match.group(1) |
| 219 | 219 |
| 220 match = re.search(r"\bAndroidDownloadManager(?:/(\d+\.\d+))?", ua) | 220 match = re.search(r'\bAndroidDownloadManager(?:/(\d+\.\d+))?', ua) |
| 221 if match: | 221 if match: |
| 222 return "Android", match.group(1) or "" | 222 return 'Android', match.group(1) or '' |
| 223 | 223 |
| 224 match = re.search(r"\bDalvik/.*\bAndroid (\d+\.\d+)", ua) | 224 match = re.search(r'\bDalvik/.*\bAndroid (\d+\.\d+)', ua) |
| 225 if match: | 225 if match: |
| 226 return "Android", match.group(1) | 226 return 'Android', match.group(1) |
| 227 | 227 |
| 228 # ABP/Android downloads use that user agent | 228 # ABP/Android downloads use that user agent |
| 229 if ua.startswith("Apache-HttpClient/UNAVAILABLE"): | 229 if ua.startswith('Apache-HttpClient/UNAVAILABLE'): |
| 230 return "Android", "" | 230 return 'Android', '' |
| 231 | 231 |
| 232 # ABP/IE downloads use that user agent | 232 # ABP/IE downloads use that user agent |
| 233 if ua == "Adblock Plus": | 233 if ua == 'Adblock Plus': |
| 234 return "ABP", "" | 234 return 'ABP', '' |
| 235 | 235 |
| 236 return "Other", "" | 236 return 'Other', '' |
| 237 | 237 |
| 238 | 238 |
| 239 def process_ip(ip, geo, geov6): | 239 def process_ip(ip, geo, geov6): |
| 240 match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip) | 240 match = re.search(r'^::ffff:(\d+\.\d+\.\d+\.\d+)$', ip) |
| 241 if match: | 241 if match: |
| 242 ip = match.group(1) | 242 ip = match.group(1) |
| 243 | 243 |
| 244 try: | 244 try: |
| 245 if ":" in ip: | 245 if ':' in ip: |
| 246 country = geov6.country_code_by_addr(ip) | 246 country = geov6.country_code_by_addr(ip) |
| 247 else: | 247 else: |
| 248 country = geo.country_code_by_addr(ip) | 248 country = geo.country_code_by_addr(ip) |
| 249 except: | 249 except: |
| 250 traceback.print_exc() | 250 traceback.print_exc() |
| 251 country = "" | 251 country = '' |
| 252 | 252 |
| 253 if country in (None, "", "--"): | 253 if country in (None, '', '--'): |
| 254 country = "unknown" | 254 country = 'unknown' |
| 255 country = country.lower() | 255 country = country.lower() |
| 256 | 256 |
| 257 return ip, country | 257 return ip, country |
| 258 | 258 |
| 259 | 259 |
| 260 @cache_last | 260 @cache_last |
| 261 def parse_time(timestr, tz_hours, tz_minutes): | 261 def parse_time(timestr, tz_hours, tz_minutes): |
| 262 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S") | 262 result = datetime.strptime(timestr, '%d/%b/%Y:%H:%M:%S') |
| 263 result -= timedelta(hours=tz_hours, minutes=math.copysign(tz_minutes, tz_hours)) | 263 result -= timedelta(hours=tz_hours, minutes=math.copysign(tz_minutes, tz_hours)) |
| 264 return result, result.strftime("%Y%m"), result.day, result.weekday(), result.hour | 264 return result, result.strftime('%Y%m'), result.day, result.weekday(), result.hour |
| 265 | 265 |
| 266 | 266 |
| 267 @cache_lru | 267 @cache_lru |
| 268 def parse_path(path): | 268 def parse_path(path): |
| 269 urlparts = urlparse.urlparse(path) | 269 urlparts = urlparse.urlparse(path) |
| 270 try: | 270 try: |
| 271 path = urllib.unquote(urlparts.path).decode("utf-8") | 271 path = urllib.unquote(urlparts.path).decode('utf-8') |
| 272 except: | 272 except: |
| 273 path = urlparts.path | 273 path = urlparts.path |
| 274 return path[1:], urlparts.query | 274 return path[1:], urlparts.query |
| 275 | 275 |
| 276 | 276 |
| 277 @cache_lru | 277 @cache_lru |
| 278 def parse_query(query): | 278 def parse_query(query): |
| 279 return urlparse.parse_qs(query) | 279 return urlparse.parse_qs(query) |
| 280 | 280 |
| 281 | 281 |
| 282 @cache_lru | 282 @cache_lru |
| 283 def parse_lastversion(last_version): | 283 def parse_lastversion(last_version): |
| 284 if '-' in last_version: | 284 if '-' in last_version: |
| 285 last_version = last_version.split('-', 1)[0] | 285 last_version = last_version.split('-', 1)[0] |
| 286 return datetime.strptime(last_version, "%Y%m%d%H%M") | 286 return datetime.strptime(last_version, '%Y%m%d%H%M') |
| 287 | 287 |
| 288 | 288 |
| 289 @cache_lru | 289 @cache_lru |
| 290 def get_week(date): | 290 def get_week(date): |
| 291 return date.isocalendar()[0:2] | 291 return date.isocalendar()[0:2] |
| 292 | 292 |
| 293 | 293 |
| 294 def parse_downloader_query(info): | 294 def parse_downloader_query(info): |
| 295 params = parse_query(info["query"]) | 295 params = parse_query(info['query']) |
| 296 for param in ("addonName", "addonVersion", "application", "applicationVersion", "platform", "platformVersion"): | 296 for param in ('addonName', 'addonVersion', 'application', 'applicationVersion', 'platform', 'platformVersion'): |
| 297 info[param] = params.get(param, ["unknown"])[0] | 297 info[param] = params.get(param, ['unknown'])[0] |
| 298 | 298 |
| 299 # Only leave the major and minor release number for application and platform | 299 # Only leave the major and minor release number for application and platform |
| 300 info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["applicati
onVersion"]) | 300 info['applicationVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['applicati
onVersion']) |
| 301 info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVers
ion"]) | 301 info['platformVersion'] = re.sub(r'^(\d+\.\d+).*', r'\1', info['platformVers
ion']) |
| 302 | 302 |
| 303 # Chrome Adblock sends an X-Client-ID header instead of URL parameters | 303 # Chrome Adblock sends an X-Client-ID header instead of URL parameters |
| 304 match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clientid"] else None | 304 match = re.match(r'^adblock/([\d\.]+)$', info['clientid'], re.I) if info['clientid'] else None |
| 305 if match: | 305 if match: |
| 306 info["addonName"] = "chromeadblock" | 306 info['addonName'] = 'chromeadblock' |
| 307 info["addonVersion"] = match.group(1) | 307 info['addonVersion'] = match.group(1) |
| 308 | 308 |
| 309 last_version = params.get("lastVersion", ["unknown"])[0] | 309 last_version = params.get('lastVersion', ['unknown'])[0] |
| 310 if info["file"] == "notification.json" and last_version == "0" and ( | 310 if info['file'] == 'notification.json' and last_version == '0' and ( |
| 311 (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1")
or | 311 (info['addonName'] == 'adblockplus' and info['addonVersion'] == '2.3.1')
or |
| 312 (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info
["addonVersion"] == "1.5.2") | 312 (info['addonName'] in ('adblockpluschrome', 'adblockplusopera') and info
['addonVersion'] == '1.5.2') |
| 313 ): | 313 ): |
| 314 # Broken notification version number in these releases, treat like unknown | 314 # Broken notification version number in these releases, treat like unknown |
| 315 last_version = "unknown" | 315 last_version = 'unknown' |
| 316 | 316 |
| 317 if last_version == "unknown": | 317 if last_version == 'unknown': |
| 318 info["downloadInterval"] = "unknown" | 318 info['downloadInterval'] = 'unknown' |
| 319 info["previousDownload"] = "unknown" | 319 info['previousDownload'] = 'unknown' |
| 320 elif last_version == "0": | 320 elif last_version == '0': |
| 321 info["downloadInterval"] = "unknown" | 321 info['downloadInterval'] = 'unknown' |
| 322 info["previousDownload"] = "unknown" | 322 info['previousDownload'] = 'unknown' |
| 323 info["firstDownload"] = True | 323 info['firstDownload'] = True |
| 324 else: | 324 else: |
| 325 try: | 325 try: |
| 326 last_update = parse_lastversion(last_version) | 326 last_update = parse_lastversion(last_version) |
| 327 diff = info["time"] - last_update | 327 diff = info['time'] - last_update |
| 328 if diff.days >= 365: | 328 if diff.days >= 365: |
| 329 info["downloadInterval"] = "%i year(s)" % (diff.days / 365) | 329 info['downloadInterval'] = '%i year(s)' % (diff.days / 365) |
| 330 elif diff.days >= 30: | 330 elif diff.days >= 30: |
| 331 info["downloadInterval"] = "%i month(s)" % (diff.days / 30) | 331 info['downloadInterval'] = '%i month(s)' % (diff.days / 30) |
| 332 elif diff.days >= 1: | 332 elif diff.days >= 1: |
| 333 info["downloadInterval"] = "%i day(s)" % diff.days | 333 info['downloadInterval'] = '%i day(s)' % diff.days |
| 334 else: | 334 else: |
| 335 info["downloadInterval"] = "%i hour(s)" % (diff.seconds / 3600) | 335 info['downloadInterval'] = '%i hour(s)' % (diff.seconds / 3600) |
| 336 | 336 |
| 337 if info["addonName"].startswith("adblockplus"): | 337 if info['addonName'].startswith('adblockplus'): |
| 338 diffdays = (info["time"].date() - last_update.date()).days | 338 diffdays = (info['time'].date() - last_update.date()).days |
| 339 if diffdays == 0: | 339 if diffdays == 0: |
| 340 info["previousDownload"] = "same day" | 340 info['previousDownload'] = 'same day' |
| 341 elif diffdays < 30: | 341 elif diffdays < 30: |
| 342 info["previousDownload"] = "%i day(s)" % diffdays | 342 info['previousDownload'] = '%i day(s)' % diffdays |
| 343 elif diffdays < 365: | 343 elif diffdays < 365: |
| 344 info["previousDownload"] = "%i month(s)" % (diffdays / 30) | 344 info['previousDownload'] = '%i month(s)' % (diffdays / 30) |
| 345 else: | 345 else: |
| 346 info["previousDownload"] = "%i year(s)" % (diffdays / 365) | 346 info['previousDownload'] = '%i year(s)' % (diffdays / 365) |
| 347 else: | 347 else: |
| 348 info["previousDownload"] = "unknown" | 348 info['previousDownload'] = 'unknown' |
| 349 | 349 |
| 350 if last_update.year != info["time"].year or last_update.month != info["time"].month: | 350 if last_update.year != info['time'].year or last_update.month != info['time'].month: |
| 351 info["firstInMonth"] = info["firstInDay"] = True | 351 info['firstInMonth'] = info['firstInDay'] = True |
| 352 elif last_update.day != info["time"].day: | 352 elif last_update.day != info['time'].day: |
| 353 info["firstInDay"] = True | 353 info['firstInDay'] = True |
| 354 | 354 |
| 355 if get_week(last_update) != get_week(info["time"]): | 355 if get_week(last_update) != get_week(info['time']): |
| 356 info["firstInWeek"] = True | 356 info['firstInWeek'] = True |
| 357 except ValueError: | 357 except ValueError: |
| 358 info["downloadInterval"] = "unknown" | 358 info['downloadInterval'] = 'unknown' |
| 359 info["previousDownload"] = "unknown" | 359 info['previousDownload'] = 'unknown' |
| 360 pass | 360 pass |
| 361 | 361 |
| 362 | 362 |
| 363 def parse_addon_name(file): | 363 def parse_addon_name(file): |
| 364 if "/" in file: | 364 if '/' in file: |
| 365 return file.split("/")[-2] | 365 return file.split('/')[-2] |
| 366 else: | 366 else: |
| 367 return None | 367 return None |
| 368 | 368 |
| 369 | 369 |
| 370 def parse_gecko_query(query): | 370 def parse_gecko_query(query): |
| 371 params = urlparse.parse_qs(query) | 371 params = urlparse.parse_qs(query) |
| 372 | 372 |
| 373 version = params.get("version", ["unknown"])[0] | 373 version = params.get('version', ['unknown'])[0] |
| 374 | 374 |
| 375 global gecko_apps | 375 global gecko_apps |
| 376 if gecko_apps == None: | 376 if gecko_apps == None: |
| 377 from buildtools.packagerGecko import KNOWN_APPS | 377 from buildtools.packagerGecko import KNOWN_APPS |
| 378 gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()} | 378 gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()} |
| 379 appID = params.get("appID", ["unknown"])[0] | 379 appID = params.get('appID', ['unknown'])[0] |
| 380 | 380 |
| 381 application = gecko_apps.get(appID, "unknown") | 381 application = gecko_apps.get(appID, 'unknown') |
| 382 applicationVersion = params.get("appVersion", ["unknown"])[0] | 382 applicationVersion = params.get('appVersion', ['unknown'])[0] |
| 383 | 383 |
| 384 # Only leave the major and minor release number for application | 384 # Only leave the major and minor release number for application |
| 385 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) | 385 applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion) |
| 386 | 386 |
| 387 return version, application, applicationVersion | 387 return version, application, applicationVersion |
| 388 | 388 |
| 389 | 389 |
| 390 def parse_chrome_query(query): | 390 def parse_chrome_query(query): |
| 391 params = urlparse.parse_qs(query) | 391 params = urlparse.parse_qs(query) |
| 392 | 392 |
| 393 if params.get("prod", ["unknown"])[0] in ("chromecrx", "chromiumcrx"): | 393 if params.get('prod', ['unknown'])[0] in ('chromecrx', 'chromiumcrx'): |
| 394 application = "chrome" | 394 application = 'chrome' |
| 395 else: | 395 else: |
| 396 application = "unknown" | 396 application = 'unknown' |
| 397 applicationVersion = params.get("prodversion", ["unknown"])[0] | 397 applicationVersion = params.get('prodversion', ['unknown'])[0] |
| 398 | 398 |
| 399 params2 = urlparse.parse_qs(params.get("x", [""])[0]) | 399 params2 = urlparse.parse_qs(params.get('x', [''])[0]) |
| 400 version = params2.get("v", ["unknown"])[0] | 400 version = params2.get('v', ['unknown'])[0] |
| 401 | 401 |
| 402 # Only leave the major and minor release number for application | 402 # Only leave the major and minor release number for application |
| 403 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) | 403 applicationVersion = re.sub(r'^(\d+\.\d+).*', r'\1', applicationVersion) |
| 404 | 404 |
| 405 return version, application, applicationVersion | 405 return version, application, applicationVersion |
| 406 | 406 |
| 407 | 407 |
| 408 def parse_update_flag(query): | 408 def parse_update_flag(query): |
| 409 return "update" if query == "update" else "install" | 409 return 'update' if query == 'update' else 'install' |
| 410 | 410 |
| 411 | 411 |
| 412 def parse_record(line, ignored, geo, geov6): | 412 def parse_record(line, ignored, geo, geov6): |
| 413 global log_regexp | 413 global log_regexp |
| 414 if log_regexp == None: | 414 if log_regexp == None: |
| 415 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "([^"]*)" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?') | 415 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "([^"]*)" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?') |
| 416 | 416 |
| 417 match = re.search(log_regexp, line) | 417 match = re.search(log_regexp, line) |
| 418 if not match: | 418 if not match: |
| 419 return None | 419 return None |
| 420 | 420 |
| 421 status = int(match.group(6)) | 421 status = int(match.group(6)) |
| 422 if status not in (200, 301, 302): | 422 if status not in (200, 301, 302): |
| 423 return None | 423 return None |
| 424 | 424 |
| 425 info = { | 425 info = { |
| 426 "status": status, | 426 'status': status, |
| 427 "size": int(match.group(7)), | 427 'size': int(match.group(7)), |
| 428 } | 428 } |
| 429 | 429 |
| 430 info["ip"], info["country"] = process_ip(match.group(1), geo, geov6) | 430 info['ip'], info['country'] = process_ip(match.group(1), geo, geov6) |
| 431 info["time"], info["month"], info["day"], info["weekday"], info["hour"] = pa
rse_time(match.group(2), int(match.group(3)), int(match.group(4))) | 431 info['time'], info['month'], info['day'], info['weekday'], info['hour'] = pa
rse_time(match.group(2), int(match.group(3)), int(match.group(4))) |
| 432 info["file"], info["query"] = parse_path(match.group(5)) | 432 info['file'], info['query'] = parse_path(match.group(5)) |
| 433 info["referrer"] = match.group(8) | 433 info['referrer'] = match.group(8) |
| 434 info["ua"], info["uaversion"] = parse_ua(match.group(9)) | 434 info['ua'], info['uaversion'] = parse_ua(match.group(9)) |
| 435 info["fullua"] = "%s %s" % (info["ua"], info["uaversion"]) | 435 info['fullua'] = '%s %s' % (info['ua'], info['uaversion']) |
| 436 info["clientid"] = match.group(10) | 436 info['clientid'] = match.group(10) |
| 437 | 437 |
| 438 # Additional metadata depends on file type | 438 # Additional metadata depends on file type |
| 439 filename = os.path.basename(info["file"]) | 439 filename = os.path.basename(info['file']) |
| 440 ext = os.path.splitext(filename)[1] | 440 ext = os.path.splitext(filename)[1] |
| 441 if ext == ".txt" or filename == "update.json" or filename == "notification.j
son": | 441 if ext == '.txt' or filename == 'update.json' or filename == 'notification.j
son': |
| 442 # Subscription downloads, libadblockplus update checks and notification | 442 # Subscription downloads, libadblockplus update checks and notification |
| 443 # checks are performed by the downloader | 443 # checks are performed by the downloader |
| 444 parse_downloader_query(info) | 444 parse_downloader_query(info) |
| 445 elif ext == ".tpl": | 445 elif ext == '.tpl': |
| 446 # MSIE TPL download, no additional data here | 446 # MSIE TPL download, no additional data here |
| 447 pass | 447 pass |
| 448 elif ext in (".xpi", ".crx", ".apk", ".msi", ".exe", ".safariextz"): | 448 elif ext in ('.xpi', '.crx', '.apk', '.msi', '.exe', '.safariextz'): |
| 449 # Package download, might be an update | 449 # Package download, might be an update |
| 450 info["installType"] = parse_update_flag(info["query"]) | 450 info['installType'] = parse_update_flag(info['query']) |
| 451 elif filename == "update.rdf": | 451 elif filename == 'update.rdf': |
| 452 # Gecko update check or a legacy Android update check. The latter doesn't | 452 # Gecko update check or a legacy Android update check. The latter doesn't |
| 453 # have usable data anyway so trying the Chrome route won't do any harm. | 453 # have usable data anyway so trying the Chrome route won't do any harm. |
| 454 info["addonName"] = parse_addon_name(info["file"]) | 454 info['addonName'] = parse_addon_name(info['file']) |
| 455 info["addonVersion"], info["application"], info["applicationVersion"] =
parse_gecko_query(info["query"]) | 455 info['addonVersion'], info['application'], info['applicationVersion'] =
parse_gecko_query(info['query']) |
| 456 elif filename == "updates.xml": | 456 elif filename == 'updates.xml': |
| 457 # Chrome update check | 457 # Chrome update check |
| 458 info["addonName"] = parse_addon_name(info["file"]) | 458 info['addonName'] = parse_addon_name(info['file']) |
| 459 info["addonVersion"], info["application"], info["applicationVersion"] =
parse_chrome_query(info["query"]) | 459 info['addonVersion'], info['application'], info['applicationVersion'] =
parse_chrome_query(info['query']) |
| 460 elif filename == "updates.plist": | 460 elif filename == 'updates.plist': |
| 461 # Safari update check, no additional data | 461 # Safari update check, no additional data |
| 462 pass | 462 pass |
| 463 else: | 463 else: |
| 464 ignored.add(info["file"]) | 464 ignored.add(info['file']) |
| 465 return None | 465 return None |
| 466 | 466 |
| 467 if "addonName" in info: | 467 if 'addonName' in info: |
| 468 info["fullAddon"] = "%s %s" % (info["addonName"], info["addonVersion"]) | 468 info['fullAddon'] = '%s %s' % (info['addonName'], info['addonVersion']) |
| 469 if "application" in info: | 469 if 'application' in info: |
| 470 info["fullApplication"] = "%s %s" % (info["application"], info["applicat
ionVersion"]) | 470 info['fullApplication'] = '%s %s' % (info['application'], info['applicat
ionVersion']) |
| 471 if "platform" in info: | 471 if 'platform' in info: |
| 472 info["fullPlatform"] = "%s %s" % (info["platform"], info["platformVersio
n"]) | 472 info['fullPlatform'] = '%s %s' % (info['platform'], info['platformVersio
n']) |
| 473 return info | 473 return info |
| 474 | 474 |
| 475 | 475 |
| 476 def add_record(info, section, ignore_fields=()): | 476 def add_record(info, section, ignore_fields=()): |
| 477 section["hits"] = section.get("hits", 0) + 1 | 477 section['hits'] = section.get('hits', 0) + 1 |
| 478 section["bandwidth"] = section.get("bandwidth", 0) + info["size"] | 478 section['bandwidth'] = section.get('bandwidth', 0) + info['size'] |
| 479 | 479 |
| 480 if len(ignore_fields) < 2: | 480 if len(ignore_fields) < 2: |
| 481 for field in map(lambda f: f["name"], common.fields): | 481 for field in map(lambda f: f['name'], common.fields): |
| 482 if field in ignore_fields or field not in info: | 482 if field in ignore_fields or field not in info: |
| 483 continue | 483 continue |
| 484 | 484 |
| 485 value = info[field] | 485 value = info[field] |
| 486 if field not in section: | 486 if field not in section: |
| 487 section[field] = {} | 487 section[field] = {} |
| 488 if value not in section[field]: | 488 if value not in section[field]: |
| 489 section[field][value] = {} | 489 section[field][value] = {} |
| 490 | 490 |
| 491 add_record(info, section[field][value], ignore_fields + (field,)) | 491 add_record(info, section[field][value], ignore_fields + (field,)) |
| 492 | 492 |
| 493 | 493 |
| 494 def parse_fileobj(mirror_name, fileobj, geo, geov6, ignored): | 494 def parse_fileobj(mirror_name, fileobj, geo, geov6, ignored): |
| 495 data = {} | 495 data = {} |
| 496 for line in fileobj: | 496 for line in fileobj: |
| 497 info = parse_record(line, ignored, geo, geov6) | 497 info = parse_record(line, ignored, geo, geov6) |
| 498 if info == None: | 498 if info == None: |
| 499 continue | 499 continue |
| 500 | 500 |
| 501 info["mirror"] = mirror_name | 501 info['mirror'] = mirror_name |
| 502 if info["month"] not in data: | 502 if info['month'] not in data: |
| 503 data[info["month"]] = {} | 503 data[info['month']] = {} |
| 504 section = data[info["month"]] | 504 section = data[info['month']] |
| 505 | 505 |
| 506 if info["file"] not in section: | 506 if info['file'] not in section: |
| 507 section[info["file"]] = {} | 507 section[info['file']] = {} |
| 508 section = section[info["file"]] | 508 section = section[info['file']] |
| 509 | 509 |
| 510 add_record(info, section) | 510 add_record(info, section) |
| 511 return data | 511 return data |
| 512 | 512 |
| 513 | 513 |
| 514 def merge_objects(object1, object2, factor=1): | 514 def merge_objects(object1, object2, factor=1): |
| 515 for key, value in object2.iteritems(): | 515 for key, value in object2.iteritems(): |
| 516 try: | 516 try: |
| 517 key = unicode(key) | 517 key = unicode(key) |
| 518 except UnicodeDecodeError: | 518 except UnicodeDecodeError: |
| 519 key = unicode(key, encoding="latin-1") | 519 key = unicode(key, encoding='latin-1') |
| 520 if isinstance(value, numbers.Number): | 520 if isinstance(value, numbers.Number): |
| 521 object1[key] = object1.get(key, 0) + factor * value | 521 object1[key] = object1.get(key, 0) + factor * value |
| 522 else: | 522 else: |
| 523 merge_objects(object1.setdefault(key, {}), value, factor) | 523 merge_objects(object1.setdefault(key, {}), value, factor) |
| 524 | 524 |
| 525 | 525 |
| 526 def save_stats(server_type, data, factor=1): | 526 def save_stats(server_type, data, factor=1): |
| 527 base_dir = os.path.join(get_config().get("stats", "dataDirectory"), common.filename_encode(server_type)) | 527 base_dir = os.path.join(get_config().get('stats', 'dataDirectory'), common.filename_encode(server_type)) |
| 528 for month, month_data in data.iteritems(): | 528 for month, month_data in data.iteritems(): |
| 529 for name, file_data in month_data.iteritems(): | 529 for name, file_data in month_data.iteritems(): |
| 530 path = os.path.join(base_dir, common.filename_encode(month), common.filename_encode(name + ".json")) | 530 path = os.path.join(base_dir, common.filename_encode(month), common.filename_encode(name + '.json')) |
| 531 if os.path.exists(path): | 531 if os.path.exists(path): |
| 532 with codecs.open(path, "rb", encoding="utf-8") as fileobj: | 532 with codecs.open(path, 'rb', encoding='utf-8') as fileobj: |
| 533 existing = json.load(fileobj) | 533 existing = json.load(fileobj) |
| 534 else: | 534 else: |
| 535 existing = {} | 535 existing = {} |
| 536 | 536 |
| 537 merge_objects(existing, file_data, factor) | 537 merge_objects(existing, file_data, factor) |
| 538 | 538 |
| 539 dir = os.path.dirname(path) | 539 dir = os.path.dirname(path) |
| 540 try: | 540 try: |
| 541 os.makedirs(dir) | 541 os.makedirs(dir) |
| 542 except OSError, e: | 542 except OSError, e: |
| 543 if e.errno != errno.EEXIST: | 543 if e.errno != errno.EEXIST: |
| 544 raise | 544 raise |
| 545 | 545 |
| 546 with codecs.open(path, "wb", encoding="utf-8") as fileobj: | 546 with codecs.open(path, 'wb', encoding='utf-8') as fileobj: |
| 547 json.dump(existing, fileobj, indent=2, sort_keys=True) | 547 json.dump(existing, fileobj, indent=2, sort_keys=True) |
| 548 | 548 |
| 549 | 549 |
| 550 def parse_source(factor, lock, (mirror_name, server_type, log_file)): | 550 def parse_source(factor, lock, (mirror_name, server_type, log_file)): |
| 551 try: | 551 try: |
| 552 geo = pygeoip.GeoIP(get_config().get("stats", "geoip_db"), pygeoip.MEMORY_CACHE) | 552 geo = pygeoip.GeoIP(get_config().get('stats', 'geoip_db'), pygeoip.MEMORY_CACHE) |
| 553 geov6 = pygeoip.GeoIP(get_config().get("stats", "geoipv6_db"), pygeoip.MEMORY_CACHE) | 553 geov6 = pygeoip.GeoIP(get_config().get('stats', 'geoipv6_db'), pygeoip.MEMORY_CACHE) |
| 554 | 554 |
| 555 ignored = set() | 555 ignored = set() |
| 556 fileobj = StatsFile(log_file) | 556 fileobj = StatsFile(log_file) |
| 557 try: | 557 try: |
| 558 data = parse_fileobj(mirror_name, fileobj, geo, geov6, ignored) | 558 data = parse_fileobj(mirror_name, fileobj, geo, geov6, ignored) |
| 559 finally: | 559 finally: |
| 560 fileobj.close() | 560 fileobj.close() |
| 561 | 561 |
| 562 lock.acquire() | 562 lock.acquire() |
| 563 try: | 563 try: |
| 564 save_stats(server_type, data, factor) | 564 save_stats(server_type, data, factor) |
| 565 finally: | 565 finally: |
| 566 lock.release() | 566 lock.release() |
| 567 return log_file, ignored | 567 return log_file, ignored |
| 568 except: | 568 except: |
| 569 print >>sys.stderr, "Unable to process log file '%s'" % log_file | 569 print >>sys.stderr, "Unable to process log file '%s'" % log_file |
| 570 traceback.print_exc() | 570 traceback.print_exc() |
| 571 return None, None | 571 return None, None |
| 572 | 572 |
| 573 | 573 |
| 574 def parse_sources(sources, factor=1, verbose=False): | 574 def parse_sources(sources, factor=1, verbose=False): |
| 575 pool = multiprocessing.Pool() | 575 pool = multiprocessing.Pool() |
| 576 lock = multiprocessing.Manager().Lock() | 576 lock = multiprocessing.Manager().Lock() |
| 577 callback = functools.partial(parse_source, factor, lock) | 577 callback = functools.partial(parse_source, factor, lock) |
| 578 try: | 578 try: |
| 579 for log_file, ignored in pool.imap_unordered(callback, sources, chunksize=1): | 579 for log_file, ignored in pool.imap_unordered(callback, sources, chunksize=1): |
| 580 if verbose and ignored: | 580 if verbose and ignored: |
| 581 print "Ignored files for %s" % log_file | 581 print 'Ignored files for %s' % log_file |
| 582 print "=========================================================
===" | 582 print '=========================================================
===' |
| 583 print "\n".join(sorted(ignored)) | 583 print '\n'.join(sorted(ignored)) |
| 584 finally: | 584 finally: |
| 585 pool.close() | 585 pool.close() |
| 586 | 586 |
| 587 if __name__ == "__main__": | 587 if __name__ == '__main__': |
| 588 setupStderr() | 588 setupStderr() |
| 589 | 589 |
| 590 parser = argparse.ArgumentParser(description="Processes log files and merges them into the stats database") | 590 parser = argparse.ArgumentParser(description='Processes log files and merges them into the stats database') |
| 591 parser.add_argument("--verbose", dest="verbose", action="store_const", const=True, default=False, help="Verbose mode, ignored requests will be listed") | 591 parser.add_argument('--verbose', dest='verbose', action='store_const', const=True, default=False, help='Verbose mode, ignored requests will be listed') |
| 592 parser.add_argument("--revert", dest="factor", action="store_const", const=-1, default=1, help="Remove log data from the database") | 592 parser.add_argument('--revert', dest='factor', action='store_const', const=-1, default=1, help='Remove log data from the database') |
| 593 parser.add_argument("mirror_name", nargs="?", help="Name of the mirror server that the file belongs to") | 593 parser.add_argument('mirror_name', nargs='?', help='Name of the mirror server that the file belongs to') |
| 594 parser.add_argument("server_type", nargs="?", help="Server type like download, update or subscription") | 594 parser.add_argument('server_type', nargs='?', help='Server type like download, update or subscription') |
| 595 parser.add_argument("log_file", nargs="?", help="Log file path, can be a local file path, http:// or ssh:// URL") | 595 parser.add_argument('log_file', nargs='?', help='Log file path, can be a local file path, http:// or ssh:// URL') |
| 596 args = parser.parse_args() | 596 args = parser.parse_args() |
| 597 | 597 |
| 598 if args.mirror_name and args.server_type and args.log_file: | 598 if args.mirror_name and args.server_type and args.log_file: |
| 599 sources = [(args.mirror_name, args.server_type, args.log_file)] | 599 sources = [(args.mirror_name, args.server_type, args.log_file)] |
| 600 else: | 600 else: |
| 601 sources = get_stats_files() | 601 sources = get_stats_files() |
| 602 parse_sources(sources, args.factor, args.verbose) | 602 parse_sources(sources, args.factor, args.verbose) |
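
For context, a usage sketch derived from the argparse definitions above; the script file name and the log paths are assumptions for illustration, not taken from the review:

# Hypothetical invocations (script name and paths assumed):
#   python logprocessor.py --verbose mirror1 download /var/log/httpd/access_log.1.gz
#   python logprocessor.py --revert mirror1 download ssh://stats@mirror1.example.com/logs/access_log.gz
# With no positional arguments, sources fall back to get_stats_files(), i.e. the
# mirror_* options in the [stats] section of the configuration.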