OLD | NEW |
1 # coding: utf-8 | 1 # coding: utf-8 |
2 | 2 |
3 # This file is part of the Adblock Plus web scripts, | 3 # This file is part of the Adblock Plus web scripts, |
4 # Copyright (C) 2006-2013 Eyeo GmbH | 4 # Copyright (C) 2006-2013 Eyeo GmbH |
5 # | 5 # |
6 # Adblock Plus is free software: you can redistribute it and/or modify | 6 # Adblock Plus is free software: you can redistribute it and/or modify |
7 # it under the terms of the GNU General Public License version 3 as | 7 # it under the terms of the GNU General Public License version 3 as |
8 # published by the Free Software Foundation. | 8 # published by the Free Software Foundation. |
9 # | 9 # |
10 # Adblock Plus is distributed in the hope that it will be useful, | 10 # Adblock Plus is distributed in the hope that it will be useful, |
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 # GNU General Public License for more details. | 13 # GNU General Public License for more details. |
14 # | 14 # |
15 # You should have received a copy of the GNU General Public License | 15 # You should have received a copy of the GNU General Public License |
16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
17 | 17 |
18 import sys, re, math, GeoIP | 18 import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson |
| 19 from collections import OrderedDict |
| 20 import sitescripts.stats.common as common |
19 from sitescripts.utils import get_config, setupStderr | 21 from sitescripts.utils import get_config, setupStderr |
20 from datetime import datetime, timedelta | 22 from datetime import datetime, timedelta |
21 from ConfigParser import SafeConfigParser, NoOptionError | 23 |
# Module-level caches, initialized lazily on first use.
log_regexp = None    # compiled access-log pattern (built in parse_record)
mirror_name = None   # short host name of this mirror (set in parse_record)
gecko_apps = None    # appID -> application-name map (built in parse_gecko_query)
| 27 |
def cache_lru(func):
  """
  Memoizing decorator for single-argument functions: keeps the 1024 most
  recently used results and evicts the least recently used entry beyond that.
  """

  cache = OrderedDict()
  max_entries = 1024

  def wrapper(arg):
    if arg in cache:
      # Cache hit: remove and re-insert so the entry becomes most recent.
      value = cache.pop(arg)
    else:
      if len(cache) >= max_entries:
        # Drop the least recently used entry to make room.
        cache.popitem(last=False)
      value = func(arg)
    cache[arg] = value
    return value
  return wrapper
| 51 |
| 52 |
def cache_last(func):
  """
  Memoizing decorator that remembers only the most recent call: repeating the
  same arguments returns the stored result without calling the function again.
  """
  memo = {"args": None, "value": None}

  def wrapper(*args):
    # A positional-args tuple is never None, so None is a safe initial key.
    if memo["args"] != args:
      memo["value"] = func(*args)
      memo["args"] = args
    return memo["value"]
  return wrapper
| 66 |
| 67 |
| 68 @cache_lru |
def parse_ua(ua):
  """
  Map a raw User-Agent string to a (browser name, version) pair.

  Unknown agents yield ("Other", "") and some entries carry an empty version
  string. The order of the checks below is significant.
  """
  # Opera disguises itself as other browsers, so it has to be detected first.
  # Opera 10+ reports Opera 9.80 and carries the real version in Version/1x.x.
  m = re.search(r"\bOpera/([\d\.]+)", ua)
  if m:
    version = re.search(r"\bVersion/([\d\.]+)", ua)
    return "Opera", (version or m).group(1)

  # Opera 15+ uses Chrome's UA plus an OPR/1x.x marker.
  m = re.search(r"\bOPR/(\d+\.\d+)", ua)
  if m:
    return "Opera", m.group(1)

  # These Gecko applications usually include a Firefox identifier as well,
  # so they must be recognized before Firefox itself.
  m = re.search(r"\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)", ua)
  if m:
    name = "Firefox Mobile" if m.group(1) == "Fennec" else m.group(1)
    return name, m.group(2)

  m = re.search(r"\bFirefox/(\d+\.\d+)", ua)
  if m:
    return ("Firefox Mobile" if re.search(r"\bMobile;", ua) else "Firefox"), m.group(1)

  m = re.search(r"\brv:(\d+)\.(\d+)(?:\.(\d+))?", ua)
  if m and re.search(r"\bGecko/", ua):
    # Gecko before 2.0 used a three-component version number.
    if m.group(3) and int(m.group(1)) < 2:
      return "Gecko", "%s.%s.%s" % (m.group(1), m.group(2), m.group(3))
    return "Gecko", "%s.%s" % (m.group(1), m.group(2))

  m = re.search(r"\bCoolNovo/(\d+\.\d+\.\d+)", ua)
  if m:
    return "CoolNovo", m.group(1)

  m = re.search(r"\bChrome/(\d+\.\d+)", ua)
  if m:
    return "Chrome", m.group(1)

  m = re.search(r"\bVersion/(\d+\.\d+)", ua)
  if m:
    if re.search(r"\bMobile Safari/", ua):
      return "Mobile Safari", m.group(1)
    if re.search(r"\bSafari/", ua):
      return "Safari", m.group(1)

  if re.search(r"\bAppleWebKit/", ua):
    return "WebKit", ""

  m = re.search(r"\bMSIE (\d+\.\d+)", ua)
  if m:
    return "MSIE", m.group(1)

  m = re.search(r"\bTrident/(\d+\.\d+)", ua)
  if m:
    return "Trident", m.group(1)

  m = re.search(r"\bAndroidDownloadManager(?:/(\d+\.\d+))?", ua)
  if m:
    return "Android", m.group(1) or ""

  m = re.search(r"\bDalvik/.*\bAndroid (\d+\.\d+)", ua)
  if m:
    return "Android", m.group(1)

  # ABP/Android downloads use that user agent
  if ua.startswith("Apache-HttpClient/UNAVAILABLE"):
    return "Android", ""

  # ABP/IE downloads use that user agent
  if ua == "Adblock Plus":
    return "ABP", ""

  return "Other", ""
| 149 |
def process_ip(ip, geo):
  """
  Normalize an address from the log and look up its country code.

  IPv4-mapped IPv6 addresses (::ffff:a.b.c.d) are reduced to plain IPv4 first;
  addresses whose country cannot be resolved come back as "unknown".
  """
  mapped = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip)
  if mapped:
    ip = mapped.group(1)

  # GeoIP reports unresolvable addresses as None, "" or "--".
  country = geo.country_code_by_addr(ip)
  if not country or country == "--":
    country = "unknown"
  return ip, country.lower()
| 161 |
| 162 @cache_last |
def parse_time(timestr, tz_hours, tz_minutes):
  """
  Convert an Apache log timestamp plus its timezone offset to UTC and return
  (datetime, "YYYYMM" month key, day of month, weekday, hour).
  """
  parsed = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S")
  # The minutes part of the offset points in the same direction as the hours
  # part (zones like -03:30), hence the copysign.
  parsed -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_hours))
  return parsed, parsed.strftime("%Y%m"), parsed.day, parsed.weekday(), parsed.hour
| 167 |
| 168 @cache_lru |
def parse_path(path):
  """
  Split a request path into the file path (leading slash stripped, URL-encoding
  resolved) and the raw query string.

  If the unquoted path isn't valid UTF-8 it is kept in its quoted form rather
  than dropped.
  """
  urlparts = urlparse.urlparse(path)
  try:
    path = urllib.unquote(urlparts.path).decode("utf-8")
  except UnicodeDecodeError:
    # Was a bare "except:" which would also swallow KeyboardInterrupt etc.;
    # decode("utf-8") is the only statement here expected to fail.
    path = urlparts.path
  return path[1:], urlparts.query
| 176 |
| 177 @cache_lru |
def parse_query(query):
  """Split a query string into a dict mapping parameter names to value lists."""
  return urlparse.parse_qs(query)
| 180 |
| 181 @cache_lru |
def parse_lastversion(last_version):
  """Parse a downloader lastVersion parameter ("YYYYMMDDhhmm") into a datetime."""
  return datetime.strptime(last_version, "%Y%m%d%H%M")
| 184 |
| 185 @cache_lru |
def get_week(date):
  """Return the ISO calendar (year, week) pair identifying *date*'s week."""
  year, week = date.isocalendar()[0:2]
  return year, week
| 188 |
def parse_downloader_query(info):
  """
  Enrich a download record with the query parameters sent by the downloader:
  add-on/application/platform identity and the interval since the previous
  download (with firstDownload/firstInMonth/firstInWeek/firstInDay flags).
  Mutates *info* in place.
  """
  params = parse_query(info["query"])
  for param in ("addonName", "addonVersion", "application", "applicationVersion", "platform", "platformVersion"):
    info[param] = params.get(param, ["unknown"])[0]

  # Only leave the major and minor release number for application and platform
  info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["applicationVersion"])
  info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVersion"])

  # Chrome Adblock sends an X-Client-ID header instead of URL parameters
  match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clientid"] else None
  if match:
    info["addonName"] = "chromeadblock"
    info["addonVersion"] = match.group(1)

  last_version = params.get("lastVersion", ["unknown"])[0]
  if info["file"] == "notification.json" and last_version == "0" and (
      (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1") or
      (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info["addonVersion"] == "1.5.2")
    ):
    # Broken notification version number in these releases, treat like unknown
    last_version = "unknown"

  if last_version == "unknown":
    info["downloadInterval"] = "unknown"
  elif last_version == "0":
    # "0" means this is the client's very first download.
    info["downloadInterval"] = "unknown"
    info["firstDownload"] = info["firstInMonth"] = info["firstInWeek"] = info["firstInDay"] = True
  else:
    try:
      last_update = parse_lastversion(last_version)
      diff = info["time"] - last_update
      # Floor division (was "/": identical for ints in Python 2, but explicit
      # and safe if the file is ever run under Python 3).
      if diff.days >= 365:
        info["downloadInterval"] = "%i year(s)" % (diff.days // 365)
      elif diff.days >= 30:
        info["downloadInterval"] = "%i month(s)" % (diff.days // 30)
      elif diff.days >= 1:
        info["downloadInterval"] = "%i day(s)" % diff.days
      else:
        info["downloadInterval"] = "%i hour(s)" % (diff.seconds // 3600)

      if last_update.year != info["time"].year or last_update.month != info["time"].month:
        info["firstInMonth"] = info["firstInDay"] = True
      elif last_update.day != info["time"].day:
        info["firstInDay"] = True

      if get_week(last_update) != get_week(info["time"]):
        info["firstInWeek"] = True
    except ValueError:
      # Malformed lastVersion timestamp - removed a dead "pass" that followed
      # this assignment.
      info["downloadInterval"] = "unknown"
| 240 |
def parse_addon_name(file):
  """
  Extract the add-on name from a download path like "<name>/update.rdf";
  returns None for paths without a directory component.
  """
  if "/" not in file:
    return None
  return file.split("/")[-2]
| 246 |
def parse_gecko_query(query):
  """
  Extract (add-on version, application name, application version) from a Gecko
  update-check query string; missing fields come back as "unknown".
  """
  params = urlparse.parse_qs(query)

  version = params.get("version", ["unknown"])[0]

  global gecko_apps
  if gecko_apps is None:
    # Built lazily: the buildtools dependency is only needed for Gecko pings.
    # NOTE(review): was "== None", use identity comparison for None.
    from buildtools.packagerGecko import KNOWN_APPS
    gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()}
  appID = params.get("appID", ["unknown"])[0]

  application = gecko_apps.get(appID, "unknown")
  applicationVersion = params.get("appVersion", ["unknown"])[0]

  # Only leave the major and minor release number for application
  applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion)

  return version, application, applicationVersion
| 265 |
def parse_chrome_query(query):
  """
  Extract (add-on version, application name, application version) from a
  Chrome update-check query string; missing fields come back as "unknown".
  """
  params = urlparse.parse_qs(query)

  prod = params.get("prod", ["unknown"])[0]
  application = "chrome" if prod in ("chromecrx", "chromiumcrx") else "unknown"
  applicationVersion = params.get("prodversion", ["unknown"])[0]

  # The add-on version is nested inside the "x" parameter.
  nested = urlparse.parse_qs(params.get("x", [""])[0])
  version = nested.get("v", ["unknown"])[0]

  # Only leave the major and minor release number for application
  applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion)

  return version, application, applicationVersion
| 282 |
def parse_update_flag(query):
  """Classify a package download: "update" for update checks, "install" otherwise."""
  if query == "update":
    return "update"
  return "install"
| 285 |
def parse_record(line, ignored, geo):
  """
  Parse one access-log line into a dict of download metadata.

  Returns None for lines that don't match the log format, for non-200
  responses and for untracked files (which are collected in *ignored*).
  The regexp and mirror name are built lazily on first call.
  """
  global log_regexp, mirror_name
  if log_regexp is None:  # was "== None"; identity comparison for None
    log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?')
  if mirror_name is None:
    # First label of the host name identifies this download mirror.
    mirror_name = re.sub(r"\..*", "", socket.gethostname())

  match = re.search(log_regexp, line)
  if not match:
    return None

  status = int(match.group(6))
  if status != 200:
    return None

  info = {
    "mirror": mirror_name,
    "size": int(match.group(7)),
  }

  info["ip"], info["country"] = process_ip(match.group(1), geo)
  info["time"], info["month"], info["day"], info["weekday"], info["hour"] = parse_time(match.group(2), int(match.group(3)), int(match.group(4)))
  info["file"], info["query"] = parse_path(match.group(5))
  info["ua"], info["uaversion"] = parse_ua(match.group(8))
  info["fullua"] = "%s %s" % (info["ua"], info["uaversion"])
  info["clientid"] = match.group(9)

  # Additional metadata depends on file type
  filename = os.path.basename(info["file"])
  ext = os.path.splitext(filename)[1]
  if ext == ".txt" or filename == "update.json" or filename == "notification.json":
    # Subscription downloads, libadblockplus update checks and notification
    # checks are performed by the downloader
    parse_downloader_query(info)
  elif ext == ".tpl":
    # MSIE TPL download, no additional data here
    pass
  elif ext in (".xpi", ".crx", ".apk", ".msi", ".exe"):
    # Package download, might be an update
    info["installType"] = parse_update_flag(info["query"])
  elif filename == "update.rdf":
    # Gecko update check or a legacy Android update check. The latter doesn't
    # have usable data anyway so trying the Chrome route won't do any harm.
    info["addonName"] = parse_addon_name(info["file"])
    info["addonVersion"], info["application"], info["applicationVersion"] = parse_gecko_query(info["query"])
  elif filename == "updates.xml":
    # Chrome update check
    info["addonName"] = parse_addon_name(info["file"])
    info["addonVersion"], info["application"], info["applicationVersion"] = parse_chrome_query(info["query"])
  else:
    ignored.add(info["file"])
    return None

  if "addonName" in info:
    info["fullAddon"] = "%s %s" % (info["addonName"], info["addonVersion"])
  if "application" in info:
    info["fullApplication"] = "%s %s" % (info["application"], info["applicationVersion"])
  if "platform" in info:
    info["fullPlatform"] = "%s %s" % (info["platform"], info["platformVersion"])
  return info
| 346 |
def add_record(info, section, ignore_fields=()):
  """
  Accumulate one download record into the nested stats structure: bump the hit
  and bandwidth counters of *section*, then recurse into per-field subsections.
  Nesting stops once two fields are fixed by the caller chain.
  """
  section["hits"] = section.get("hits", 0) + 1
  section["bandwidth"] = section.get("bandwidth", 0) + info["size"]

  if len(ignore_fields) >= 2:
    return

  for field_info in common.fields:
    field = field_info["name"]
    if field in ignore_fields or field not in info:
      continue

    value = info[field]
    subsection = section.setdefault(field, {}).setdefault(value, {})
    add_record(info, subsection, ignore_fields + (field,))
| 363 |
| 364 def parse_stdin(geo, verbose): |
79 data = {} | 365 data = {} |
| 366 ignored = set() |
80 for line in sys.stdin: | 367 for line in sys.stdin: |
81 match = re.search(regexp, line) | 368 info = parse_record(line, ignored, geo) |
82 if not match: | 369 if info == None: |
83 continue | 370 continue |
84 | 371 |
85 ip, time, tzHours, tzMinutes = match.group(1), match.group(2), int(match.gro
up(3)), int(match.group(4)) | 372 if info["month"] not in data: |
86 file, status, size, ua = match.group(5), int(match.group(6)), int(match.grou
p(7)), match.group(8) | 373 data[info["month"]] = {} |
87 if status != 200 and status != 302 and status != 304: | 374 section = data[info["month"]] |
88 continue | 375 |
89 if file.startswith('robots.'): | 376 if info["file"] not in section: |
90 continue | 377 section[info["file"]] = {} |
91 | 378 section = section[info["file"]] |
92 time = datetime.strptime(time, '%d/%b/%Y:%H:%M:%S') | 379 |
93 time -= timedelta(hours = tzHours, minutes = math.copysign(tzMinutes, tzHour
s)) | 380 add_record(info, section) |
94 | 381 |
95 match = re.search(r'^::ffff:(\d+\.\d+\.\d+\.\d+)$', ip) | 382 if verbose: |
96 if match: | 383 print "Ignored files" |
97 ip = match.group(1) | 384 print "=============" |
98 country = geo.country_code_by_addr(ip) | 385 print "\n".join(sorted(ignored)) |
99 if country == '' or country == '--': | 386 return data |
100 country = 'unknown' | 387 |
if __name__ == "__main__":
  setupStderr()

  # "verbose" as the first command-line argument enables ignored-file output.
  verbose = len(sys.argv) >= 2 and sys.argv[1] == "verbose"
  geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE)
  result = parse_stdin(geo, verbose)

  with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file:
    simplejson.dump(result, file, indent=2, sort_keys=True)
OLD | NEW |