| Left: | ||
| Right: |
| OLD | NEW |
|---|---|
| 1 # coding: utf-8 | 1 # coding: utf-8 |
| 2 | 2 |
| 3 # This file is part of the Adblock Plus web scripts, | 3 # This file is part of the Adblock Plus web scripts, |
| 4 # Copyright (C) 2006-2013 Eyeo GmbH | 4 # Copyright (C) 2006-2013 Eyeo GmbH |
| 5 # | 5 # |
| 6 # Adblock Plus is free software: you can redistribute it and/or modify | 6 # Adblock Plus is free software: you can redistribute it and/or modify |
| 7 # it under the terms of the GNU General Public License version 3 as | 7 # it under the terms of the GNU General Public License version 3 as |
| 8 # published by the Free Software Foundation. | 8 # published by the Free Software Foundation. |
| 9 # | 9 # |
| 10 # Adblock Plus is distributed in the hope that it will be useful, | 10 # Adblock Plus is distributed in the hope that it will be useful, |
| 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 13 # GNU General Public License for more details. | 13 # GNU General Public License for more details. |
| 14 # | 14 # |
| 15 # You should have received a copy of the GNU General Public License | 15 # You should have received a copy of the GNU General Public License |
| 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| 17 | 17 |
| 18 import sys, re, math, GeoIP | 18 import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson |
| 19 import sitescripts.stats.common as common | |
| 19 from sitescripts.utils import get_config, setupStderr | 20 from sitescripts.utils import get_config, setupStderr |
| 20 from datetime import datetime, timedelta | 21 from datetime import datetime, timedelta |
| 21 from ConfigParser import SafeConfigParser, NoOptionError | 22 |
| 22 | 23 log_regexp = None |
| 23 def parseUA(ua): | 24 mirror_name = None |
| 25 gecko_apps = None | |
| 26 | |
| 27 def parse_ua(ua): | |
| 24 # Opera might disguise itself as other browser so it needs to go first | 28 # Opera might disguise itself as other browser so it needs to go first |
| 25 match = re.search(r'\bOpera/([\d\.]+)', ua) | 29 match = re.search(r"\bOpera/([\d\.]+)", ua) |
| 26 if match: | 30 if match: |
| 27 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA | 31 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA |
| 28 match2 = re.search(r'\bVersion/([\d\.]+)', ua) | 32 match2 = re.search(r"\bVersion/([\d\.]+)", ua) |
| 29 if match2: | 33 if match2: |
| 30 return 'Opera %s' % match2.group(1) | 34 return "Opera", match2.group(1) |
| 31 else: | 35 else: |
| 32 return 'Opera %s' % match.group(1) | 36 return "Opera", match.group(1) |
| 33 | 37 |
| 34 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it | 38 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it |
| 35 match = re.search(r'\bOPR/(\d+\.\d+)', ua) | 39 match = re.search(r"\bOPR/(\d+\.\d+)", ua) |
| 36 if match: | 40 if match: |
| 37 return 'Opera %s' % match.group(1) | 41 return "Opera", match.group(1) |
| 38 | 42 |
| 39 for appName in ('Fennec', 'Thunderbird', 'SeaMonkey', 'Songbird', 'K-Meleon', 'Prism', 'Firefox'): | 43 for appName in ("Fennec", "Thunderbird", "SeaMonkey", "Songbird", "K-Meleon", "Prism", "Firefox"): |
| 40 match = re.search(r'\b%s/(\d+\.\d+)' % appName, ua) | 44 match = re.search(r"\b%s/(\d+\.\d+)" % appName, ua) |
|
Sebastian Noack
2013/08/26 16:05:22
Instead of iterating over the list of browsers and
Wladimir Palant
2013/08/27 07:34:28
This was done like that intentionally - quite a fe
| |
| 41 if match: | 45 if match: |
| 42 return '%s %s' % (appName, match.group(1)) | 46 if appName == "Fennec" or (appName == "Firefox" and re.search(r"\bMobile;", ua)): |
| 43 | 47 return "Firefox Mobile", match.group(1) |
| 44 match = re.search(r'\brv:(\d+)\.(\d+)(?:\.(\d+))?', ua) | 48 else: |
| 45 if match and re.search(r'\bGecko/', ua): | 49 return appName, match.group(1) |
| 50 | |
| 51 match = re.search(r"\brv:(\d+)\.(\d+)(?:\.(\d+))?", ua) | |
| 52 if match and re.search(r"\bGecko/", ua): | |
| 46 if match.group(3) and int(match.group(1)) < 2: | 53 if match.group(3) and int(match.group(1)) < 2: |
| 47 return 'Gecko %s.%s.%s' % (match.group(1), match.group(2), match.group(3)) | 54 return "Gecko", "%s.%s.%s" % (match.group(1), match.group(2), match.group(3)) |
|
Sebastian Noack
2013/08/26 16:05:22
You could just call match.groups(), which alr
Wladimir Palant
2013/08/27 07:34:28
I think I rather keep this as is for consistency w
| |
| 48 else: | 55 else: |
| 49 return 'Gecko %s.%s' % (match.group(1), match.group(2)) | 56 return "Gecko", "%s.%s" % (match.group(1), match.group(2)) |
| 50 | 57 |
| 51 match = re.search(r'\bChrome/(\d+\.\d+)', ua) | 58 match = re.search(r"\bCoolNovo/(\d+\.\d+\.\d+)", ua) |
| 52 if match: | 59 if match: |
| 53 return 'Chrome %s' % match.group(1) | 60 return "CoolNovo", match.group(1) |
| 54 | 61 |
| 55 match = re.search(r'\bVersion/(\d+\.\d+)', ua) | 62 match = re.search(r"\bChrome/(\d+\.\d+)", ua) |
| 56 if match and re.search(r'\Safari/', ua): | 63 if match: |
| 57 return 'Safari %s' % match.group(1) | 64 return "Chrome", match.group(1) |
| 58 | 65 |
| 59 if re.search(r'\bAppleWebKit/', ua): | 66 match = re.search(r"\bVersion/(\d+\.\d+)", ua) |
| 60 return 'WebKit' | 67 if match and re.search(r"\bMobile Safari/", ua): |
| 61 | 68 return "Mobile Safari", match.group(1) |
| 62 match = re.search(r'\bMSIE (\d+\.\d+)', ua) | 69 if match and re.search(r"\bSafari/", ua): |
| 63 if match: | 70 return "Safari", match.group(1) |
| 64 return 'MSIE %s' % match.group(1) | 71 |
| 65 | 72 if re.search(r"\bAppleWebKit/", ua): |
| 66 return 'Other' | 73 return "WebKit", "" |
| 67 | 74 |
| 68 def parseStdIn(geo): | 75 match = re.search(r"\bMSIE (\d+\.\d+)", ua) |
| 69 if get_config().has_option('logs', 'subscriptionsSubdir'): | 76 if match: |
| 70 subdir = get_config().get('logs', 'subscriptionsSubdir') | 77 return "MSIE", match.group(1) |
| 71 subdir = re.sub(r'^/+', '', subdir) | 78 |
| 72 subdir = re.sub(r'/+$', '', subdir) | 79 match = re.search(r"\bTrident/(\d+\.\d+)", ua) |
| 73 subdir = re.sub(r'(?=\W)', r'\\', subdir) | 80 if match: |
| 74 subdir = subdir + '/' | 81 return "Trident", match.group(1) |
| 82 | |
| 83 match = re.search(r"\bAndroidDownloadManager(?:/(\d+\.\d+))?", ua) | |
| 84 if match: | |
| 85 return "Android", match.group(1) or "" | |
| 86 | |
| 87 match = re.search(r"\bDalvik/.*\bAndroid (\d+\.\d+)", ua) | |
| 88 if match: | |
| 89 return "Android", match.group(1) | |
| 90 | |
| 91 # ABP/Android downloads use that user agent | |
| 92 if ua.startswith("Apache-HttpClient/UNAVAILABLE"): | |
| 93 return "Android", "" | |
| 94 | |
| 95 # ABP/IE downloads use that user agent | |
| 96 if ua == "Adblock Plus": | |
| 97 return "ABP", "" | |
| 98 | |
| 99 return "Other", "" | |
| 100 | |
| 101 def process_ip(ip, geo): | |
| 102 match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip) | |
| 103 if match: | |
| 104 ip = match.group(1) | |
| 105 | |
| 106 country = geo.country_code_by_addr(ip) | |
| 107 if country in (None, "", "--"): | |
| 108 country = "unknown" | |
| 109 country = country.lower() | |
| 110 | |
| 111 return ip, country | |
| 112 | |
| 113 def parse_time(timestr, tz_hours, tz_minutes): | |
| 114 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S") | |
| 115 result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_hours)) | |
| 116 return result, result.strftime("%Y%m"), result.day, result.weekday(), result.hour | |
| 117 | |
| 118 def parse_path(path): | |
| 119 urlparts = urlparse.urlparse(path) | |
| 120 try: | |
| 121 path = urllib.unquote(urlparts.path).decode("utf-8") | |
| 122 except: | |
| 123 path = urlparts.path | |
| 124 return path[1:], urlparts.query | |
| 125 | |
| 126 def parse_downloader_query(info): | |
| 127 params = urlparse.parse_qs(info["query"]) | |
| 128 for param in ("addonName", "addonVersion", "application", "applicationVersion" , "platform", "platformVersion"): | |
| 129 info[param] = params.get(param, ["unknown"])[0] | |
| 130 | |
| 131 # Only leave the major and minor release number for application and platform | |
| 132 info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["applicationVersion"]) | |
| 133 info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVersion"]) | |
| 134 | |
| 135 # Chrome Adblock sends an X-Client-ID header instead of URL parameters | |
| 136 match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clientid"] else None | |
| 137 if match: | |
| 138 info["addonName"] = "chromeadblock" | |
| 139 info["addonVersion"] = match.group(1) | |
| 140 | |
| 141 last_version = params.get("lastVersion", ["unknown"])[0] | |
| 142 if info["file"] == "notification.json" and last_version == "0" and ( | |
| 143 (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1") or | |
| 144 (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info["addonVersion"] == "1.5.2") | |
| 145 ): | |
| 146 # Broken notification version number in these releases, treat like unknown | |
| 147 last_version = "unknown" | |
| 148 | |
| 149 if last_version == "unknown": | |
| 150 info["downloadInterval"] = "unknown" | |
| 151 elif last_version == "0": | |
| 152 info["downloadInterval"] = "unknown" | |
| 153 info["firstDownload"] = info["firstInMonth"] = info["firstInWeek"] = info["firstInDay"] = True | |
| 75 else: | 154 else: |
| 76 subdir = '' | 155 try: |
| 77 regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET (?:\w+://[^/]+)?/%s([\w\-\+\.]+\.(?:txt|tpl))(?:\?[^\s"]*)? [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"' % subdir) | 156 last_update = datetime.strptime(last_version, "%Y%m%d%H%M") |
| 78 | 157 diff = info["time"] - last_update |
| 158 if diff.days >= 365: | |
| 159 info["downloadInterval"] = "%i year(s)" % (diff.days / 365) | |
| 160 elif diff.days >= 30: | |
| 161 info["downloadInterval"] = "%i month(s)" % (diff.days / 30) | |
| 162 elif diff.days >= 1: | |
| 163 info["downloadInterval"] = "%i day(s)" % diff.days | |
| 164 else: | |
| 165 info["downloadInterval"] = "%i hour(s)" % (diff.seconds / 3600) | |
| 166 | |
| 167 if last_update.year != info["time"].year or last_update.month != info["time"].month: | |
| 168 info["firstInMonth"] = info["firstInDay"] = True | |
| 169 elif last_update.day != info["time"].day: | |
| 170 info["firstInDay"] = True | |
| 171 | |
| 172 if last_update.isocalendar()[0:2] != info["time"].isocalendar()[0:2]: | |
| 173 info["firstInWeek"] = True | |
| 174 except ValueError: | |
| 175 info["downloadInterval"] = "unknown" | |
| 176 pass | |
| 177 | |
| 178 def parse_addon_name(file): | |
| 179 if "/" in file: | |
| 180 return file.split("/")[-2] | |
| 181 else: | |
| 182 return None | |
| 183 | |
| 184 def parse_gecko_query(query): | |
| 185 params = urlparse.parse_qs(query) | |
| 186 | |
| 187 version = params.get("version", ["unknown"])[0] | |
| 188 | |
| 189 global gecko_apps | |
| 190 if gecko_apps == None: | |
| 191 from buildtools.packagerGecko import KNOWN_APPS | |
| 192 gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()} | |
| 193 appID = params.get("appID", ["unknown"])[0] | |
| 194 | |
| 195 application = gecko_apps.get(appID, "unknown") | |
| 196 applicationVersion = params.get("appVersion", ["unknown"])[0] | |
| 197 | |
| 198 # Only leave the major and minor release number for application | |
| 199 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) | |
| 200 | |
| 201 return version, application, applicationVersion | |
| 202 | |
| 203 def parse_chrome_query(query): | |
| 204 params = urlparse.parse_qs(query) | |
| 205 | |
| 206 if params.get("prod", ["unknown"])[0] in ("chromecrx", "chromiumcrx"): | |
| 207 application = "chrome" | |
| 208 else: | |
| 209 application = "unknown" | |
| 210 applicationVersion = params.get("prodversion", ["unknown"])[0] | |
| 211 | |
| 212 params2 = urlparse.parse_qs(params.get("x", [""])[0]) | |
| 213 version = params2.get("v", ["unknown"])[0] | |
| 214 | |
| 215 # Only leave the major and minor release number for application | |
| 216 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) | |
| 217 | |
| 218 return version, application, applicationVersion | |
| 219 | |
| 220 def parse_update_flag(query): | |
| 221 return "update" if query == "update" else "install" | |
| 222 | |
| 223 def parse_record(line, ignored, geo): | |
| 224 global log_regexp, mirror_name | |
| 225 if log_regexp == None: | |
| 226 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?') | |
| 227 if mirror_name == None: | |
| 228 mirror_name = re.sub(r"\..*", "", socket.gethostname()) | |
| 229 | |
| 230 match = re.search(log_regexp, line) | |
| 231 if not match: | |
| 232 return None | |
| 233 | |
| 234 status = int(match.group(6)) | |
| 235 if status != 200: | |
| 236 return None | |
| 237 | |
| 238 info = { | |
| 239 "mirror": mirror_name, | |
| 240 "size": int(match.group(7)), | |
| 241 } | |
| 242 | |
| 243 info["ip"], info["country"] = process_ip(match.group(1), geo) | |
| 244 info["time"], info["month"], info["day"], info["weekday"], info["hour"] = parse_time(match.group(2), int(match.group(3)), int(match.group(4))) | |
| 245 info["file"], info["query"] = parse_path(match.group(5)) | |
| 246 info["ua"], info["uaversion"] = parse_ua(match.group(8)) | |
| 247 info["fullua"] = "%s %s" % (info["ua"], info["uaversion"]) | |
| 248 info["clientid"] = match.group(9) | |
| 249 | |
| 250 # Additional metadata depends on file type | |
| 251 filename = os.path.basename(info["file"]) | |
| 252 ext = os.path.splitext(filename)[1] | |
| 253 if ext == ".txt" or filename == "update.json" or filename == "notification.json": | |
| 254 # Subscription downloads, libadblockplus update checks and notification | |
| 255 # checks are performed by the downloader | |
| 256 parse_downloader_query(info) | |
| 257 elif ext == ".tpl": | |
| 258 # MSIE TPL download, no additional data here | |
| 259 pass | |
| 260 elif ext in (".xpi", ".crx", ".apk", ".msi", ".exe"): | |
| 261 # Package download, might be an update | |
| 262 info["installType"] = parse_update_flag(info["query"]) | |
| 263 elif filename == "update.rdf": | |
| 264 # Gecko update check or a legacy Android update check. The latter doesn't | |
| 265 # have usable data anyway so trying the Chrome route won't do any harm. | |
| 266 info["addonName"] = parse_addon_name(info["file"]) | |
| 267 info["addonVersion"], info["application"], info["applicationVersion"] = parse_gecko_query(info["query"]) | |
| 268 elif filename == "updates.xml": | |
| 269 # Chrome update check | |
| 270 info["addonName"] = parse_addon_name(info["file"]) | |
| 271 info["addonVersion"], info["application"], info["applicationVersion"] = parse_chrome_query(info["query"]) | |
| 272 else: | |
| 273 ignored.add(info["file"]) | |
| 274 return None | |
| 275 | |
| 276 if "addonName" in info: | |
| 277 info["fullAddon"] = "%s %s" % (info["addonName"], info["addonVersion"]) | |
| 278 if "application" in info: | |
| 279 info["fullApplication"] = "%s %s" % (info["application"], info["applicationVersion"]) | |
| 280 if "platform" in info: | |
| 281 info["fullPlatform"] = "%s %s" % (info["platform"], info["platformVersion"]) | |
| 282 return info | |
| 283 | |
| 284 def add_record(info, section, ignore_fields=()): | |
| 285 section["hits"] = section.get("hits", 0) + 1 | |
| 286 section["bandwidth"] = section.get("bandwidth", 0) + info["size"] | |
| 287 | |
| 288 if len(ignore_fields) < 2: | |
| 289 for field in map(lambda f: f["name"], common.fields): | |
| 290 if field in ignore_fields or field not in info: | |
| 291 continue | |
| 292 | |
| 293 value = info[field] | |
| 294 if field not in section: | |
| 295 section[field] = {} | |
| 296 if value not in section[field]: | |
| 297 section[field][value] = {} | |
| 298 | |
| 299 add_record(info, section[field][value], ignore_fields + (field,)) | |
| 300 | |
| 301 def parse_stdin(geo, verbose): | |
| 79 data = {} | 302 data = {} |
| 303 ignored = set() | |
| 80 for line in sys.stdin: | 304 for line in sys.stdin: |
| 81 match = re.search(regexp, line) | 305 info = parse_record(line, ignored, geo) |
| 82 if not match: | 306 if info == None: |
| 83 continue | 307 continue |
| 84 | 308 |
| 85 ip, time, tzHours, tzMinutes = match.group(1), match.group(2), int(match.gro up(3)), int(match.group(4)) | 309 if info["month"] not in data: |
| 86 file, status, size, ua = match.group(5), int(match.group(6)), int(match.grou p(7)), match.group(8) | 310 data[info["month"]] = {} |
| 87 if status != 200 and status != 302 and status != 304: | 311 section = data[info["month"]] |
| 88 continue | 312 |
| 89 if file.startswith('robots.'): | 313 if info["file"] not in section: |
| 90 continue | 314 section[info["file"]] = {} |
| 91 | 315 section = section[info["file"]] |
| 92 time = datetime.strptime(time, '%d/%b/%Y:%H:%M:%S') | 316 |
| 93 time -= timedelta(hours = tzHours, minutes = math.copysign(tzMinutes, tzHour s)) | 317 add_record(info, section) |
| 94 | 318 |
| 95 match = re.search(r'^::ffff:(\d+\.\d+\.\d+\.\d+)$', ip) | 319 if verbose: |
| 96 if match: | 320 print "Ignored files" |
| 97 ip = match.group(1) | 321 print "=============" |
| 98 country = geo.country_code_by_addr(ip) | 322 print "\n".join(sorted(ignored)) |
| 99 if country == '' or country == '--': | 323 return data |
| 100 country = 'unknown' | 324 |
| 101 | 325 if __name__ == "__main__": |
| 102 ua = parseUA(ua)[:20] | |
| 103 | |
| 104 section = time.strftime('%Y%m') | |
| 105 if not section in data: | |
| 106 data[section] = {} | |
| 107 | |
| 108 def addResultInt(key, value): | |
| 109 if key in data[section]: | |
| 110 data[section][key] += value | |
| 111 else: | |
| 112 data[section][key] = value | |
| 113 | |
| 114 addResultInt('%s hits' % file, 1) | |
| 115 addResultInt('%s bandwidth' % file, size) | |
| 116 addResultInt('%s hits day %i' % (file, time.day), 1) | |
| 117 addResultInt('%s bandwidth day %i' % (file, time.day), size) | |
| 118 addResultInt('%s hits hour %i' % (file, time.hour), 1) | |
| 119 addResultInt('%s bandwidth hour %i' % (file, time.hour), size) | |
| 120 addResultInt('%s hits country %s' % (file, country), 1) | |
| 121 addResultInt('%s bandwidth country %s' % (file, country), size) | |
| 122 addResultInt('%s hits app %s' % (file, ua), 1) | |
| 123 addResultInt('%s bandwidth app %s' % (file, ua), size) | |
| 124 | |
| 125 result = SafeConfigParser() | |
| 126 for section in data.iterkeys(): | |
| 127 result.add_section(section) | |
| 128 for key, value in data[section].iteritems(): | |
| 129 result.set(section, key, str(value)) | |
| 130 return result | |
| 131 | |
| 132 if __name__ == '__main__': | |
| 133 setupStderr() | 326 setupStderr() |
| 134 | 327 |
| 135 geo = GeoIP.open(get_config().get('logs', 'geoip_db'), GeoIP.GEOIP_MEMORY_CACHE) | 328 verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose") |
| 136 result = parseStdIn(geo) | 329 geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE) |
| 137 | 330 result = parse_stdin(geo, verbose) |
| 138 file = open(get_config().get('subscriptionStats', 'tempFile'), 'wb') | 331 |
| 139 result.write(file) | 332 with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8" ) as file: |
| 333 simplejson.dump(result, file, indent=2, sort_keys=True) | |
| OLD | NEW |