| OLD | NEW |
| 1 # coding: utf-8 | 1 # coding: utf-8 |
| 2 | 2 |
| 3 # This file is part of the Adblock Plus web scripts, | 3 # This file is part of the Adblock Plus web scripts, |
| 4 # Copyright (C) 2006-2013 Eyeo GmbH | 4 # Copyright (C) 2006-2013 Eyeo GmbH |
| 5 # | 5 # |
| 6 # Adblock Plus is free software: you can redistribute it and/or modify | 6 # Adblock Plus is free software: you can redistribute it and/or modify |
| 7 # it under the terms of the GNU General Public License version 3 as | 7 # it under the terms of the GNU General Public License version 3 as |
| 8 # published by the Free Software Foundation. | 8 # published by the Free Software Foundation. |
| 9 # | 9 # |
| 10 # Adblock Plus is distributed in the hope that it will be useful, | 10 # Adblock Plus is distributed in the hope that it will be useful, |
| 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 13 # GNU General Public License for more details. | 13 # GNU General Public License for more details. |
| 14 # | 14 # |
| 15 # You should have received a copy of the GNU General Public License | 15 # You should have received a copy of the GNU General Public License |
| 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| 17 | 17 |
| 18 import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson | 18 import os, sys, codecs, re, math, urllib, urlparse, socket, json |
| 19 import pygeoip |
| 19 from collections import OrderedDict | 20 from collections import OrderedDict |
| 20 import sitescripts.stats.common as common | 21 import sitescripts.stats.common as common |
| 21 from sitescripts.utils import get_config, setupStderr | 22 from sitescripts.utils import get_config, setupStderr |
| 22 from datetime import datetime, timedelta | 23 from datetime import datetime, timedelta |
| 23 | 24 |
| 24 log_regexp = None | 25 log_regexp = None |
| 25 mirror_name = None | 26 mirror_name = None |
| 26 gecko_apps = None | 27 gecko_apps = None |
| 27 | 28 |
| 28 def cache_lru(func): | 29 def cache_lru(func): |
| (...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 140 # ABP/Android downloads use that user agent | 141 # ABP/Android downloads use that user agent |
| 141 if ua.startswith("Apache-HttpClient/UNAVAILABLE"): | 142 if ua.startswith("Apache-HttpClient/UNAVAILABLE"): |
| 142 return "Android", "" | 143 return "Android", "" |
| 143 | 144 |
| 144 # ABP/IE downloads use that user agent | 145 # ABP/IE downloads use that user agent |
| 145 if ua == "Adblock Plus": | 146 if ua == "Adblock Plus": |
| 146 return "ABP", "" | 147 return "ABP", "" |
| 147 | 148 |
| 148 return "Other", "" | 149 return "Other", "" |
| 149 | 150 |
| 150 def process_ip(ip, geo): | 151 def process_ip(ip, geo, geov6): |
| 151 match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip) | 152 match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip) |
| 152 if match: | 153 if match: |
| 153 ip = match.group(1) | 154 ip = match.group(1) |
| 154 | 155 |
| 155 country = geo.country_code_by_addr(ip) | 156 if ":" in ip: |
| 157 country = geov6.country_code_by_addr(ip) |
| 158 else: |
| 159 country = geo.country_code_by_addr(ip) |
| 156 if country in (None, "", "--"): | 160 if country in (None, "", "--"): |
| 157 country = "unknown" | 161 country = "unknown" |
| 158 country = country.lower() | 162 country = country.lower() |
| 159 | 163 |
| 160 return ip, country | 164 return ip, country |
| 161 | 165 |
| 162 @cache_last | 166 @cache_last |
| 163 def parse_time(timestr, tz_hours, tz_minutes): | 167 def parse_time(timestr, tz_hours, tz_minutes): |
| 164 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S") | 168 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S") |
| 165 result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_h
ours)) | 169 result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_h
ours)) |
| (...skipping 110 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 276 version = params2.get("v", ["unknown"])[0] | 280 version = params2.get("v", ["unknown"])[0] |
| 277 | 281 |
| 278 # Only leave the major and minor release number for application | 282 # Only leave the major and minor release number for application |
| 279 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) | 283 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) |
| 280 | 284 |
| 281 return version, application, applicationVersion | 285 return version, application, applicationVersion |
| 282 | 286 |
| 283 def parse_update_flag(query): | 287 def parse_update_flag(query): |
| 284 return "update" if query == "update" else "install" | 288 return "update" if query == "update" else "install" |
| 285 | 289 |
| 286 def parse_record(line, ignored, geo): | 290 def parse_record(line, ignored, geo, geov6): |
| 287 global log_regexp, mirror_name | 291 global log_regexp, mirror_name |
| 288 if log_regexp == None: | 292 if log_regexp == None: |
| 289 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET
([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "
([^"]*)")?') | 293 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET
([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "
([^"]*)")?') |
| 290 if mirror_name == None: | 294 if mirror_name == None: |
| 291 mirror_name = re.sub(r"\..*", "", socket.gethostname()) | 295 mirror_name = re.sub(r"\..*", "", socket.gethostname()) |
| 292 | 296 |
| 293 match = re.search(log_regexp, line) | 297 match = re.search(log_regexp, line) |
| 294 if not match: | 298 if not match: |
| 295 return None | 299 return None |
| 296 | 300 |
| 297 status = int(match.group(6)) | 301 status = int(match.group(6)) |
| 298 if status != 200: | 302 if status != 200: |
| 299 return None | 303 return None |
| 300 | 304 |
| 301 info = { | 305 info = { |
| 302 "mirror": mirror_name, | 306 "mirror": mirror_name, |
| 303 "size": int(match.group(7)), | 307 "size": int(match.group(7)), |
| 304 } | 308 } |
| 305 | 309 |
| 306 info["ip"], info["country"] = process_ip(match.group(1), geo) | 310 info["ip"], info["country"] = process_ip(match.group(1), geo, geov6) |
| 307 info["time"], info["month"], info["day"], info["weekday"], info["hour"] = pars
e_time(match.group(2), int(match.group(3)), int(match.group(4))) | 311 info["time"], info["month"], info["day"], info["weekday"], info["hour"] = pars
e_time(match.group(2), int(match.group(3)), int(match.group(4))) |
| 308 info["file"], info["query"] = parse_path(match.group(5)) | 312 info["file"], info["query"] = parse_path(match.group(5)) |
| 309 info["ua"], info["uaversion"] = parse_ua(match.group(8)) | 313 info["ua"], info["uaversion"] = parse_ua(match.group(8)) |
| 310 info["fullua"] = "%s %s" % (info["ua"], info["uaversion"]) | 314 info["fullua"] = "%s %s" % (info["ua"], info["uaversion"]) |
| 311 info["clientid"] = match.group(9) | 315 info["clientid"] = match.group(9) |
| 312 | 316 |
| 313 # Additional metadata depends on file type | 317 # Additional metadata depends on file type |
| 314 filename = os.path.basename(info["file"]) | 318 filename = os.path.basename(info["file"]) |
| 315 ext = os.path.splitext(filename)[1] | 319 ext = os.path.splitext(filename)[1] |
| 316 if ext == ".txt" or filename == "update.json" or filename == "notification.jso
n": | 320 if ext == ".txt" or filename == "update.json" or filename == "notification.jso
n": |
| (...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 354 continue | 358 continue |
| 355 | 359 |
| 356 value = info[field] | 360 value = info[field] |
| 357 if field not in section: | 361 if field not in section: |
| 358 section[field] = {} | 362 section[field] = {} |
| 359 if value not in section[field]: | 363 if value not in section[field]: |
| 360 section[field][value] = {} | 364 section[field][value] = {} |
| 361 | 365 |
| 362 add_record(info, section[field][value], ignore_fields + (field,)) | 366 add_record(info, section[field][value], ignore_fields + (field,)) |
| 363 | 367 |
| 364 def parse_stdin(geo, verbose): | 368 def parse_stdin(geo, geov6, verbose): |
| 365 data = {} | 369 data = {} |
| 366 ignored = set() | 370 ignored = set() |
| 367 for line in sys.stdin: | 371 for line in sys.stdin: |
| 368 info = parse_record(line, ignored, geo) | 372 info = parse_record(line, ignored, geo, geov6) |
| 369 if info == None: | 373 if info == None: |
| 370 continue | 374 continue |
| 371 | 375 |
| 372 if info["month"] not in data: | 376 if info["month"] not in data: |
| 373 data[info["month"]] = {} | 377 data[info["month"]] = {} |
| 374 section = data[info["month"]] | 378 section = data[info["month"]] |
| 375 | 379 |
| 376 if info["file"] not in section: | 380 if info["file"] not in section: |
| 377 section[info["file"]] = {} | 381 section[info["file"]] = {} |
| 378 section = section[info["file"]] | 382 section = section[info["file"]] |
| 379 | 383 |
| 380 add_record(info, section) | 384 add_record(info, section) |
| 381 | 385 |
| 382 if verbose: | 386 if verbose: |
| 383 print "Ignored files" | 387 print "Ignored files" |
| 384 print "=============" | 388 print "=============" |
| 385 print "\n".join(sorted(ignored)) | 389 print "\n".join(sorted(ignored)) |
| 386 return data | 390 return data |
| 387 | 391 |
| 388 if __name__ == "__main__": | 392 if __name__ == "__main__": |
| 389 setupStderr() | 393 setupStderr() |
| 390 | 394 |
| 391 verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose") | 395 verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose") |
| 392 geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CAC
HE) | 396 geo = pygeoip.GeoIP(get_config().get("stats", "geoip_db"), pygeoip.MEMORY_CACH
E) |
| 393 result = parse_stdin(geo, verbose) | 397 geov6 = pygeoip.GeoIP(get_config().get("stats", "geoipv6_db"), pygeoip.MEMORY_
CACHE) |
| 398 result = parse_stdin(geo, geov6, verbose) |
| 394 | 399 |
| 395 with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8"
) as file: | 400 with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8"
) as file: |
| 396 simplejson.dump(result, file, indent=2, sort_keys=True) | 401 json.dump(result, file, indent=2, sort_keys=True) |
| OLD | NEW |