| Index: sitescripts/stats/bin/logprocessor.py |
| =================================================================== |
| --- a/sitescripts/stats/bin/logprocessor.py |
| +++ b/sitescripts/stats/bin/logprocessor.py |
| @@ -10,17 +10,18 @@ |
| # Adblock Plus is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| # GNU General Public License for more details. |
| # |
| # You should have received a copy of the GNU General Public License |
| # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| -import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson |
| +import os, sys, codecs, re, math, urllib, urlparse, socket, json |
| +import pygeoip |
| from collections import OrderedDict |
| import sitescripts.stats.common as common |
| from sitescripts.utils import get_config, setupStderr |
| from datetime import datetime, timedelta |
| log_regexp = None |
| mirror_name = None |
| gecko_apps = None |
| @@ -142,22 +143,25 @@ def parse_ua(ua): |
| return "Android", "" |
| # ABP/IE downloads use that user agent |
| if ua == "Adblock Plus": |
| return "ABP", "" |
| return "Other", "" |
| -def process_ip(ip, geo): |
| +def process_ip(ip, geo, geov6): |
| match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip) |
| if match: |
| ip = match.group(1) |
| - country = geo.country_code_by_addr(ip) |
| + if ":" in ip: |
| + country = geov6.country_code_by_addr(ip) |
| + else: |
| + country = geo.country_code_by_addr(ip) |
| if country in (None, "", "--"): |
| country = "unknown" |
| country = country.lower() |
| return ip, country |
| @cache_last |
| def parse_time(timestr, tz_hours, tz_minutes): |
| @@ -278,17 +282,17 @@ def parse_chrome_query(query): |
| # Only leave the major and minor release number for application |
| applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) |
| return version, application, applicationVersion |
| def parse_update_flag(query): |
| return "update" if query == "update" else "install" |
| -def parse_record(line, ignored, geo): |
| +def parse_record(line, ignored, geo, geov6): |
| global log_regexp, mirror_name |
| if log_regexp == None: |
| log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?') |
| if mirror_name == None: |
| mirror_name = re.sub(r"\..*", "", socket.gethostname()) |
| match = re.search(log_regexp, line) |
| if not match: |
| @@ -298,17 +302,17 @@ def parse_record(line, ignored, geo): |
| if status != 200: |
| return None |
| info = { |
| "mirror": mirror_name, |
| "size": int(match.group(7)), |
| } |
| - info["ip"], info["country"] = process_ip(match.group(1), geo) |
| + info["ip"], info["country"] = process_ip(match.group(1), geo, geov6) |
| info["time"], info["month"], info["day"], info["weekday"], info["hour"] = parse_time(match.group(2), int(match.group(3)), int(match.group(4))) |
| info["file"], info["query"] = parse_path(match.group(5)) |
| info["ua"], info["uaversion"] = parse_ua(match.group(8)) |
| info["fullua"] = "%s %s" % (info["ua"], info["uaversion"]) |
| info["clientid"] = match.group(9) |
| # Additional metadata depends on file type |
| filename = os.path.basename(info["file"]) |
| @@ -356,21 +360,21 @@ def add_record(info, section, ignore_fie |
| value = info[field] |
| if field not in section: |
| section[field] = {} |
| if value not in section[field]: |
| section[field][value] = {} |
| add_record(info, section[field][value], ignore_fields + (field,)) |
| -def parse_stdin(geo, verbose): |
| +def parse_stdin(geo, geov6, verbose): |
| data = {} |
| ignored = set() |
| for line in sys.stdin: |
| - info = parse_record(line, ignored, geo) |
| + info = parse_record(line, ignored, geo, geov6) |
| if info == None: |
| continue |
| if info["month"] not in data: |
| data[info["month"]] = {} |
| section = data[info["month"]] |
| if info["file"] not in section: |
| @@ -384,13 +388,14 @@ def parse_stdin(geo, verbose): |
| print "=============" |
| print "\n".join(sorted(ignored)) |
| return data |
| if __name__ == "__main__": |
| setupStderr() |
| verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose") |
| - geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE) |
| - result = parse_stdin(geo, verbose) |
| + geo = pygeoip.GeoIP(get_config().get("stats", "geoip_db"), pygeoip.MEMORY_CACHE) |
| + geov6 = pygeoip.GeoIP(get_config().get("stats", "geoipv6_db"), pygeoip.MEMORY_CACHE) |
| + result = parse_stdin(geo, geov6, verbose) |
| with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file: |
| - simplejson.dump(result, file, indent=2, sort_keys=True) |
| + json.dump(result, file, indent=2, sort_keys=True) |