Index: sitescripts/stats/bin/logprocessor.py |
=================================================================== |
--- a/sitescripts/stats/bin/logprocessor.py |
+++ b/sitescripts/stats/bin/logprocessor.py |
@@ -10,17 +10,18 @@ |
# Adblock Plus is distributed in the hope that it will be useful, |
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
# GNU General Public License for more details. |
# |
# You should have received a copy of the GNU General Public License |
# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
-import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson |
+import os, sys, codecs, re, math, urllib, urlparse, socket, json |
+import pygeoip |
from collections import OrderedDict |
import sitescripts.stats.common as common |
from sitescripts.utils import get_config, setupStderr |
from datetime import datetime, timedelta |
log_regexp = None |
mirror_name = None |
gecko_apps = None |
@@ -142,22 +143,25 @@ def parse_ua(ua): |
return "Android", "" |
# ABP/IE downloads use that user agent |
if ua == "Adblock Plus": |
return "ABP", "" |
return "Other", "" |
-def process_ip(ip, geo): |
+def process_ip(ip, geo, geov6): |
match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip) |
if match: |
ip = match.group(1) |
- country = geo.country_code_by_addr(ip) |
+ if ":" in ip: |
+ country = geov6.country_code_by_addr(ip) |
+ else: |
+ country = geo.country_code_by_addr(ip) |
if country in (None, "", "--"): |
country = "unknown" |
country = country.lower() |
return ip, country |
@cache_last |
def parse_time(timestr, tz_hours, tz_minutes): |
@@ -278,17 +282,17 @@ def parse_chrome_query(query): |
# Only leave the major and minor release number for application |
applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) |
return version, application, applicationVersion |
def parse_update_flag(query): |
return "update" if query == "update" else "install" |
-def parse_record(line, ignored, geo): |
+def parse_record(line, ignored, geo, geov6): |
global log_regexp, mirror_name |
if log_regexp == None: |
log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?') |
if mirror_name == None: |
mirror_name = re.sub(r"\..*", "", socket.gethostname()) |
match = re.search(log_regexp, line) |
if not match: |
@@ -298,17 +302,17 @@ def parse_record(line, ignored, geo): |
if status != 200: |
return None |
info = { |
"mirror": mirror_name, |
"size": int(match.group(7)), |
} |
- info["ip"], info["country"] = process_ip(match.group(1), geo) |
+ info["ip"], info["country"] = process_ip(match.group(1), geo, geov6) |
info["time"], info["month"], info["day"], info["weekday"], info["hour"] = parse_time(match.group(2), int(match.group(3)), int(match.group(4))) |
info["file"], info["query"] = parse_path(match.group(5)) |
info["ua"], info["uaversion"] = parse_ua(match.group(8)) |
info["fullua"] = "%s %s" % (info["ua"], info["uaversion"]) |
info["clientid"] = match.group(9) |
# Additional metadata depends on file type |
filename = os.path.basename(info["file"]) |
@@ -356,21 +360,21 @@ def add_record(info, section, ignore_fie |
value = info[field] |
if field not in section: |
section[field] = {} |
if value not in section[field]: |
section[field][value] = {} |
add_record(info, section[field][value], ignore_fields + (field,)) |
-def parse_stdin(geo, verbose): |
+def parse_stdin(geo, geov6, verbose): |
data = {} |
ignored = set() |
for line in sys.stdin: |
- info = parse_record(line, ignored, geo) |
+ info = parse_record(line, ignored, geo, geov6) |
if info == None: |
continue |
if info["month"] not in data: |
data[info["month"]] = {} |
section = data[info["month"]] |
if info["file"] not in section: |
@@ -384,13 +388,14 @@ def parse_stdin(geo, verbose): |
print "=============" |
print "\n".join(sorted(ignored)) |
return data |
if __name__ == "__main__": |
setupStderr() |
verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose") |
- geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE) |
- result = parse_stdin(geo, verbose) |
+ geo = pygeoip.GeoIP(get_config().get("stats", "geoip_db"), pygeoip.MEMORY_CACHE) |
+ geov6 = pygeoip.GeoIP(get_config().get("stats", "geoipv6_db"), pygeoip.MEMORY_CACHE) |
+ result = parse_stdin(geo, geov6, verbose) |
with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file: |
- simplejson.dump(result, file, indent=2, sort_keys=True) |
+ json.dump(result, file, indent=2, sort_keys=True) |