Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: sitescripts/stats/bin/logprocessor.py

Issue 11577044: Switch to pygeoip for log processing to allow running it via PyPy (Closed)
Patch Set: Created Aug. 29, 2013, 7:38 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « pygeoip/util.py ('k') | sitescripts/stats/test/logprocessor.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show/Hide Comments ('s')
Index: sitescripts/stats/bin/logprocessor.py
===================================================================
--- a/sitescripts/stats/bin/logprocessor.py
+++ b/sitescripts/stats/bin/logprocessor.py
@@ -10,17 +10,18 @@
# Adblock Plus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
-import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson
+import os, sys, codecs, re, math, urllib, urlparse, socket, json
+import pygeoip
from collections import OrderedDict
import sitescripts.stats.common as common
from sitescripts.utils import get_config, setupStderr
from datetime import datetime, timedelta
log_regexp = None
mirror_name = None
gecko_apps = None
@@ -142,22 +143,25 @@ def parse_ua(ua):
return "Android", ""
# ABP/IE downloads use that user agent
if ua == "Adblock Plus":
return "ABP", ""
return "Other", ""
-def process_ip(ip, geo):
+def process_ip(ip, geo, geov6):
match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip)
if match:
ip = match.group(1)
- country = geo.country_code_by_addr(ip)
+ if ":" in ip:
+ country = geov6.country_code_by_addr(ip)
+ else:
+ country = geo.country_code_by_addr(ip)
if country in (None, "", "--"):
country = "unknown"
country = country.lower()
return ip, country
@cache_last
def parse_time(timestr, tz_hours, tz_minutes):
@@ -278,17 +282,17 @@ def parse_chrome_query(query):
# Only leave the major and minor release number for application
applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion)
return version, application, applicationVersion
def parse_update_flag(query):
return "update" if query == "update" else "install"
-def parse_record(line, ignored, geo):
+def parse_record(line, ignored, geo, geov6):
global log_regexp, mirror_name
if log_regexp == None:
log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?')
if mirror_name == None:
mirror_name = re.sub(r"\..*", "", socket.gethostname())
match = re.search(log_regexp, line)
if not match:
@@ -298,17 +302,17 @@ def parse_record(line, ignored, geo):
if status != 200:
return None
info = {
"mirror": mirror_name,
"size": int(match.group(7)),
}
- info["ip"], info["country"] = process_ip(match.group(1), geo)
+ info["ip"], info["country"] = process_ip(match.group(1), geo, geov6)
info["time"], info["month"], info["day"], info["weekday"], info["hour"] = parse_time(match.group(2), int(match.group(3)), int(match.group(4)))
info["file"], info["query"] = parse_path(match.group(5))
info["ua"], info["uaversion"] = parse_ua(match.group(8))
info["fullua"] = "%s %s" % (info["ua"], info["uaversion"])
info["clientid"] = match.group(9)
# Additional metadata depends on file type
filename = os.path.basename(info["file"])
@@ -356,21 +360,21 @@ def add_record(info, section, ignore_fie
value = info[field]
if field not in section:
section[field] = {}
if value not in section[field]:
section[field][value] = {}
add_record(info, section[field][value], ignore_fields + (field,))
-def parse_stdin(geo, verbose):
+def parse_stdin(geo, geov6, verbose):
data = {}
ignored = set()
for line in sys.stdin:
- info = parse_record(line, ignored, geo)
+ info = parse_record(line, ignored, geo, geov6)
if info == None:
continue
if info["month"] not in data:
data[info["month"]] = {}
section = data[info["month"]]
if info["file"] not in section:
@@ -384,13 +388,14 @@ def parse_stdin(geo, verbose):
print "============="
print "\n".join(sorted(ignored))
return data
if __name__ == "__main__":
setupStderr()
verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose")
- geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE)
- result = parse_stdin(geo, verbose)
+ geo = pygeoip.GeoIP(get_config().get("stats", "geoip_db"), pygeoip.MEMORY_CACHE)
+ geov6 = pygeoip.GeoIP(get_config().get("stats", "geoipv6_db"), pygeoip.MEMORY_CACHE)
+ result = parse_stdin(geo, geov6, verbose)
with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file:
- simplejson.dump(result, file, indent=2, sort_keys=True)
+ json.dump(result, file, indent=2, sort_keys=True)
« no previous file with comments | « pygeoip/util.py ('k') | sitescripts/stats/test/logprocessor.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld