Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: sitescripts/stats/bin/logprocessor.py

Issue 11577044: Switch to pygeoip for log processing to allow running it via PyPy (Closed)
Patch Set: Created Aug. 29, 2013, 7:38 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « pygeoip/util.py ('k') | sitescripts/stats/test/logprocessor.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 # coding: utf-8 1 # coding: utf-8
2 2
3 # This file is part of the Adblock Plus web scripts, 3 # This file is part of the Adblock Plus web scripts,
4 # Copyright (C) 2006-2013 Eyeo GmbH 4 # Copyright (C) 2006-2013 Eyeo GmbH
5 # 5 #
6 # Adblock Plus is free software: you can redistribute it and/or modify 6 # Adblock Plus is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License version 3 as 7 # it under the terms of the GNU General Public License version 3 as
8 # published by the Free Software Foundation. 8 # published by the Free Software Foundation.
9 # 9 #
10 # Adblock Plus is distributed in the hope that it will be useful, 10 # Adblock Plus is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details. 13 # GNU General Public License for more details.
14 # 14 #
15 # You should have received a copy of the GNU General Public License 15 # You should have received a copy of the GNU General Public License
16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
17 17
18 import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson 18 import os, sys, codecs, re, math, urllib, urlparse, socket, json
19 import pygeoip
19 from collections import OrderedDict 20 from collections import OrderedDict
20 import sitescripts.stats.common as common 21 import sitescripts.stats.common as common
21 from sitescripts.utils import get_config, setupStderr 22 from sitescripts.utils import get_config, setupStderr
22 from datetime import datetime, timedelta 23 from datetime import datetime, timedelta
23 24
24 log_regexp = None 25 log_regexp = None
25 mirror_name = None 26 mirror_name = None
26 gecko_apps = None 27 gecko_apps = None
27 28
28 def cache_lru(func): 29 def cache_lru(func):
(...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after
140 # ABP/Android downloads use that user agent 141 # ABP/Android downloads use that user agent
141 if ua.startswith("Apache-HttpClient/UNAVAILABLE"): 142 if ua.startswith("Apache-HttpClient/UNAVAILABLE"):
142 return "Android", "" 143 return "Android", ""
143 144
144 # ABP/IE downloads use that user agent 145 # ABP/IE downloads use that user agent
145 if ua == "Adblock Plus": 146 if ua == "Adblock Plus":
146 return "ABP", "" 147 return "ABP", ""
147 148
148 return "Other", "" 149 return "Other", ""
149 150
150 def process_ip(ip, geo): 151 def process_ip(ip, geo, geov6):
151 match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip) 152 match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip)
152 if match: 153 if match:
153 ip = match.group(1) 154 ip = match.group(1)
154 155
155 country = geo.country_code_by_addr(ip) 156 if ":" in ip:
157 country = geov6.country_code_by_addr(ip)
158 else:
159 country = geo.country_code_by_addr(ip)
156 if country in (None, "", "--"): 160 if country in (None, "", "--"):
157 country = "unknown" 161 country = "unknown"
158 country = country.lower() 162 country = country.lower()
159 163
160 return ip, country 164 return ip, country
161 165
162 @cache_last 166 @cache_last
163 def parse_time(timestr, tz_hours, tz_minutes): 167 def parse_time(timestr, tz_hours, tz_minutes):
164 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S") 168 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S")
165 result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_hours)) 169 result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_hours))
(...skipping 110 matching lines...) Expand 10 before | Expand all | Expand 10 after
276 version = params2.get("v", ["unknown"])[0] 280 version = params2.get("v", ["unknown"])[0]
277 281
278 # Only leave the major and minor release number for application 282 # Only leave the major and minor release number for application
279 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) 283 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion)
280 284
281 return version, application, applicationVersion 285 return version, application, applicationVersion
282 286
283 def parse_update_flag(query): 287 def parse_update_flag(query):
284 return "update" if query == "update" else "install" 288 return "update" if query == "update" else "install"
285 289
286 def parse_record(line, ignored, geo): 290 def parse_record(line, ignored, geo, geov6):
287 global log_regexp, mirror_name 291 global log_regexp, mirror_name
288 if log_regexp == None: 292 if log_regexp == None:
289 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?') 293 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?')
290 if mirror_name == None: 294 if mirror_name == None:
291 mirror_name = re.sub(r"\..*", "", socket.gethostname()) 295 mirror_name = re.sub(r"\..*", "", socket.gethostname())
292 296
293 match = re.search(log_regexp, line) 297 match = re.search(log_regexp, line)
294 if not match: 298 if not match:
295 return None 299 return None
296 300
297 status = int(match.group(6)) 301 status = int(match.group(6))
298 if status != 200: 302 if status != 200:
299 return None 303 return None
300 304
301 info = { 305 info = {
302 "mirror": mirror_name, 306 "mirror": mirror_name,
303 "size": int(match.group(7)), 307 "size": int(match.group(7)),
304 } 308 }
305 309
306 info["ip"], info["country"] = process_ip(match.group(1), geo) 310 info["ip"], info["country"] = process_ip(match.group(1), geo, geov6)
307 info["time"], info["month"], info["day"], info["weekday"], info["hour"] = parse_time(match.group(2), int(match.group(3)), int(match.group(4))) 311 info["time"], info["month"], info["day"], info["weekday"], info["hour"] = parse_time(match.group(2), int(match.group(3)), int(match.group(4)))
308 info["file"], info["query"] = parse_path(match.group(5)) 312 info["file"], info["query"] = parse_path(match.group(5))
309 info["ua"], info["uaversion"] = parse_ua(match.group(8)) 313 info["ua"], info["uaversion"] = parse_ua(match.group(8))
310 info["fullua"] = "%s %s" % (info["ua"], info["uaversion"]) 314 info["fullua"] = "%s %s" % (info["ua"], info["uaversion"])
311 info["clientid"] = match.group(9) 315 info["clientid"] = match.group(9)
312 316
313 # Additional metadata depends on file type 317 # Additional metadata depends on file type
314 filename = os.path.basename(info["file"]) 318 filename = os.path.basename(info["file"])
315 ext = os.path.splitext(filename)[1] 319 ext = os.path.splitext(filename)[1]
316 if ext == ".txt" or filename == "update.json" or filename == "notification.json": 320 if ext == ".txt" or filename == "update.json" or filename == "notification.json":
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
354 continue 358 continue
355 359
356 value = info[field] 360 value = info[field]
357 if field not in section: 361 if field not in section:
358 section[field] = {} 362 section[field] = {}
359 if value not in section[field]: 363 if value not in section[field]:
360 section[field][value] = {} 364 section[field][value] = {}
361 365
362 add_record(info, section[field][value], ignore_fields + (field,)) 366 add_record(info, section[field][value], ignore_fields + (field,))
363 367
364 def parse_stdin(geo, verbose): 368 def parse_stdin(geo, geov6, verbose):
365 data = {} 369 data = {}
366 ignored = set() 370 ignored = set()
367 for line in sys.stdin: 371 for line in sys.stdin:
368 info = parse_record(line, ignored, geo) 372 info = parse_record(line, ignored, geo, geov6)
369 if info == None: 373 if info == None:
370 continue 374 continue
371 375
372 if info["month"] not in data: 376 if info["month"] not in data:
373 data[info["month"]] = {} 377 data[info["month"]] = {}
374 section = data[info["month"]] 378 section = data[info["month"]]
375 379
376 if info["file"] not in section: 380 if info["file"] not in section:
377 section[info["file"]] = {} 381 section[info["file"]] = {}
378 section = section[info["file"]] 382 section = section[info["file"]]
379 383
380 add_record(info, section) 384 add_record(info, section)
381 385
382 if verbose: 386 if verbose:
383 print "Ignored files" 387 print "Ignored files"
384 print "=============" 388 print "============="
385 print "\n".join(sorted(ignored)) 389 print "\n".join(sorted(ignored))
386 return data 390 return data
387 391
388 if __name__ == "__main__": 392 if __name__ == "__main__":
389 setupStderr() 393 setupStderr()
390 394
391 verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose") 395 verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose")
392 geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE) 396 geo = pygeoip.GeoIP(get_config().get("stats", "geoip_db"), pygeoip.MEMORY_CACHE)
393 result = parse_stdin(geo, verbose) 397 geov6 = pygeoip.GeoIP(get_config().get("stats", "geoipv6_db"), pygeoip.MEMORY_CACHE)
398 result = parse_stdin(geo, geov6, verbose)
394 399
395 with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file: 400 with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file:
396 simplejson.dump(result, file, indent=2, sort_keys=True) 401 json.dump(result, file, indent=2, sort_keys=True)
OLDNEW
« no previous file with comments | « pygeoip/util.py ('k') | sitescripts/stats/test/logprocessor.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld