sitescripts/stats/bin/logprocessor.py - Issue 11577044: Switch to pygeoip for log processing to allow running it via PyPy

Side by Side Diff: sitescripts/stats/bin/logprocessor.py

Issue 11577044: Switch to pygeoip for log processing to allow running it via PyPy (Closed)

Patch Set: Created Aug. 29, 2013, 7:38 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # coding: utf-8	1 # coding: utf-8

2	2

3 # This file is part of the Adblock Plus web scripts,	3 # This file is part of the Adblock Plus web scripts,

4 # Copyright (C) 2006-2013 Eyeo GmbH	4 # Copyright (C) 2006-2013 Eyeo GmbH

5 #	5 #

6 # Adblock Plus is free software: you can redistribute it and/or modify	6 # Adblock Plus is free software: you can redistribute it and/or modify

7 # it under the terms of the GNU General Public License version 3 as	7 # it under the terms of the GNU General Public License version 3 as

8 # published by the Free Software Foundation.	8 # published by the Free Software Foundation.

9 #	9 #

10 # Adblock Plus is distributed in the hope that it will be useful,	10 # Adblock Plus is distributed in the hope that it will be useful,

11 # but WITHOUT ANY WARRANTY; without even the implied warranty of	11 # but WITHOUT ANY WARRANTY; without even the implied warranty of

12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

13 # GNU General Public License for more details.	13 # GNU General Public License for more details.

14 #	14 #

15 # You should have received a copy of the GNU General Public License	15 # You should have received a copy of the GNU General Public License

16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

17	17

18 import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson	18 import os, sys, codecs, re, math, urllib, urlparse, socket, json

	19 import pygeoip

19 from collections import OrderedDict	20 from collections import OrderedDict

20 import sitescripts.stats.common as common	21 import sitescripts.stats.common as common

21 from sitescripts.utils import get_config, setupStderr	22 from sitescripts.utils import get_config, setupStderr

22 from datetime import datetime, timedelta	23 from datetime import datetime, timedelta

23	24

24 log_regexp = None	25 log_regexp = None

25 mirror_name = None	26 mirror_name = None

26 gecko_apps = None	27 gecko_apps = None

27	28

28 def cache_lru(func):	29 def cache_lru(func):

(...skipping 111 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
140 # ABP/Android downloads use that user agent	141 # ABP/Android downloads use that user agent

141 if ua.startswith("Apache-HttpClient/UNAVAILABLE"):	142 if ua.startswith("Apache-HttpClient/UNAVAILABLE"):

142 return "Android", ""	143 return "Android", ""

143	144

144 # ABP/IE downloads use that user agent	145 # ABP/IE downloads use that user agent

145 if ua == "Adblock Plus":	146 if ua == "Adblock Plus":

146 return "ABP", ""	147 return "ABP", ""

147	148

148 return "Other", ""	149 return "Other", ""

149	150

150 def process_ip(ip, geo):	151 def process_ip(ip, geo, geov6):

151 match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip)	152 match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip)

152 if match:	153 if match:

153 ip = match.group(1)	154 ip = match.group(1)

154	155

155 country = geo.country_code_by_addr(ip)	156 if ":" in ip:

	157 country = geov6.country_code_by_addr(ip)

	158 else:

	159 country = geo.country_code_by_addr(ip)

156 if country in (None, "", "--"):	160 if country in (None, "", "--"):

157 country = "unknown"	161 country = "unknown"

158 country = country.lower()	162 country = country.lower()

159	163

160 return ip, country	164 return ip, country

161	165

162 @cache_last	166 @cache_last

163 def parse_time(timestr, tz_hours, tz_minutes):	167 def parse_time(timestr, tz_hours, tz_minutes):

164 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S")	168 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S")

165 result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_hours))	169 result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_hours))

(...skipping 110 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
276 version = params2.get("v", ["unknown"])[0]	280 version = params2.get("v", ["unknown"])[0]

277	281

278 # Only leave the major and minor release number for application	282 # Only leave the major and minor release number for application

279 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion)	283 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion)

280	284

281 return version, application, applicationVersion	285 return version, application, applicationVersion

282	286

283 def parse_update_flag(query):	287 def parse_update_flag(query):

284 return "update" if query == "update" else "install"	288 return "update" if query == "update" else "install"

285	289

286 def parse_record(line, ignored, geo):	290 def parse_record(line, ignored, geo, geov6):

287 global log_regexp, mirror_name	291 global log_regexp, mirror_name

288 if log_regexp == None:	292 if log_regexp == None:

289 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?')	293 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?')

290 if mirror_name == None:	294 if mirror_name == None:

291 mirror_name = re.sub(r"\..*", "", socket.gethostname())	295 mirror_name = re.sub(r"\..*", "", socket.gethostname())

292	296

293 match = re.search(log_regexp, line)	297 match = re.search(log_regexp, line)

294 if not match:	298 if not match:

295 return None	299 return None

296	300

297 status = int(match.group(6))	301 status = int(match.group(6))

298 if status != 200:	302 if status != 200:

299 return None	303 return None

300	304

301 info = {	305 info = {

302 "mirror": mirror_name,	306 "mirror": mirror_name,

303 "size": int(match.group(7)),	307 "size": int(match.group(7)),

304 }	308 }

305	309

306 info["ip"], info["country"] = process_ip(match.group(1), geo)	310 info["ip"], info["country"] = process_ip(match.group(1), geo, geov6)

307 info["time"], info["month"], info["day"], info["weekday"], info["hour"] = parse_time(match.group(2), int(match.group(3)), int(match.group(4)))	311 info["time"], info["month"], info["day"], info["weekday"], info["hour"] = parse_time(match.group(2), int(match.group(3)), int(match.group(4)))

308 info["file"], info["query"] = parse_path(match.group(5))	312 info["file"], info["query"] = parse_path(match.group(5))

309 info["ua"], info["uaversion"] = parse_ua(match.group(8))	313 info["ua"], info["uaversion"] = parse_ua(match.group(8))

310 info["fullua"] = "%s %s" % (info["ua"], info["uaversion"])	314 info["fullua"] = "%s %s" % (info["ua"], info["uaversion"])

311 info["clientid"] = match.group(9)	315 info["clientid"] = match.group(9)

312	316

313 # Additional metadata depends on file type	317 # Additional metadata depends on file type

314 filename = os.path.basename(info["file"])	318 filename = os.path.basename(info["file"])

315 ext = os.path.splitext(filename)[1]	319 ext = os.path.splitext(filename)[1]

316 if ext == ".txt" or filename == "update.json" or filename == "notification.json":	320 if ext == ".txt" or filename == "update.json" or filename == "notification.json":

(...skipping 37 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
354 continue	358 continue

355	359

356 value = info[field]	360 value = info[field]

357 if field not in section:	361 if field not in section:

358 section[field] = {}	362 section[field] = {}

359 if value not in section[field]:	363 if value not in section[field]:

360 section[field][value] = {}	364 section[field][value] = {}

361	365

362 add_record(info, section[field][value], ignore_fields + (field,))	366 add_record(info, section[field][value], ignore_fields + (field,))

363	367

364 def parse_stdin(geo, verbose):	368 def parse_stdin(geo, geov6, verbose):

365 data = {}	369 data = {}

366 ignored = set()	370 ignored = set()

367 for line in sys.stdin:	371 for line in sys.stdin:

368 info = parse_record(line, ignored, geo)	372 info = parse_record(line, ignored, geo, geov6)

369 if info == None:	373 if info == None:

370 continue	374 continue

371	375

372 if info["month"] not in data:	376 if info["month"] not in data:

373 data[info["month"]] = {}	377 data[info["month"]] = {}

374 section = data[info["month"]]	378 section = data[info["month"]]

375	379

376 if info["file"] not in section:	380 if info["file"] not in section:

377 section[info["file"]] = {}	381 section[info["file"]] = {}

378 section = section[info["file"]]	382 section = section[info["file"]]

379	383

380 add_record(info, section)	384 add_record(info, section)

381	385

382 if verbose:	386 if verbose:

383 print "Ignored files"	387 print "Ignored files"

384 print "============="	388 print "============="

385 print "\n".join(sorted(ignored))	389 print "\n".join(sorted(ignored))

386 return data	390 return data

387	391

388 if __name__ == "__main__":	392 if __name__ == "__main__":

389 setupStderr()	393 setupStderr()

390	394

391 verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose")	395 verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose")

392 geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE)	396 geo = pygeoip.GeoIP(get_config().get("stats", "geoip_db"), pygeoip.MEMORY_CACHE)

393 result = parse_stdin(geo, verbose)	397 geov6 = pygeoip.GeoIP(get_config().get("stats", "geoipv6_db"), pygeoip.MEMORY_CACHE)

	398 result = parse_stdin(geo, geov6, verbose)

394	399

395 with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file:	400 with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file:

396 simplejson.dump(result, file, indent=2, sort_keys=True)	401 json.dump(result, file, indent=2, sort_keys=True)

OLD	NEW

« no previous file with comments | « pygeoip/util.py ('k') | sitescripts/stats/test/logprocessor.py » ('j') | no next file with comments »