Left: | ||
Right: |
LEFT | RIGHT |
---|---|
1 # coding: utf-8 | 1 # coding: utf-8 |
2 | 2 |
3 # This file is part of the Adblock Plus web scripts, | 3 # This file is part of the Adblock Plus web scripts, |
4 # Copyright (C) 2006-2013 Eyeo GmbH | 4 # Copyright (C) 2006-2013 Eyeo GmbH |
5 # | 5 # |
6 # Adblock Plus is free software: you can redistribute it and/or modify | 6 # Adblock Plus is free software: you can redistribute it and/or modify |
7 # it under the terms of the GNU General Public License version 3 as | 7 # it under the terms of the GNU General Public License version 3 as |
8 # published by the Free Software Foundation. | 8 # published by the Free Software Foundation. |
9 # | 9 # |
10 # Adblock Plus is distributed in the hope that it will be useful, | 10 # Adblock Plus is distributed in the hope that it will be useful, |
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 # GNU General Public License for more details. | 13 # GNU General Public License for more details. |
14 # | 14 # |
15 # You should have received a copy of the GNU General Public License | 15 # You should have received a copy of the GNU General Public License |
16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
17 | 17 |
18 import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson | 18 import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson |
19 from collections import OrderedDict | |
19 import sitescripts.stats.common as common | 20 import sitescripts.stats.common as common |
20 from sitescripts.utils import get_config, setupStderr | 21 from sitescripts.utils import get_config, setupStderr |
21 from datetime import datetime, timedelta | 22 from datetime import datetime, timedelta |
22 | 23 |
23 log_regexp = None | 24 log_regexp = None |
24 mirror_name = None | 25 mirror_name = None |
25 gecko_apps = None | 26 gecko_apps = None |
26 | 27 |
28 def cache_lru(func): | |
29 """ | |
30 Decorator that memoizes the return values of a single-parameter function in | |
31 case it is called again with the same parameter. The 1024 most recent | |
32 results are saved. | |
33 """ | |
34 | |
35 results = OrderedDict() | |
36 results.entries_left = 1024 | |
37 | |
38 def wrapped(arg): | |
39 if arg in results: | |
40 result = results[arg] | |
41 del results[arg] | |
42 else: | |
43 if results.entries_left > 0: | |
44 results.entries_left -= 1 | |
45 else: | |
46 results.popitem(last=False) | |
47 result = func(arg) | |
48 results[arg] = result | |
49 return result | |
50 return wrapped | |
51 | |
52 | |
53 def cache_last(func): | |
54 """ | |
55 Decorator that memoizes the last return value of a function in case it is | |
56 called again with the same parameters. | |
57 """ | |
58 result = {"args": None, "result": None} | |
59 | |
60 def wrapped(*args): | |
61 if args != result["args"]: | |
62 result["result"] = func(*args) | |
63 result["args"] = args | |
64 return result["result"] | |
65 return wrapped | |
66 | |
67 | |
68 @cache_lru | |
27 def parse_ua(ua): | 69 def parse_ua(ua): |
28 # Opera might disguise itself as other browser so it needs to go first | 70 # Opera might disguise itself as other browser so it needs to go first |
29 match = re.search(r"\bOpera/([\d\.]+)", ua) | 71 match = re.search(r"\bOpera/([\d\.]+)", ua) |
30 if match: | 72 if match: |
31 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA | 73 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA |
32 match2 = re.search(r"\bVersion/([\d\.]+)", ua) | 74 match2 = re.search(r"\bVersion/([\d\.]+)", ua) |
33 if match2: | 75 if match2: |
34 return "Opera", match2.group(1) | 76 return "Opera", match2.group(1) |
35 else: | 77 else: |
36 return "Opera", match.group(1) | 78 return "Opera", match.group(1) |
37 | 79 |
38 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it | 80 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it |
39 match = re.search(r"\bOPR/(\d+\.\d+)", ua) | 81 match = re.search(r"\bOPR/(\d+\.\d+)", ua) |
40 if match: | 82 if match: |
41 return "Opera", match.group(1) | 83 return "Opera", match.group(1) |
42 | 84 |
43 for appName in ("Fennec", "Thunderbird", "SeaMonkey", "Songbird", "K-Meleon", "Prism", "Firefox"): | 85 # Have to check for these before Firefox, they will usually have a Firefox identifier as well |
44 match = re.search(r"\b%s/(\d+\.\d+)" % appName, ua) | 86 match = re.search(r"\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)", ua) |
Sebastian Noack
2013/08/26 16:05:22
Instead of iterating over the list of browsers and
Wladimir Palant
2013/08/27 07:34:28
This was done like that intentionally - quite a fe
| |
45 if match: | 87 if match: |
46 if appName == "Fennec" or (appName == "Firefox" and re.search(r"\bMobile;", ua)): | 88 if match.group(1) == "Fennec": |
47 return "Firefox Mobile", match.group(1) | 89 return "Firefox Mobile", match.group(2) |
48 else: | 90 else: |
49 return appName, match.group(1) | 91 return match.group(1), match.group(2) |
92 | |
93 match = re.search(r"\bFirefox/(\d+\.\d+)", ua) | |
94 if match: | |
95 if re.search(r"\bMobile;", ua): | |
96 return "Firefox Mobile", match.group(1) | |
97 else: | |
98 return "Firefox", match.group(1) | |
50 | 99 |
51 match = re.search(r"\brv:(\d+)\.(\d+)(?:\.(\d+))?", ua) | 100 match = re.search(r"\brv:(\d+)\.(\d+)(?:\.(\d+))?", ua) |
52 if match and re.search(r"\bGecko/", ua): | 101 if match and re.search(r"\bGecko/", ua): |
53 if match.group(3) and int(match.group(1)) < 2: | 102 if match.group(3) and int(match.group(1)) < 2: |
54 return "Gecko", "%s.%s.%s" % (match.group(1), match.group(2), match.group(3)) | 103 return "Gecko", "%s.%s.%s" % (match.group(1), match.group(2), match.group(3)) |
Sebastian Noack
2013/08/26 16:05:22
You could just just call match.groups(), which alr
Wladimir Palant
2013/08/27 07:34:28
I think I rather keep this as is for consistency w
| |
55 else: | 104 else: |
56 return "Gecko", "%s.%s" % (match.group(1), match.group(2)) | 105 return "Gecko", "%s.%s" % (match.group(1), match.group(2)) |
57 | 106 |
58 match = re.search(r"\bCoolNovo/(\d+\.\d+\.\d+)", ua) | 107 match = re.search(r"\bCoolNovo/(\d+\.\d+\.\d+)", ua) |
59 if match: | 108 if match: |
60 return "CoolNovo", match.group(1) | 109 return "CoolNovo", match.group(1) |
61 | 110 |
62 match = re.search(r"\bChrome/(\d+\.\d+)", ua) | 111 match = re.search(r"\bChrome/(\d+\.\d+)", ua) |
63 if match: | 112 if match: |
64 return "Chrome", match.group(1) | 113 return "Chrome", match.group(1) |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
103 if match: | 152 if match: |
104 ip = match.group(1) | 153 ip = match.group(1) |
105 | 154 |
106 country = geo.country_code_by_addr(ip) | 155 country = geo.country_code_by_addr(ip) |
107 if country in (None, "", "--"): | 156 if country in (None, "", "--"): |
108 country = "unknown" | 157 country = "unknown" |
109 country = country.lower() | 158 country = country.lower() |
110 | 159 |
111 return ip, country | 160 return ip, country |
112 | 161 |
162 @cache_last | |
113 def parse_time(timestr, tz_hours, tz_minutes): | 163 def parse_time(timestr, tz_hours, tz_minutes): |
114 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S") | 164 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S") |
115 result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_hours)) | 165 result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_hours)) |
116 return result, result.strftime("%Y%m"), result.day, result.weekday(), result.hour | 166 return result, result.strftime("%Y%m"), result.day, result.weekday(), result.hour |
117 | 167 |
168 @cache_lru | |
118 def parse_path(path): | 169 def parse_path(path): |
119 urlparts = urlparse.urlparse(path) | 170 urlparts = urlparse.urlparse(path) |
120 try: | 171 try: |
121 path = urllib.unquote(urlparts.path).decode("utf-8") | 172 path = urllib.unquote(urlparts.path).decode("utf-8") |
122 except: | 173 except: |
123 path = urlparts.path | 174 path = urlparts.path |
124 return path[1:], urlparts.query | 175 return path[1:], urlparts.query |
125 | 176 |
177 @cache_lru | |
178 def parse_query(query): | |
179 return urlparse.parse_qs(query) | |
180 | |
181 @cache_lru | |
182 def parse_lastversion(last_version): | |
183 return datetime.strptime(last_version, "%Y%m%d%H%M") | |
184 | |
185 @cache_lru | |
186 def get_week(date): | |
187 return date.isocalendar()[0:2] | |
188 | |
126 def parse_downloader_query(info): | 189 def parse_downloader_query(info): |
127 params = urlparse.parse_qs(info["query"]) | 190 params = parse_query(info["query"]) |
128 for param in ("addonName", "addonVersion", "application", "applicationVersion", "platform", "platformVersion"): | 191 for param in ("addonName", "addonVersion", "application", "applicationVersion", "platform", "platformVersion"): |
129 info[param] = params.get(param, ["unknown"])[0] | 192 info[param] = params.get(param, ["unknown"])[0] |
130 | 193 |
131 # Only leave the major and minor release number for application and platform | 194 # Only leave the major and minor release number for application and platform |
132 info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["applicationVersion"]) | 195 info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["applicationVersion"]) |
133 info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVersion"]) | 196 info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVersion"]) |
134 | 197 |
135 # Chrome Adblock sends an X-Client-ID header instead of URL parameters | 198 # Chrome Adblock sends an X-Client-ID header instead of URL parameters |
136 match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clientid"] else None | 199 match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clientid"] else None |
137 if match: | 200 if match: |
138 info["addonName"] = "chromeadblock" | 201 info["addonName"] = "chromeadblock" |
139 info["addonVersion"] = match.group(1) | 202 info["addonVersion"] = match.group(1) |
140 | 203 |
141 last_version = params.get("lastVersion", ["unknown"])[0] | 204 last_version = params.get("lastVersion", ["unknown"])[0] |
142 if info["file"] == "notification.json" and last_version == "0" and ( | 205 if info["file"] == "notification.json" and last_version == "0" and ( |
143 (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1") or | 206 (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1") or |
144 (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info["addonVersion"] == "1.5.2") | 207 (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info["addonVersion"] == "1.5.2") |
145 ): | 208 ): |
146 # Broken notification version number in these releases, treat like unknown | 209 # Broken notification version number in these releases, treat like unknown |
147 last_version = "unknown" | 210 last_version = "unknown" |
148 | 211 |
149 if last_version == "unknown": | 212 if last_version == "unknown": |
150 info["downloadInterval"] = "unknown" | 213 info["downloadInterval"] = "unknown" |
151 elif last_version == "0": | 214 elif last_version == "0": |
152 info["downloadInterval"] = "unknown" | 215 info["downloadInterval"] = "unknown" |
153 info["firstDownload"] = info["firstInMonth"] = info["firstInWeek"] = info["firstInDay"] = True | 216 info["firstDownload"] = info["firstInMonth"] = info["firstInWeek"] = info["firstInDay"] = True |
154 else: | 217 else: |
155 try: | 218 try: |
156 last_update = datetime.strptime(last_version, "%Y%m%d%H%M") | 219 last_update = parse_lastversion(last_version) |
157 diff = info["time"] - last_update | 220 diff = info["time"] - last_update |
158 if diff.days >= 365: | 221 if diff.days >= 365: |
159 info["downloadInterval"] = "%i year(s)" % (diff.days / 365) | 222 info["downloadInterval"] = "%i year(s)" % (diff.days / 365) |
160 elif diff.days >= 30: | 223 elif diff.days >= 30: |
161 info["downloadInterval"] = "%i month(s)" % (diff.days / 30) | 224 info["downloadInterval"] = "%i month(s)" % (diff.days / 30) |
162 elif diff.days >= 1: | 225 elif diff.days >= 1: |
163 info["downloadInterval"] = "%i day(s)" % diff.days | 226 info["downloadInterval"] = "%i day(s)" % diff.days |
164 else: | 227 else: |
165 info["downloadInterval"] = "%i hour(s)" % (diff.seconds / 3600) | 228 info["downloadInterval"] = "%i hour(s)" % (diff.seconds / 3600) |
166 | 229 |
167 if last_update.year != info["time"].year or last_update.month != info["time"].month: | 230 if last_update.year != info["time"].year or last_update.month != info["time"].month: |
168 info["firstInMonth"] = info["firstInDay"] = True | 231 info["firstInMonth"] = info["firstInDay"] = True |
169 elif last_update.day != info["time"].day: | 232 elif last_update.day != info["time"].day: |
170 info["firstInDay"] = True | 233 info["firstInDay"] = True |
171 | 234 |
172 if last_update.isocalendar()[0:2] != info["time"].isocalendar()[0:2]: | 235 if get_week(last_update) != get_week(info["time"]): |
173 info["firstInWeek"] = True | 236 info["firstInWeek"] = True |
174 except ValueError: | 237 except ValueError: |
175 info["downloadInterval"] = "unknown" | 238 info["downloadInterval"] = "unknown" |
176 pass | 239 pass |
177 | 240 |
178 def parse_addon_name(file): | 241 def parse_addon_name(file): |
179 if "/" in file: | 242 if "/" in file: |
180 return file.split("/")[-2] | 243 return file.split("/")[-2] |
181 else: | 244 else: |
182 return None | 245 return None |
(...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
324 | 387 |
325 if __name__ == "__main__": | 388 if __name__ == "__main__": |
326 setupStderr() | 389 setupStderr() |
327 | 390 |
328 verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose") | 391 verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose") |
329 geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE) | 392 geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE) |
330 result = parse_stdin(geo, verbose) | 393 result = parse_stdin(geo, verbose) |
331 | 394 |
332 with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file: | 395 with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file: |
333 simplejson.dump(result, file, indent=2, sort_keys=True) | 396 simplejson.dump(result, file, indent=2, sort_keys=True) |
LEFT | RIGHT |