Rietveld Code Review Tool

Delta Between Two Patch Sets: sitescripts/stats/bin/logprocessor.py

Issue 11481051: Update stats processing (Closed)
Left Patch Set: "String concatenation fix", created Aug. 27, 2013, 12:41 p.m.
Right Patch Set: "Improved performance using memoization", created Aug. 29, 2013, 1:39 p.m.
 # coding: utf-8

 # This file is part of the Adblock Plus web scripts,
 # Copyright (C) 2006-2013 Eyeo GmbH
 #
 # Adblock Plus is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 3 as
 # published by the Free Software Foundation.
 #
 # Adblock Plus is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

 import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson
+from collections import OrderedDict
 import sitescripts.stats.common as common
 from sitescripts.utils import get_config, setupStderr
 from datetime import datetime, timedelta

 log_regexp = None
 mirror_name = None
 gecko_apps = None

+def cache_lru(func):
+  """
+  Decorator that memoizes the return values of a single-parameter function in
+  case it is called again with the same parameter. The 1024 most recent
+  results are saved.
+  """
+
+  results = OrderedDict()
+  results.entries_left = 1024
+
+  def wrapped(arg):
+    if arg in results:
+      result = results[arg]
+      del results[arg]
+    else:
+      if results.entries_left > 0:
+        results.entries_left -= 1
+      else:
+        results.popitem(last=False)
+      result = func(arg)
+    results[arg] = result
+    return result
+  return wrapped
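The cache is an OrderedDict used as an LRU store: a hit is deleted and re-inserted so it moves to the most recently used end, and once the 1024 slots are used up, popitem(last=False) evicts the oldest entry. A minimal usage sketch follows; the decorated helper is hypothetical and not part of the patch. On Python 3.2+ functools.lru_cache(maxsize=1024) offers equivalent behavior, but this script targets Python 2.

  @cache_lru
  def normalize_token(token):       # hypothetical single-argument helper
    return token.strip().lower()    # any deterministic computation qualifies

  normalize_token("  Foo ")         # computed and stored in the cache
  normalize_token("  Foo ")         # same argument: returned from the cache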
+def cache_last(func):
+  """
+  Decorator that memoizes the last return value of a function in case it is
+  called again with the same parameters.
+  """
+  result = {"args": None, "result": None}
+
+  def wrapped(*args):
+    if args != result["args"]:
+      result["result"] = func(*args)
+      result["args"] = args
+    return result["result"]
+  return wrapped
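cache_last remembers only the single most recent (args, result) pair, which pays off when consecutive calls repeat the same arguments, as with timestamps parsed from adjacent log lines. A small sketch with a hypothetical function:

  @cache_last
  def join_fields(a, b):        # hypothetical, any positional arguments work
    return a + "/" + b

  join_fields("2013", "08")     # computed
  join_fields("2013", "08")     # identical to the previous call: cached result
  join_fields("2013", "09")     # different arguments: recomputed and cached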
+@cache_lru
 def parse_ua(ua):
   # Opera might disguise itself as another browser so it needs to go first
   match = re.search(r"\bOpera/([\d\.]+)", ua)

Felix Dahlke (2013/08/28 17:25:32): Memoization helped save around 5% here. In an acce…

Sebastian Noack (2013/08/29 10:54:30): Keys in python dicts are already unique. So when y…

   if match:
     # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA
     match2 = re.search(r"\bVersion/([\d\.]+)", ua)
     if match2:
       return "Opera", match2.group(1)
     else:
       return "Opera", match.group(1)

   # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it
   match = re.search(r"\bOPR/(\d+\.\d+)", ua)

(... skipping 70 matching lines ...)

   if match:
     ip = match.group(1)

   country = geo.country_code_by_addr(ip)
   if country in (None, "", "--"):
     country = "unknown"
   country = country.lower()

   return ip, country

+@cache_last
 def parse_time(timestr, tz_hours, tz_minutes):
   result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S")
   result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_hours))
   return result, result.strftime("%Y%m"), result.day, result.weekday(), result.hour

+@cache_lru
 def parse_path(path):
   urlparts = urlparse.urlparse(path)
   try:
     path = urllib.unquote(urlparts.path).decode("utf-8")
   except:
     path = urlparts.path
   return path[1:], urlparts.query

+@cache_lru
+def parse_query(query):
+  return urlparse.parse_qs(query)
+
+@cache_lru
+def parse_lastversion(last_version):
+  return datetime.strptime(last_version, "%Y%m%d%H%M")
+
+@cache_lru
+def get_week(date):
+  return date.isocalendar()[0:2]
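These thin wrappers exist so the single-argument cache_lru decorator can memoize otherwise repeated calls such as urlparse.parse_qs and datetime.strptime. The same pattern would fit any other hot single-argument call; a hypothetical sketch (not part of the patch) using the already-imported socket module:

  @cache_lru
  def resolve_host(ip):          # hypothetical helper; the argument must be hashable
    return socket.getfqdn(ip)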
 def parse_downloader_query(info):
-  params = urlparse.parse_qs(info["query"])
+  params = parse_query(info["query"])
   for param in ("addonName", "addonVersion", "application", "applicationVersion", "platform", "platformVersion"):
     info[param] = params.get(param, ["unknown"])[0]

   # Only leave the major and minor release number for application and platform
   info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["applicationVersion"])
   info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVersion"])

   # Chrome Adblock sends an X-Client-ID header instead of URL parameters
   match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clientid"] else None
   if match:
     info["addonName"] = "chromeadblock"
     info["addonVersion"] = match.group(1)

   last_version = params.get("lastVersion", ["unknown"])[0]
   if info["file"] == "notification.json" and last_version == "0" and (
       (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1") or
       (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info["addonVersion"] == "1.5.2")
     ):
     # Broken notification version number in these releases, treat like unknown
     last_version = "unknown"

   if last_version == "unknown":
     info["downloadInterval"] = "unknown"
   elif last_version == "0":
     info["downloadInterval"] = "unknown"
     info["firstDownload"] = info["firstInMonth"] = info["firstInWeek"] = info["firstInDay"] = True
   else:
     try:
-      last_update = datetime.strptime(last_version, "%Y%m%d%H%M")
+      last_update = parse_lastversion(last_version)
       diff = info["time"] - last_update
       if diff.days >= 365:
         info["downloadInterval"] = "%i year(s)" % (diff.days / 365)
       elif diff.days >= 30:
         info["downloadInterval"] = "%i month(s)" % (diff.days / 30)
       elif diff.days >= 1:
         info["downloadInterval"] = "%i day(s)" % diff.days
       else:
         info["downloadInterval"] = "%i hour(s)" % (diff.seconds / 3600)

       if last_update.year != info["time"].year or last_update.month != info["time"].month:
         info["firstInMonth"] = info["firstInDay"] = True
       elif last_update.day != info["time"].day:
         info["firstInDay"] = True

-      if last_update.isocalendar()[0:2] != info["time"].isocalendar()[0:2]:
+      if get_week(last_update) != get_week(info["time"]):
         info["firstInWeek"] = True
     except ValueError:
       info["downloadInterval"] = "unknown"
       pass
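A worked example of the bucketing above, with assumed values rather than data from a real log:

  from datetime import datetime
  last_update = datetime(2013, 8, 1, 0, 0)   # as parsed from lastVersion=201308010000
  now = datetime(2013, 8, 29, 13, 0)         # request timestamp
  diff = now - last_update                   # 28 days and 13 hours
  print "%i day(s)" % diff.days              # falls into the "28 day(s)" bucket
  # same month, but different day and ISO week -> firstInDay and firstInWeek are set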
 def parse_addon_name(file):
   if "/" in file:
     return file.split("/")[-2]
   else:
     return None

(... skipping 141 matching lines ...)

 if __name__ == "__main__":
   setupStderr()

   verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose")
   geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE)
   result = parse_stdin(geo, verbose)

   with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file:
     simplejson.dump(result, file, indent=2, sort_keys=True)
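As the __main__ block shows, the script takes an optional first argument "verbose", reads its input via parse_stdin (presumably access-log lines on standard input, given the parsing code above), resolves the GeoIP database and output location from the "stats" section of the sitescripts configuration, and writes the aggregated result as JSON to that tempFile path.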
