Rietveld Code Review Tool

Delta Between Two Patch Sets: sitescripts/stats/bin/logprocessor.py

Issue 11481051: Update stats processing (Closed)
Left Patch Set: "String concatenation fix", created Aug. 27, 2013, 12:41 p.m.
Right Patch Set: "Improved performance using memoization", created Aug. 29, 2013, 1:39 p.m.
 # coding: utf-8

 # This file is part of the Adblock Plus web scripts,
 # Copyright (C) 2006-2013 Eyeo GmbH
 #
 # Adblock Plus is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 3 as
 # published by the Free Software Foundation.
 #
 # Adblock Plus is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

 import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson
+from collections import OrderedDict
 import sitescripts.stats.common as common
 from sitescripts.utils import get_config, setupStderr
 from datetime import datetime, timedelta

 log_regexp = None
 mirror_name = None
 gecko_apps = None

+def cache_lru(func):
+  """
+  Decorator that memoizes the return values of a single-parameter function in
+  case it is called again with the same parameter. The 1024 most recent
+  results are saved.
+  """
+
+  results = OrderedDict()
+  results.entries_left = 1024
+
+  def wrapped(arg):
+    if arg in results:
+      result = results[arg]
+      del results[arg]
+    else:
+      if results.entries_left > 0:
+        results.entries_left -= 1
+      else:
+        results.popitem(last=False)
+      result = func(arg)
+    results[arg] = result
+    return result
+  return wrapped
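The cache is an OrderedDict used as an LRU store: a hit is deleted and re-inserted so it moves to the most recently used end, and once the 1024 slots are used up, popitem(last=False) evicts the oldest entry. A minimal usage sketch follows; the decorated helper is hypothetical and not part of the patch. On Python 3.2+ functools.lru_cache(maxsize=1024) offers equivalent behavior, but this script targets Python 2.

  @cache_lru
  def normalize_token(token):       # hypothetical single-argument helper
    return token.strip().lower()    # any deterministic computation qualifies

  normalize_token("  Foo ")         # computed and stored in the cache
  normalize_token("  Foo ")         # same argument: returned from the cache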
+def cache_last(func):
+  """
+  Decorator that memoizes the last return value of a function in case it is
+  called again with the same parameters.
+  """
+  result = {"args": None, "result": None}
+
+  def wrapped(*args):
+    if args != result["args"]:
+      result["result"] = func(*args)
+      result["args"] = args
+    return result["result"]
+  return wrapped
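cache_last remembers only the single most recent (args, result) pair, which pays off when consecutive calls repeat the same arguments, as with timestamps parsed from adjacent log lines. A small sketch with a hypothetical function:

  @cache_last
  def join_fields(a, b):        # hypothetical, any positional arguments work
    return a + "/" + b

  join_fields("2013", "08")     # computed
  join_fields("2013", "08")     # identical to the previous call: cached result
  join_fields("2013", "09")     # different arguments: recomputed and cached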
+@cache_lru
 def parse_ua(ua):
   # Opera might disguise itself as another browser so it needs to go first
   match = re.search(r"\bOpera/([\d\.]+)", ua)

Felix Dahlke (2013/08/28 17:25:32): Memoization helped save around 5% here. In an acce…

Sebastian Noack (2013/08/29 10:54:30): Keys in python dicts are already unique. So when y…

   if match:
     # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA
     match2 = re.search(r"\bVersion/([\d\.]+)", ua)
     if match2:
       return "Opera", match2.group(1)
     else:
       return "Opera", match.group(1)

   # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it
   match = re.search(r"\bOPR/(\d+\.\d+)", ua)

(... skipping 70 matching lines ...)

   if match:
     ip = match.group(1)

   country = geo.country_code_by_addr(ip)
   if country in (None, "", "--"):
     country = "unknown"
   country = country.lower()

   return ip, country

+@cache_last
 def parse_time(timestr, tz_hours, tz_minutes):
   result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S")
   result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_hours))
   return result, result.strftime("%Y%m"), result.day, result.weekday(), result.hour

+@cache_lru
 def parse_path(path):
   urlparts = urlparse.urlparse(path)
   try:
     path = urllib.unquote(urlparts.path).decode("utf-8")
   except:
     path = urlparts.path
   return path[1:], urlparts.query

+@cache_lru
+def parse_query(query):
+  return urlparse.parse_qs(query)
+
+@cache_lru
+def parse_lastversion(last_version):
+  return datetime.strptime(last_version, "%Y%m%d%H%M")
+
+@cache_lru
+def get_week(date):
+  return date.isocalendar()[0:2]
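These thin wrappers exist so the single-argument cache_lru decorator can memoize otherwise repeated calls such as urlparse.parse_qs and datetime.strptime. The same pattern would fit any other hot single-argument call; a hypothetical sketch (not part of the patch) using the already-imported socket module:

  @cache_lru
  def resolve_host(ip):          # hypothetical helper; the argument must be hashable
    return socket.getfqdn(ip)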
 def parse_downloader_query(info):
-  params = urlparse.parse_qs(info["query"])
+  params = parse_query(info["query"])
   for param in ("addonName", "addonVersion", "application", "applicationVersion", "platform", "platformVersion"):
     info[param] = params.get(param, ["unknown"])[0]

   # Only leave the major and minor release number for application and platform
   info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["applicationVersion"])
   info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVersion"])

   # Chrome Adblock sends an X-Client-ID header instead of URL parameters
   match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clientid"] else None
   if match:
     info["addonName"] = "chromeadblock"
     info["addonVersion"] = match.group(1)

   last_version = params.get("lastVersion", ["unknown"])[0]
   if info["file"] == "notification.json" and last_version == "0" and (
       (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1") or
       (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info["addonVersion"] == "1.5.2")
     ):
     # Broken notification version number in these releases, treat like unknown
     last_version = "unknown"

   if last_version == "unknown":
     info["downloadInterval"] = "unknown"
   elif last_version == "0":
     info["downloadInterval"] = "unknown"
     info["firstDownload"] = info["firstInMonth"] = info["firstInWeek"] = info["firstInDay"] = True
   else:
     try:
-      last_update = datetime.strptime(last_version, "%Y%m%d%H%M")
+      last_update = parse_lastversion(last_version)
       diff = info["time"] - last_update
       if diff.days >= 365:
         info["downloadInterval"] = "%i year(s)" % (diff.days / 365)
       elif diff.days >= 30:
         info["downloadInterval"] = "%i month(s)" % (diff.days / 30)
       elif diff.days >= 1:
         info["downloadInterval"] = "%i day(s)" % diff.days
       else:
         info["downloadInterval"] = "%i hour(s)" % (diff.seconds / 3600)

       if last_update.year != info["time"].year or last_update.month != info["time"].month:
         info["firstInMonth"] = info["firstInDay"] = True
       elif last_update.day != info["time"].day:
         info["firstInDay"] = True

-      if last_update.isocalendar()[0:2] != info["time"].isocalendar()[0:2]:
+      if get_week(last_update) != get_week(info["time"]):
         info["firstInWeek"] = True
     except ValueError:
       info["downloadInterval"] = "unknown"
       pass
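A worked example of the bucketing above, with assumed values rather than data from a real log:

  from datetime import datetime
  last_update = datetime(2013, 8, 1, 0, 0)   # as parsed from lastVersion=201308010000
  now = datetime(2013, 8, 29, 13, 0)         # request timestamp
  diff = now - last_update                   # 28 days and 13 hours
  print "%i day(s)" % diff.days              # falls into the "28 day(s)" bucket
  # same month, but different day and ISO week -> firstInDay and firstInWeek are set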
 def parse_addon_name(file):
   if "/" in file:
     return file.split("/")[-2]
   else:
     return None

(... skipping 141 matching lines ...)

 if __name__ == "__main__":
   setupStderr()

   verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose")
   geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE)
   result = parse_stdin(geo, verbose)

   with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file:
     simplejson.dump(result, file, indent=2, sort_keys=True)
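As the __main__ block shows, the script takes an optional first argument "verbose", reads its input via parse_stdin (presumably access-log lines on standard input, given the parsing code above), resolves the GeoIP database and output location from the "stats" section of the sitescripts configuration, and writes the aggregated result as JSON to that tempFile path.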
