Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: sitescripts/stats/bin/logprocessor.py

Issue 11481051: Update stats processing (Closed)
Left Patch Set: Fixed two presentation issues Created Aug. 24, 2013, 1:11 p.m.
Right Patch Set: Improved performance using memoization Created Aug. 29, 2013, 1:39 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « sitescripts/stats/bin/datamerger.py ('k') | sitescripts/stats/bin/pagegenerator.py » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFTRIGHT
1 # coding: utf-8 1 # coding: utf-8
2 2
3 # This file is part of the Adblock Plus web scripts, 3 # This file is part of the Adblock Plus web scripts,
4 # Copyright (C) 2006-2013 Eyeo GmbH 4 # Copyright (C) 2006-2013 Eyeo GmbH
5 # 5 #
6 # Adblock Plus is free software: you can redistribute it and/or modify 6 # Adblock Plus is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License version 3 as 7 # it under the terms of the GNU General Public License version 3 as
8 # published by the Free Software Foundation. 8 # published by the Free Software Foundation.
9 # 9 #
10 # Adblock Plus is distributed in the hope that it will be useful, 10 # Adblock Plus is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details. 13 # GNU General Public License for more details.
14 # 14 #
15 # You should have received a copy of the GNU General Public License 15 # You should have received a copy of the GNU General Public License
16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
17 17
18 import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson 18 import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson
19 from collections import OrderedDict
19 import sitescripts.stats.common as common 20 import sitescripts.stats.common as common
20 from sitescripts.utils import get_config, setupStderr 21 from sitescripts.utils import get_config, setupStderr
21 from datetime import datetime, timedelta 22 from datetime import datetime, timedelta
22 23
23 log_regexp = None 24 log_regexp = None
24 mirror_name = None 25 mirror_name = None
25 gecko_apps = None 26 gecko_apps = None
26 27
28 def cache_lru(func):
29 """
30 Decorator that memoizes the return values of a single-parameter function in
31 case it is called again with the same parameter. The 1024 most recent
32 results are saved.
33 """
34
35 results = OrderedDict()
36 results.entries_left = 1024
37
38 def wrapped(arg):
39 if arg in results:
40 result = results[arg]
41 del results[arg]
42 else:
43 if results.entries_left > 0:
44 results.entries_left -= 1
45 else:
46 results.popitem(last=False)
47 result = func(arg)
48 results[arg] = result
49 return result
50 return wrapped
51
52
53 def cache_last(func):
54 """
55 Decorator that memoizes the last return value of a function in case it is
56 called again with the same parameters.
57 """
58 result = {"args": None, "result": None}
59
60 def wrapped(*args):
61 if args != result["args"]:
62 result["result"] = func(*args)
63 result["args"] = args
64 return result["result"]
65 return wrapped
66
67
68 @cache_lru
27 def parse_ua(ua): 69 def parse_ua(ua):
28 # Opera might disguise itself as other browser so it needs to go first 70 # Opera might disguise itself as other browser so it needs to go first
29 match = re.search(r"\bOpera/([\d\.]+)", ua) 71 match = re.search(r"\bOpera/([\d\.]+)", ua)
30 if match: 72 if match:
31 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA 73 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA
32 match2 = re.search(r"\bVersion/([\d\.]+)", ua) 74 match2 = re.search(r"\bVersion/([\d\.]+)", ua)
33 if match2: 75 if match2:
34 return "Opera", match2.group(1) 76 return "Opera", match2.group(1)
35 else: 77 else:
36 return "Opera", match.group(1) 78 return "Opera", match.group(1)
37 79
38 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it 80 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it
39 match = re.search(r"\bOPR/(\d+\.\d+)", ua) 81 match = re.search(r"\bOPR/(\d+\.\d+)", ua)
40 if match: 82 if match:
41 return "Opera", match.group(1) 83 return "Opera", match.group(1)
42 84
43 for appName in ("Fennec", "Thunderbird", "SeaMonkey", "Songbird", "K-Meleon", "Prism", "Firefox"): 85 # Have to check for these before Firefox, they will usually have a Firefox identifier as well
44 match = re.search(r"\b%s/(\d+\.\d+)" % appName, ua) 86 match = re.search(r"\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/( \d+\.\d+)", ua)
Sebastian Noack 2013/08/26 16:05:22 Instead of iterating over the list of browsers and
Wladimir Palant 2013/08/27 07:34:28 This was done like that intentionally - quite a fe
45 if match: 87 if match:
46 if appName == "Fennec" or (appName == "Firefox" and re.search(r"\bMobile;" , ua)): 88 if match.group(1) == "Fennec":
47 return "Firefox Mobile", match.group(1) 89 return "Firefox Mobile", match.group(2)
48 else: 90 else:
49 return appName, match.group(1) 91 return match.group(1), match.group(2)
92
93 match = re.search(r"\bFirefox/(\d+\.\d+)", ua)
94 if match:
95 if re.search(r"\bMobile;", ua):
96 return "Firefox Mobile", match.group(1)
97 else:
98 return "Firefox", match.group(1)
50 99
51 match = re.search(r"\brv:(\d+)\.(\d+)(?:\.(\d+))?", ua) 100 match = re.search(r"\brv:(\d+)\.(\d+)(?:\.(\d+))?", ua)
52 if match and re.search(r"\bGecko/", ua): 101 if match and re.search(r"\bGecko/", ua):
53 if match.group(3) and int(match.group(1)) < 2: 102 if match.group(3) and int(match.group(1)) < 2:
54 return "Gecko", "%s.%s.%s" % (match.group(1), match.group(2), match.group( 3)) 103 return "Gecko", "%s.%s.%s" % (match.group(1), match.group(2), match.group( 3))
Sebastian Noack 2013/08/26 16:05:22 You could just call match.groups(), which alr
Wladimir Palant 2013/08/27 07:34:28 I think I rather keep this as is for consistency w
55 else: 104 else:
56 return "Gecko", "%s.%s" % (match.group(1), match.group(2)) 105 return "Gecko", "%s.%s" % (match.group(1), match.group(2))
57 106
58 match = re.search(r"\bCoolNovo/(\d+\.\d+\.\d+)", ua) 107 match = re.search(r"\bCoolNovo/(\d+\.\d+\.\d+)", ua)
59 if match: 108 if match:
60 return "CoolNovo", match.group(1) 109 return "CoolNovo", match.group(1)
61 110
62 match = re.search(r"\bChrome/(\d+\.\d+)", ua) 111 match = re.search(r"\bChrome/(\d+\.\d+)", ua)
63 if match: 112 if match:
64 return "Chrome", match.group(1) 113 return "Chrome", match.group(1)
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
103 if match: 152 if match:
104 ip = match.group(1) 153 ip = match.group(1)
105 154
106 country = geo.country_code_by_addr(ip) 155 country = geo.country_code_by_addr(ip)
107 if country in (None, "", "--"): 156 if country in (None, "", "--"):
108 country = "unknown" 157 country = "unknown"
109 country = country.lower() 158 country = country.lower()
110 159
111 return ip, country 160 return ip, country
112 161
162 @cache_last
113 def parse_time(timestr, tz_hours, tz_minutes): 163 def parse_time(timestr, tz_hours, tz_minutes):
114 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S") 164 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S")
115 result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_h ours)) 165 result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_h ours))
116 return result, result.strftime("%Y%m"), result.day, result.weekday(), result.h our 166 return result, result.strftime("%Y%m"), result.day, result.weekday(), result.h our
117 167
168 @cache_lru
118 def parse_path(path): 169 def parse_path(path):
119 urlparts = urlparse.urlparse(path) 170 urlparts = urlparse.urlparse(path)
120 try: 171 try:
121 path = urllib.unquote(urlparts.path).decode("utf-8") 172 path = urllib.unquote(urlparts.path).decode("utf-8")
122 except: 173 except:
123 path = urlparts.path 174 path = urlparts.path
124 return path[1:], urlparts.query 175 return path[1:], urlparts.query
125 176
177 @cache_lru
178 def parse_query(query):
179 return urlparse.parse_qs(query)
180
181 @cache_lru
182 def parse_lastversion(last_version):
183 return datetime.strptime(last_version, "%Y%m%d%H%M")
184
185 @cache_lru
186 def get_week(date):
187 return date.isocalendar()[0:2]
188
126 def parse_downloader_query(info): 189 def parse_downloader_query(info):
127 params = urlparse.parse_qs(info["query"]) 190 params = parse_query(info["query"])
128 for param in ("addonName", "addonVersion", "application", "applicationVersion" , "platform", "platformVersion"): 191 for param in ("addonName", "addonVersion", "application", "applicationVersion" , "platform", "platformVersion"):
129 info[param] = params.get(param, ["unknown"])[0] 192 info[param] = params.get(param, ["unknown"])[0]
130 193
131 # Only leave the major and minor release number for application and platform 194 # Only leave the major and minor release number for application and platform
132 info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["application Version"]) 195 info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["application Version"])
133 info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVersio n"]) 196 info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVersio n"])
134 197
135 # Chrome Adblock sends an X-Client-ID header instead of URL parameters 198 # Chrome Adblock sends an X-Client-ID header instead of URL parameters
136 match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clie ntid"] else None 199 match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clie ntid"] else None
137 if match: 200 if match:
138 info["addonName"] = "chromeadblock" 201 info["addonName"] = "chromeadblock"
139 info["addonVersion"] = match.group(1) 202 info["addonVersion"] = match.group(1)
140 203
141 last_version = params.get("lastVersion", ["unknown"])[0] 204 last_version = params.get("lastVersion", ["unknown"])[0]
142 if info["file"] == "notification.json" and last_version == "0" and ( 205 if info["file"] == "notification.json" and last_version == "0" and (
143 (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1") o r 206 (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1") o r
144 (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info[" addonVersion"] == "1.5.2") 207 (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info[" addonVersion"] == "1.5.2")
145 ): 208 ):
146 # Broken notification version number in these releases, treat like unknown 209 # Broken notification version number in these releases, treat like unknown
147 last_version = "unknown" 210 last_version = "unknown"
148 211
149 if last_version == "unknown": 212 if last_version == "unknown":
150 info["downloadInterval"] = "unknown" 213 info["downloadInterval"] = "unknown"
151 elif last_version == "0": 214 elif last_version == "0":
152 info["downloadInterval"] = "unknown" 215 info["downloadInterval"] = "unknown"
153 info["firstDownload"] = info["firstInMonth"] = info["firstInWeek"] = info["f irstInDay"] = True 216 info["firstDownload"] = info["firstInMonth"] = info["firstInWeek"] = info["f irstInDay"] = True
154 else: 217 else:
155 try: 218 try:
156 last_update = datetime.strptime(last_version, "%Y%m%d%H%M") 219 last_update = parse_lastversion(last_version)
157 diff = info["time"] - last_update 220 diff = info["time"] - last_update
158 if diff.days >= 365: 221 if diff.days >= 365:
159 info["downloadInterval"] = "%i year(s)" % (diff.days / 365) 222 info["downloadInterval"] = "%i year(s)" % (diff.days / 365)
160 elif diff.days >= 30: 223 elif diff.days >= 30:
161 info["downloadInterval"] = "%i month(s)" % (diff.days / 30) 224 info["downloadInterval"] = "%i month(s)" % (diff.days / 30)
162 elif diff.days >= 1: 225 elif diff.days >= 1:
163 info["downloadInterval"] = "%i day(s)" % diff.days 226 info["downloadInterval"] = "%i day(s)" % diff.days
164 else: 227 else:
165 info["downloadInterval"] = "%i hour(s)" % (diff.seconds / 3600) 228 info["downloadInterval"] = "%i hour(s)" % (diff.seconds / 3600)
166 229
167 if last_update.year != info["time"].year or last_update.month != info["tim e"].month: 230 if last_update.year != info["time"].year or last_update.month != info["tim e"].month:
168 info["firstInMonth"] = info["firstInDay"] = True 231 info["firstInMonth"] = info["firstInDay"] = True
169 elif last_update.day != info["time"].day: 232 elif last_update.day != info["time"].day:
170 info["firstInDay"] = True 233 info["firstInDay"] = True
171 234
172 if last_update.isocalendar()[0:2] != info["time"].isocalendar()[0:2]: 235 if get_week(last_update) != get_week(info["time"]):
173 info["firstInWeek"] = True 236 info["firstInWeek"] = True
174 except ValueError: 237 except ValueError:
175 info["downloadInterval"] = "unknown" 238 info["downloadInterval"] = "unknown"
176 pass 239 pass
177 240
178 def parse_addon_name(file): 241 def parse_addon_name(file):
179 if "/" in file: 242 if "/" in file:
180 return file.split("/")[-2] 243 return file.split("/")[-2]
181 else: 244 else:
182 return None 245 return None
(...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after
324 387
325 if __name__ == "__main__": 388 if __name__ == "__main__":
326 setupStderr() 389 setupStderr()
327 390
328 verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose") 391 verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose")
329 geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CAC HE) 392 geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CAC HE)
330 result = parse_stdin(geo, verbose) 393 result = parse_stdin(geo, verbose)
331 394
332 with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8" ) as file: 395 with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8" ) as file:
333 simplejson.dump(result, file, indent=2, sort_keys=True) 396 simplejson.dump(result, file, indent=2, sort_keys=True)
LEFTRIGHT

Powered by Google App Engine
This is Rietveld