Rietveld Code Review Tool

Side by Side Diff: sitescripts/stats/bin/logprocessor.py

Issue 11481051: Update stats processing (Closed)
Patch Set: Improved performance using memoization (created Aug. 29, 2013, 1:39 p.m.)
 # coding: utf-8
 
 # This file is part of the Adblock Plus web scripts,
 # Copyright (C) 2006-2013 Eyeo GmbH
 #
 # Adblock Plus is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 3 as
 # published by the Free Software Foundation.
 #
 # Adblock Plus is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
 
-import sys, re, math, GeoIP
+import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson
+from collections import OrderedDict
+import sitescripts.stats.common as common
 from sitescripts.utils import get_config, setupStderr
 from datetime import datetime, timedelta
-from ConfigParser import SafeConfigParser, NoOptionError
-
-def parseUA(ua):
+
+log_regexp = None
+mirror_name = None
+gecko_apps = None
+
+def cache_lru(func):
+  """
+  Decorator that memoizes the return values of a single-parameter function in
+  case it is called again with the same parameter. The 1024 most recent
+  results are saved.
+  """
+
+  results = OrderedDict()
+  results.entries_left = 1024
+
+  def wrapped(arg):
+    if arg in results:
+      result = results[arg]
+      del results[arg]
+    else:
+      if results.entries_left > 0:
+        results.entries_left -= 1
+      else:
+        results.popitem(last=False)
+      result = func(arg)
+    results[arg] = result
+    return result
+  return wrapped
+
+
+def cache_last(func):
+  """
+  Decorator that memoizes the last return value of a function in case it is
+  called again with the same parameters.
+  """
+  result = {"args": None, "result": None}
+
+  def wrapped(*args):
+    if args != result["args"]:
+      result["result"] = func(*args)
+      result["args"] = args
+    return result["result"]
+  return wrapped
+
+
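A quick illustration of the caching behaviour (not part of the patch; the function and counter names below are made up): repeated calls with the same argument are answered from the cache instead of re-running the function, and cache_lru keeps at most 1024 distinct arguments, evicting the least recently used one when it is full.

    calls = [0]

    @cache_lru
    def slow_lookup(key):
      calls[0] += 1
      return key.upper()

    slow_lookup("opera")   # computed, calls == [1]
    slow_lookup("opera")   # served from the cache, calls still == [1]

cache_last, in contrast, only remembers the most recent argument tuple; the patch applies it to parse_time, presumably because consecutive log lines often share the same timestamp.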
+@cache_lru
+def parse_ua(ua):
   # Opera might disguise itself as other browser so it needs to go first
-  match = re.search(r'\bOpera/([\d\.]+)', ua)
+  match = re.search(r"\bOpera/([\d\.]+)", ua)
   if match:
     # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA
-    match2 = re.search(r'\bVersion/([\d\.]+)', ua)
+    match2 = re.search(r"\bVersion/([\d\.]+)", ua)
     if match2:
-      return 'Opera %s' % match2.group(1)
+      return "Opera", match2.group(1)
     else:
-      return 'Opera %s' % match.group(1)
+      return "Opera", match.group(1)
 
   # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it
-  match = re.search(r'\bOPR/(\d+\.\d+)', ua)
+  match = re.search(r"\bOPR/(\d+\.\d+)", ua)
   if match:
-    return 'Opera %s' % match.group(1)
+    return "Opera", match.group(1)
 
-  for appName in ('Fennec', 'Thunderbird', 'SeaMonkey', 'Songbird', 'K-Meleon', 'Prism', 'Firefox'):
-    match = re.search(r'\b%s/(\d+\.\d+)' % appName, ua)
-    if match:
-      return '%s %s' % (appName, match.group(1))
-
-  match = re.search(r'\brv:(\d+)\.(\d+)(?:\.(\d+))?', ua)
-  if match and re.search(r'\bGecko/', ua):
+  # Have to check for these before Firefox, they will usually have a Firefox identifier as well
+  match = re.search(r"\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)", ua)
+  if match:
+    if match.group(1) == "Fennec":
+      return "Firefox Mobile", match.group(2)
+    else:
+      return match.group(1), match.group(2)
+
+  match = re.search(r"\bFirefox/(\d+\.\d+)", ua)
+  if match:
+    if re.search(r"\bMobile;", ua):
+      return "Firefox Mobile", match.group(1)
+    else:
+      return "Firefox", match.group(1)
+
+  match = re.search(r"\brv:(\d+)\.(\d+)(?:\.(\d+))?", ua)
+  if match and re.search(r"\bGecko/", ua):
     if match.group(3) and int(match.group(1)) < 2:
-      return 'Gecko %s.%s.%s' % (match.group(1), match.group(2), match.group(3))
+      return "Gecko", "%s.%s.%s" % (match.group(1), match.group(2), match.group(3))
     else:
-      return 'Gecko %s.%s' % (match.group(1), match.group(2))
+      return "Gecko", "%s.%s" % (match.group(1), match.group(2))
 
-  match = re.search(r'\bChrome/(\d+\.\d+)', ua)
+  match = re.search(r"\bCoolNovo/(\d+\.\d+\.\d+)", ua)
   if match:
-    return 'Chrome %s' % match.group(1)
+    return "CoolNovo", match.group(1)
 
-  match = re.search(r'\bVersion/(\d+\.\d+)', ua)
-  if match and re.search(r'\Safari/', ua):
-    return 'Safari %s' % match.group(1)
+  match = re.search(r"\bChrome/(\d+\.\d+)", ua)
+  if match:
+    return "Chrome", match.group(1)
 
-  if re.search(r'\bAppleWebKit/', ua):
-    return 'WebKit'
-
-  match = re.search(r'\bMSIE (\d+\.\d+)', ua)
-  if match:
-    return 'MSIE %s' % match.group(1)
-
-  return 'Other'
+  match = re.search(r"\bVersion/(\d+\.\d+)", ua)
+  if match and re.search(r"\bMobile Safari/", ua):
+    return "Mobile Safari", match.group(1)
+  if match and re.search(r"\bSafari/", ua):
+    return "Safari", match.group(1)
+
+  if re.search(r"\bAppleWebKit/", ua):
+    return "WebKit", ""
+
+  match = re.search(r"\bMSIE (\d+\.\d+)", ua)
+  if match:
+    return "MSIE", match.group(1)
+
+  match = re.search(r"\bTrident/(\d+\.\d+)", ua)
+  if match:
+    return "Trident", match.group(1)
+
+  match = re.search(r"\bAndroidDownloadManager(?:/(\d+\.\d+))?", ua)
+  if match:
+    return "Android", match.group(1) or ""
+
+  match = re.search(r"\bDalvik/.*\bAndroid (\d+\.\d+)", ua)
+  if match:
+    return "Android", match.group(1)
+
+  # ABP/Android downloads use that user agent
+  if ua.startswith("Apache-HttpClient/UNAVAILABLE"):
+    return "Android", ""
+
+  # ABP/IE downloads use that user agent
+  if ua == "Adblock Plus":
+    return "ABP", ""
+
+  return "Other", ""
 
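A few illustrative inputs and return values for parse_ua (sample user-agent strings, abbreviated; not part of the patch):

    parse_ua("Opera/9.80 (Windows NT 6.1) Presto/2.12.388 Version/12.16")
    # -> ("Opera", "12.16")
    parse_ua("Mozilla/5.0 (X11; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0")
    # -> ("Firefox", "23.0")
    parse_ua("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36")
    # -> ("Chrome", "29.0")
    parse_ua("curl/7.32.0")
    # -> ("Other", "")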
+def process_ip(ip, geo):
+  match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip)
+  if match:
+    ip = match.group(1)
+
+  country = geo.country_code_by_addr(ip)
+  if country in (None, "", "--"):
+    country = "unknown"
+  country = country.lower()
+
+  return ip, country
+
+@cache_last
+def parse_time(timestr, tz_hours, tz_minutes):
+  result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S")
+  result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_hours))
+  return result, result.strftime("%Y%m"), result.day, result.weekday(), result.hour
+
+@cache_lru
+def parse_path(path):
+  urlparts = urlparse.urlparse(path)
+  try:
+    path = urllib.unquote(urlparts.path).decode("utf-8")
+  except:
+    path = urlparts.path
+  return path[1:], urlparts.query
+
+@cache_lru
+def parse_query(query):
+  return urlparse.parse_qs(query)
+
+@cache_lru
+def parse_lastversion(last_version):
+  return datetime.strptime(last_version, "%Y%m%d%H%M")
+
+@cache_lru
+def get_week(date):
+  return date.isocalendar()[0:2]
+
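Illustrative calls for two of these helpers (values made up; not part of the patch). parse_time shifts the timestamp back to UTC using the timezone offset captured from the log line:

    parse_time("29/Aug/2013:13:39:00", 2, 0)
    # -> (datetime(2013, 8, 29, 11, 39), "201308", 29, 3, 11)
    #    i.e. UTC time, month key, day of month, weekday (Thursday), hour

    parse_path("/exceptionrules.txt?addonName=adblockplus&addonVersion=2.3.2")
    # -> (u"exceptionrules.txt", "addonName=adblockplus&addonVersion=2.3.2")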
+def parse_downloader_query(info):
+  params = parse_query(info["query"])
+  for param in ("addonName", "addonVersion", "application", "applicationVersion", "platform", "platformVersion"):
+    info[param] = params.get(param, ["unknown"])[0]
+
+  # Only leave the major and minor release number for application and platform
+  info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["applicationVersion"])
+  info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVersion"])
+
+  # Chrome Adblock sends an X-Client-ID header insteads of URL parameters
+  match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clientid"] else None
+  if match:
+    info["addonName"] = "chromeadblock"
+    info["addonVersion"] = match.group(1)
+
+  last_version = params.get("lastVersion", ["unknown"])[0]
+  if info["file"] == "notification.json" and last_version == "0" and (
+      (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1") or
+      (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info["addonVersion"] == "1.5.2")
+    ):
+    # Broken notification version number in these releases, treat like unknown
+    last_version = "unknown"
+
+  if last_version == "unknown":
+    info["downloadInterval"] = "unknown"
+  elif last_version == "0":
+    info["downloadInterval"] = "unknown"
+    info["firstDownload"] = info["firstInMonth"] = info["firstInWeek"] = info["firstInDay"] = True
+  else:
+    try:
+      last_update = parse_lastversion(last_version)
+      diff = info["time"] - last_update
+      if diff.days >= 365:
+        info["downloadInterval"] = "%i year(s)" % (diff.days / 365)
+      elif diff.days >= 30:
+        info["downloadInterval"] = "%i month(s)" % (diff.days / 30)
+      elif diff.days >= 1:
+        info["downloadInterval"] = "%i day(s)" % diff.days
+      else:
+        info["downloadInterval"] = "%i hour(s)" % (diff.seconds / 3600)
+
+      if last_update.year != info["time"].year or last_update.month != info["time"].month:
+        info["firstInMonth"] = info["firstInDay"] = True
+      elif last_update.day != info["time"].day:
+        info["firstInDay"] = True
+
+      if get_week(last_update) != get_week(info["time"]):
+        info["firstInWeek"] = True
+    except ValueError:
+      info["downloadInterval"] = "unknown"
+      pass
+
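To make the interval bucketing concrete, a sketch with made-up values (only the fields this function reads are filled in): a client whose lastVersion parameter says it last downloaded on Aug. 1, 2013 and that checks in again on Aug. 29, 2013 falls into the day bucket:

    info = {"file": "easylist.txt", "clientid": None,
            "time": datetime(2013, 8, 29, 11, 39),
            "query": "addonName=adblockplus&addonVersion=2.3.2&lastVersion=201308010000"}
    parse_downloader_query(info)
    # info["downloadInterval"] == "28 day(s)"
    # info["firstInDay"] and info["firstInWeek"] are set, firstInMonth is not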
+def parse_addon_name(file):
+  if "/" in file:
+    return file.split("/")[-2]
+  else:
+    return None
+
+def parse_gecko_query(query):
+  params = urlparse.parse_qs(query)
+
+  version = params.get("version", ["unknown"])[0]
+
+  global gecko_apps
+  if gecko_apps == None:
+    from buildtools.packagerGecko import KNOWN_APPS
+    gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()}
+  appID = params.get("appID", ["unknown"])[0]
+
+  application = gecko_apps.get(appID, "unknown")
+  applicationVersion = params.get("appVersion", ["unknown"])[0]
+
+  # Only leave the major and minor release number for application
+  applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion)
+
+  return version, application, applicationVersion
+
+def parse_chrome_query(query):
+  params = urlparse.parse_qs(query)
+
+  if params.get("prod", ["unknown"])[0] in ("chromecrx", "chromiumcrx"):
+    application = "chrome"
+  else:
+    application = "unknown"
+  applicationVersion = params.get("prodversion", ["unknown"])[0]
+
+  params2 = urlparse.parse_qs(params.get("x", [""])[0])
+  version = params2.get("v", ["unknown"])[0]
+
+  # Only leave the major and minor release number for application
+  applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion)
+
+  return version, application, applicationVersion
+
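For illustration, a hypothetical Chrome update-check query string in the shape this code expects (the nested x parameter is itself a percent-encoded query string):

    parse_chrome_query("prod=chromecrx&prodversion=29.0.1547.57&x=id%3Dexample%26v%3D1.5.3")
    # -> ("1.5.3", "chrome", "29.0")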
+def parse_update_flag(query):
+  return "update" if query == "update" else "install"
+
+def parse_record(line, ignored, geo):
+  global log_regexp, mirror_name
+  if log_regexp == None:
+    log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?')
+  if mirror_name == None:
+    mirror_name = re.sub(r"\..*", "", socket.gethostname())
+
+  match = re.search(log_regexp, line)
+  if not match:
+    return None
+
+  status = int(match.group(6))
+  if status != 200:
+    return None
+
+  info = {
+    "mirror": mirror_name,
+    "size": int(match.group(7)),
+  }
+
+  info["ip"], info["country"] = process_ip(match.group(1), geo)
+  info["time"], info["month"], info["day"], info["weekday"], info["hour"] = parse_time(match.group(2), int(match.group(3)), int(match.group(4)))
+  info["file"], info["query"] = parse_path(match.group(5))
+  info["ua"], info["uaversion"] = parse_ua(match.group(8))
+  info["fullua"] = "%s %s" % (info["ua"], info["uaversion"])
+  info["clientid"] = match.group(9)
+
+  # Additional metadata depends on file type
+  filename = os.path.basename(info["file"])
+  ext = os.path.splitext(filename)[1]
+  if ext == ".txt" or filename == "update.json" or filename == "notification.json":
+    # Subscription downloads, libadblockplus update checks and notification
+    # checks are performed by the downloader
+    parse_downloader_query(info)
+  elif ext == ".tpl":
+    # MSIE TPL download, no additional data here
+    pass
+  elif ext in (".xpi", ".crx", ".apk", ".msi", ".exe"):
+    # Package download, might be an update
+    info["installType"] = parse_update_flag(info["query"])
+  elif filename == "update.rdf":
+    # Gecko update check or a legacy Android update check. The latter doesn't
+    # have usable data anyway so trying the Chrome route won't do any harm.
+    info["addonName"] = parse_addon_name(info["file"])
+    info["addonVersion"], info["application"], info["applicationVersion"] = parse_gecko_query(info["query"])
+  elif filename == "updates.xml":
+    # Chrome update check
+    info["addonName"] = parse_addon_name(info["file"])
+    info["addonVersion"], info["application"], info["applicationVersion"] = parse_chrome_query(info["query"])
+  else:
+    ignored.add(info["file"])
+    return None
+
+  if "addonName" in info:
+    info["fullAddon"] = "%s %s" % (info["addonName"], info["addonVersion"])
+  if "application" in info:
+    info["fullApplication"] = "%s %s" % (info["application"], info["applicationVersion"])
+  if "platform" in info:
+    info["fullPlatform"] = "%s %s" % (info["platform"], info["platformVersion"])
+  return info
+
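A made-up access-log line in the format the regular expression above expects, assuming geo is the GeoIP handle opened in the __main__ block at the bottom (illustration only, not part of the patch):

    line = ('93.184.216.34 - - [29/Aug/2013:13:39:00 +0200] '
            '"GET /easylist.txt?addonName=adblockplus&addonVersion=2.3.2 HTTP/1.1" '
            '200 123456 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0"')
    info = parse_record(line, set(), geo)
    # info["file"] == "easylist.txt", info["month"] == "201308", info["hour"] == 11,
    # info["fullua"] == "Firefox 23.0", info["addonName"] == "adblockplus"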
+def add_record(info, section, ignore_fields=()):
+  section["hits"] = section.get("hits", 0) + 1
+  section["bandwidth"] = section.get("bandwidth", 0) + info["size"]
+
+  if len(ignore_fields) < 2:
+    for field in map(lambda f: f["name"], common.fields):
+      if field in ignore_fields or field not in info:
+        continue
+
+      value = info[field]
+      if field not in section:
+        section[field] = {}
+      if value not in section[field]:
+        section[field][value] = {}
+
+      add_record(info, section[field][value], ignore_fields + (field,))
+
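The recursion above nests at most two fields deep: every record bumps hits and bandwidth at the top level, under each field value, and under each pair of field values. A sketch of the resulting structure after one 123456-byte hit, assuming the field list from sitescripts.stats.common (not shown in this patch) contains country and ua entries:

    {"hits": 1, "bandwidth": 123456,
     "country": {"de": {"hits": 1, "bandwidth": 123456,
                        "ua": {"Firefox": {"hits": 1, "bandwidth": 123456}}}},
     "ua": {"Firefox": {"hits": 1, "bandwidth": 123456,
                        "country": {"de": {"hits": 1, "bandwidth": 123456}}}}}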
-def parseStdIn(geo):
-  if get_config().has_option('logs', 'subscriptionsSubdir'):
-    subdir = get_config().get('logs', 'subscriptionsSubdir')
-    subdir = re.sub(r'^/+', '', subdir)
-    subdir = re.sub(r'/+$', '', subdir)
-    subdir = re.sub(r'(?=\W)', r'\\', subdir)
-    subdir = subdir + '/'
-  else:
-    subdir = ''
-  regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET (?:\w+://[^/]+)?/%s([\w\-\+\.]+\.(?:txt|tpl))(?:\?[^\s"]*)? [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"' % subdir)
-
-  data = {}
-  for line in sys.stdin:
-    match = re.search(regexp, line)
-    if not match:
-      continue
-
-    ip, time, tzHours, tzMinutes = match.group(1), match.group(2), int(match.group(3)), int(match.group(4))
-    file, status, size, ua = match.group(5), int(match.group(6)), int(match.group(7)), match.group(8)
-    if status != 200 and status != 302 and status != 304:
-      continue
-    if file.startswith('robots.'):
-      continue
-
-    time = datetime.strptime(time, '%d/%b/%Y:%H:%M:%S')
-    time -= timedelta(hours = tzHours, minutes = math.copysign(tzMinutes, tzHours))
-
-    match = re.search(r'^::ffff:(\d+\.\d+\.\d+\.\d+)$', ip)
-    if match:
-      ip = match.group(1)
-    country = geo.country_code_by_addr(ip)
-    if country == '' or country == '--':
-      country = 'unknown'
-
-    ua = parseUA(ua)[:20]
-
-    section = time.strftime('%Y%m')
-    if not section in data:
-      data[section] = {}
-
-    def addResultInt(key, value):
-      if key in data[section]:
-        data[section][key] += value
-      else:
-        data[section][key] = value
-
-    addResultInt('%s hits' % file, 1)
-    addResultInt('%s bandwidth' % file, size)
-    addResultInt('%s hits day %i' % (file, time.day), 1)
-    addResultInt('%s bandwidth day %i' % (file, time.day), size)
-    addResultInt('%s hits hour %i' % (file, time.hour), 1)
-    addResultInt('%s bandwidth hour %i' % (file, time.hour), size)
-    addResultInt('%s hits country %s' % (file, country), 1)
-    addResultInt('%s bandwidth country %s' % (file, country), size)
-    addResultInt('%s hits app %s' % (file, ua), 1)
-    addResultInt('%s bandwidth app %s' % (file, ua), size)
-
-  result = SafeConfigParser()
-  for section in data.iterkeys():
-    result.add_section(section)
-    for key, value in data[section].iteritems():
-      result.set(section, key, str(value))
-  return result
-
+def parse_stdin(geo, verbose):
+  data = {}
+  ignored = set()
+  for line in sys.stdin:
+    info = parse_record(line, ignored, geo)
+    if info == None:
+      continue
+
+    if info["month"] not in data:
+      data[info["month"]] = {}
+    section = data[info["month"]]
+
+    if info["file"] not in section:
+      section[info["file"]] = {}
+    section = section[info["file"]]
+
+    add_record(info, section)
+
+  if verbose:
+    print "Ignored files"
+    print "============="
+    print "\n".join(sorted(ignored))
+  return data
+
-if __name__ == '__main__':
+if __name__ == "__main__":
   setupStderr()
 
-  geo = GeoIP.open(get_config().get('logs', 'geoip_db'), GeoIP.GEOIP_MEMORY_CACHE)
-  result = parseStdIn(geo)
-
-  file = open(get_config().get('subscriptionStats', 'tempFile'), 'wb')
-  result.write(file)
+  verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose")
+  geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE)
+  result = parse_stdin(geo, verbose)
+
+  with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file:
+    simplejson.dump(result, file, indent=2, sort_keys=True)
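For context (not part of the patch): the dump written here is the nested dictionary returned by parse_stdin, keyed by month and then by file name, so downstream scripts can load it back the same way it was written. A sketch, with paths depending on the sitescripts configuration:

    with codecs.open(get_config().get("stats", "tempFile"), "rb", encoding="utf-8") as file:
      data = simplejson.load(file)
    # e.g. data["201308"]["easylist.txt"]["hits"]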