Rietveld Code Review Tool

Diff: sitescripts/stats/bin/logprocessor.py

Issue 11481051: Update stats processing (Closed)
Patch Set: String concatenation fix (created Aug. 27, 2013, 12:41 p.m.)
 # coding: utf-8
 
 # This file is part of the Adblock Plus web scripts,
 # Copyright (C) 2006-2013 Eyeo GmbH
 #
 # Adblock Plus is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 3 as
 # published by the Free Software Foundation.
 #
 # Adblock Plus is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
 
-import sys, re, math, GeoIP
+import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson
+import sitescripts.stats.common as common
 from sitescripts.utils import get_config, setupStderr
 from datetime import datetime, timedelta
-from ConfigParser import SafeConfigParser, NoOptionError
-
-def parseUA(ua):
+
+log_regexp = None
+mirror_name = None
+gecko_apps = None
+
+def parse_ua(ua):
   # Opera might disguise itself as other browser so it needs to go first
-  match = re.search(r'\bOpera/([\d\.]+)', ua)
+  match = re.search(r"\bOpera/([\d\.]+)", ua)
Felix Dahlke 2013/08/28 17:25:32: Memoization helped save around 5% here. In an acce…
Sebastian Noack 2013/08/29 10:54:30: Keys in python dicts are already unique. So when y…
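Felix's 5% figure refers to caching the result of user-agent parsing for strings that repeat throughout the log. The patch set shown here does not contain such a cache, so the snippet below is only a sketch of that kind of memoization; the helper name and module-level dict are made up for illustration.

# Sketch only, not part of this patch: access logs repeat the same
# user-agent string many times, so caching the parsed (name, version)
# tuple avoids re-running all the regular expressions in parse_ua().
_ua_cache = {}

def parse_ua_cached(ua):
  if ua not in _ua_cache:
    _ua_cache[ua] = parse_ua(ua)
  return _ua_cache[ua]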
   if match:
     # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA
-    match2 = re.search(r'\bVersion/([\d\.]+)', ua)
+    match2 = re.search(r"\bVersion/([\d\.]+)", ua)
     if match2:
-      return 'Opera %s' % match2.group(1)
+      return "Opera", match2.group(1)
     else:
-      return 'Opera %s' % match.group(1)
+      return "Opera", match.group(1)
 
   # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it
-  match = re.search(r'\bOPR/(\d+\.\d+)', ua)
+  match = re.search(r"\bOPR/(\d+\.\d+)", ua)
   if match:
-    return 'Opera %s' % match.group(1)
+    return "Opera", match.group(1)
 
-  for appName in ('Fennec', 'Thunderbird', 'SeaMonkey', 'Songbird', 'K-Meleon', 'Prism', 'Firefox'):
-    match = re.search(r'\b%s/(\d+\.\d+)' % appName, ua)
-    if match:
-      return '%s %s' % (appName, match.group(1))
-
-  match = re.search(r'\brv:(\d+)\.(\d+)(?:\.(\d+))?', ua)
-  if match and re.search(r'\bGecko/', ua):
+  # Have to check for these before Firefox, they will usually have a Firefox identifier as well
+  match = re.search(r"\b(Fennec|Thunderbird|SeaMonkey|Songbird|K-Meleon|Prism)/(\d+\.\d+)", ua)
+  if match:
+    if match.group(1) == "Fennec":
+      return "Firefox Mobile", match.group(2)
+    else:
+      return match.group(1), match.group(2)
+
+  match = re.search(r"\bFirefox/(\d+\.\d+)", ua)
+  if match:
+    if re.search(r"\bMobile;", ua):
+      return "Firefox Mobile", match.group(1)
+    else:
+      return "Firefox", match.group(1)
+
+  match = re.search(r"\brv:(\d+)\.(\d+)(?:\.(\d+))?", ua)
+  if match and re.search(r"\bGecko/", ua):
     if match.group(3) and int(match.group(1)) < 2:
-      return 'Gecko %s.%s.%s' % (match.group(1), match.group(2), match.group(3))
+      return "Gecko", "%s.%s.%s" % (match.group(1), match.group(2), match.group(3))
     else:
-      return 'Gecko %s.%s' % (match.group(1), match.group(2))
+      return "Gecko", "%s.%s" % (match.group(1), match.group(2))
 
-  match = re.search(r'\bChrome/(\d+\.\d+)', ua)
-  if match:
-    return 'Chrome %s' % match.group(1)
-
-  match = re.search(r'\bVersion/(\d+\.\d+)', ua)
-  if match and re.search(r'\Safari/', ua):
-    return 'Safari %s' % match.group(1)
-
-  if re.search(r'\bAppleWebKit/', ua):
-    return 'WebKit'
-
-  match = re.search(r'\bMSIE (\d+\.\d+)', ua)
-  if match:
-    return 'MSIE %s' % match.group(1)
-
-  return 'Other'
-
-def parseStdIn(geo):
-  if get_config().has_option('logs', 'subscriptionsSubdir'):
-    subdir = get_config().get('logs', 'subscriptionsSubdir')
-    subdir = re.sub(r'^/+', '', subdir)
-    subdir = re.sub(r'/+$', '', subdir)
-    subdir = re.sub(r'(?=\W)', r'\\', subdir)
-    subdir = subdir + '/'
+  match = re.search(r"\bCoolNovo/(\d+\.\d+\.\d+)", ua)
+  if match:
+    return "CoolNovo", match.group(1)
+
+  match = re.search(r"\bChrome/(\d+\.\d+)", ua)
+  if match:
+    return "Chrome", match.group(1)
+
+  match = re.search(r"\bVersion/(\d+\.\d+)", ua)
+  if match and re.search(r"\bMobile Safari/", ua):
+    return "Mobile Safari", match.group(1)
+  if match and re.search(r"\bSafari/", ua):
+    return "Safari", match.group(1)
+
+  if re.search(r"\bAppleWebKit/", ua):
+    return "WebKit", ""
+
+  match = re.search(r"\bMSIE (\d+\.\d+)", ua)
+  if match:
+    return "MSIE", match.group(1)
+
+  match = re.search(r"\bTrident/(\d+\.\d+)", ua)
+  if match:
+    return "Trident", match.group(1)
+
+  match = re.search(r"\bAndroidDownloadManager(?:/(\d+\.\d+))?", ua)
+  if match:
+    return "Android", match.group(1) or ""
+
+  match = re.search(r"\bDalvik/.*\bAndroid (\d+\.\d+)", ua)
+  if match:
+    return "Android", match.group(1)
+
+  # ABP/Android downloads use that user agent
+  if ua.startswith("Apache-HttpClient/UNAVAILABLE"):
+    return "Android", ""
+
+  # ABP/IE downloads use that user agent
+  if ua == "Adblock Plus":
+    return "ABP", ""
+
+  return "Other", ""
+
+def process_ip(ip, geo):
+  match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip)
+  if match:
+    ip = match.group(1)
+
+  country = geo.country_code_by_addr(ip)
+  if country in (None, "", "--"):
+    country = "unknown"
+  country = country.lower()
+
+  return ip, country
+
+def parse_time(timestr, tz_hours, tz_minutes):
+  result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S")
+  result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_hours))
+  return result, result.strftime("%Y%m"), result.day, result.weekday(), result.hour
+
+def parse_path(path):
+  urlparts = urlparse.urlparse(path)
+  try:
+    path = urllib.unquote(urlparts.path).decode("utf-8")
+  except:
+    path = urlparts.path
+  return path[1:], urlparts.query
+
+def parse_downloader_query(info):
+  params = urlparse.parse_qs(info["query"])
+  for param in ("addonName", "addonVersion", "application", "applicationVersion", "platform", "platformVersion"):
+    info[param] = params.get(param, ["unknown"])[0]
+
+  # Only leave the major and minor release number for application and platform
+  info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["applicationVersion"])
+  info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVersion"])
+
+  # Chrome Adblock sends an X-Client-ID header insteads of URL parameters
+  match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clientid"] else None
+  if match:
+    info["addonName"] = "chromeadblock"
+    info["addonVersion"] = match.group(1)
+
+  last_version = params.get("lastVersion", ["unknown"])[0]
+  if info["file"] == "notification.json" and last_version == "0" and (
+      (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1") or
+      (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info["addonVersion"] == "1.5.2")
+  ):
+    # Broken notification version number in these releases, treat like unknown
+    last_version = "unknown"
+
+  if last_version == "unknown":
+    info["downloadInterval"] = "unknown"
+  elif last_version == "0":
+    info["downloadInterval"] = "unknown"
+    info["firstDownload"] = info["firstInMonth"] = info["firstInWeek"] = info["firstInDay"] = True
-  else:
-    subdir = ''
-  regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET (?:\w+://[^/]+)?/%s([\w\-\+\.]+\.(?:txt|tpl))(?:\?[^\s"]*)? [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"' % subdir)
-
+  else:
+    try:
+      last_update = datetime.strptime(last_version, "%Y%m%d%H%M")
+      diff = info["time"] - last_update
+      if diff.days >= 365:
+        info["downloadInterval"] = "%i year(s)" % (diff.days / 365)
+      elif diff.days >= 30:
+        info["downloadInterval"] = "%i month(s)" % (diff.days / 30)
+      elif diff.days >= 1:
+        info["downloadInterval"] = "%i day(s)" % diff.days
+      else:
+        info["downloadInterval"] = "%i hour(s)" % (diff.seconds / 3600)
+
+      if last_update.year != info["time"].year or last_update.month != info["time"].month:
+        info["firstInMonth"] = info["firstInDay"] = True
+      elif last_update.day != info["time"].day:
+        info["firstInDay"] = True
+
+      if last_update.isocalendar()[0:2] != info["time"].isocalendar()[0:2]:
+        info["firstInWeek"] = True
+    except ValueError:
+      info["downloadInterval"] = "unknown"
+      pass
+
+def parse_addon_name(file):
+  if "/" in file:
+    return file.split("/")[-2]
+  else:
+    return None
+
+def parse_gecko_query(query):
+  params = urlparse.parse_qs(query)
+
+  version = params.get("version", ["unknown"])[0]
+
+  global gecko_apps
+  if gecko_apps == None:
+    from buildtools.packagerGecko import KNOWN_APPS
+    gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()}
+  appID = params.get("appID", ["unknown"])[0]
+
+  application = gecko_apps.get(appID, "unknown")
+  applicationVersion = params.get("appVersion", ["unknown"])[0]
+
+  # Only leave the major and minor release number for application
+  applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion)
+
+  return version, application, applicationVersion
+
+def parse_chrome_query(query):
+  params = urlparse.parse_qs(query)
+
+  if params.get("prod", ["unknown"])[0] in ("chromecrx", "chromiumcrx"):
+    application = "chrome"
+  else:
+    application = "unknown"
+  applicationVersion = params.get("prodversion", ["unknown"])[0]
+
+  params2 = urlparse.parse_qs(params.get("x", [""])[0])
+  version = params2.get("v", ["unknown"])[0]
+
+  # Only leave the major and minor release number for application
+  applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion)
+
+  return version, application, applicationVersion
+
+def parse_update_flag(query):
+  return "update" if query == "update" else "install"
+
+def parse_record(line, ignored, geo):
+  global log_regexp, mirror_name
+  if log_regexp == None:
+    log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?')
+  if mirror_name == None:
+    mirror_name = re.sub(r"\..*", "", socket.gethostname())
+
+  match = re.search(log_regexp, line)
+  if not match:
+    return None
+
+  status = int(match.group(6))
+  if status != 200:
+    return None
+
+  info = {
+    "mirror": mirror_name,
+    "size": int(match.group(7)),
+  }
+
+  info["ip"], info["country"] = process_ip(match.group(1), geo)
+  info["time"], info["month"], info["day"], info["weekday"], info["hour"] = parse_time(match.group(2), int(match.group(3)), int(match.group(4)))
+  info["file"], info["query"] = parse_path(match.group(5))
+  info["ua"], info["uaversion"] = parse_ua(match.group(8))
+  info["fullua"] = "%s %s" % (info["ua"], info["uaversion"])
+  info["clientid"] = match.group(9)
+
+  # Additional metadata depends on file type
+  filename = os.path.basename(info["file"])
+  ext = os.path.splitext(filename)[1]
+  if ext == ".txt" or filename == "update.json" or filename == "notification.json":
+    # Subscription downloads, libadblockplus update checks and notification
+    # checks are performed by the downloader
+    parse_downloader_query(info)
+  elif ext == ".tpl":
+    # MSIE TPL download, no additional data here
+    pass
+  elif ext in (".xpi", ".crx", ".apk", ".msi", ".exe"):
+    # Package download, might be an update
+    info["installType"] = parse_update_flag(info["query"])
+  elif filename == "update.rdf":
+    # Gecko update check or a legacy Android update check. The latter doesn't
+    # have usable data anyway so trying the Chrome route won't do any harm.
+    info["addonName"] = parse_addon_name(info["file"])
+    info["addonVersion"], info["application"], info["applicationVersion"] = parse_gecko_query(info["query"])
+  elif filename == "updates.xml":
+    # Chrome update check
+    info["addonName"] = parse_addon_name(info["file"])
+    info["addonVersion"], info["application"], info["applicationVersion"] = parse_chrome_query(info["query"])
+  else:
+    ignored.add(info["file"])
+    return None
+
+  if "addonName" in info:
+    info["fullAddon"] = "%s %s" % (info["addonName"], info["addonVersion"])
+  if "application" in info:
+    info["fullApplication"] = "%s %s" % (info["application"], info["applicationVersion"])
+  if "platform" in info:
+    info["fullPlatform"] = "%s %s" % (info["platform"], info["platformVersion"])
+  return info
+
+def add_record(info, section, ignore_fields=()):
+  section["hits"] = section.get("hits", 0) + 1
+  section["bandwidth"] = section.get("bandwidth", 0) + info["size"]
+
+  if len(ignore_fields) < 2:
+    for field in map(lambda f: f["name"], common.fields):
+      if field in ignore_fields or field not in info:
+        continue
+
+      value = info[field]
+      if field not in section:
+        section[field] = {}
+      if value not in section[field]:
+        section[field][value] = {}
+
+      add_record(info, section[field][value], ignore_fields + (field,))
+
+def parse_stdin(geo, verbose):
   data = {}
+  ignored = set()
   for line in sys.stdin:
-    match = re.search(regexp, line)
-    if not match:
+    info = parse_record(line, ignored, geo)
+    if info == None:
       continue
 
-    ip, time, tzHours, tzMinutes = match.group(1), match.group(2), int(match.group(3)), int(match.group(4))
-    file, status, size, ua = match.group(5), int(match.group(6)), int(match.group(7)), match.group(8)
-    if status != 200 and status != 302 and status != 304:
-      continue
-    if file.startswith('robots.'):
-      continue
-
-    time = datetime.strptime(time, '%d/%b/%Y:%H:%M:%S')
-    time -= timedelta(hours = tzHours, minutes = math.copysign(tzMinutes, tzHours))
-
-    match = re.search(r'^::ffff:(\d+\.\d+\.\d+\.\d+)$', ip)
-    if match:
-      ip = match.group(1)
-    country = geo.country_code_by_addr(ip)
-    if country == '' or country == '--':
-      country = 'unknown'
-
-    ua = parseUA(ua)[:20]
-
-    section = time.strftime('%Y%m')
-    if not section in data:
-      data[section] = {}
-
-    def addResultInt(key, value):
-      if key in data[section]:
-        data[section][key] += value
-      else:
-        data[section][key] = value
-
-    addResultInt('%s hits' % file, 1)
-    addResultInt('%s bandwidth' % file, size)
-    addResultInt('%s hits day %i' % (file, time.day), 1)
-    addResultInt('%s bandwidth day %i' % (file, time.day), size)
-    addResultInt('%s hits hour %i' % (file, time.hour), 1)
-    addResultInt('%s bandwidth hour %i' % (file, time.hour), size)
-    addResultInt('%s hits country %s' % (file, country), 1)
-    addResultInt('%s bandwidth country %s' % (file, country), size)
-    addResultInt('%s hits app %s' % (file, ua), 1)
-    addResultInt('%s bandwidth app %s' % (file, ua), size)
-
-  result = SafeConfigParser()
-  for section in data.iterkeys():
-    result.add_section(section)
-    for key, value in data[section].iteritems():
-      result.set(section, key, str(value))
-  return result
-
+    if info["month"] not in data:
+      data[info["month"]] = {}
+    section = data[info["month"]]
+
+    if info["file"] not in section:
+      section[info["file"]] = {}
+    section = section[info["file"]]
+
+    add_record(info, section)
+
+  if verbose:
+    print "Ignored files"
+    print "============="
+    print "\n".join(sorted(ignored))
+  return data
+
-if __name__ == '__main__':
+if __name__ == "__main__":
   setupStderr()
 
-  geo = GeoIP.open(get_config().get('logs', 'geoip_db'), GeoIP.GEOIP_MEMORY_CACHE)
-  result = parseStdIn(geo)
-
-  file = open(get_config().get('subscriptionStats', 'tempFile'), 'wb')
-  result.write(file)
+  verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose")
+  geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE)
+  result = parse_stdin(geo, verbose)
+
+  with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8") as file:
+    simplejson.dump(result, file, indent=2, sort_keys=True)
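
As a quick orientation aid (not part of the patch): parse_ua() now returns a (name, version) tuple instead of the single "Name version" string that the old parseUA() produced. The user-agent strings below are made up for the example.

# Illustration only; outputs follow from the regular expressions above.
print parse_ua("Opera/9.80 (Windows NT 6.1) Presto/2.12.388 Version/12.16")
# -> ("Opera", "12.16")
print parse_ua("Mozilla/5.0 (Android; Mobile; rv:23.0) Gecko/23.0 Firefox/23.0")
# -> ("Firefox Mobile", "23.0")
print parse_ua("Adblock Plus")
# -> ("ABP", "")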
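The aggregation in add_record() nests per-field breakdowns two levels deep under each month and file, so the dictionary that parse_stdin() returns and simplejson.dump() writes out has roughly the shape sketched below. Which fields appear is controlled by sitescripts.stats.common.fields (not shown in this patch), so "ua" and "country" are only plausible examples and every number is invented.

# Hypothetical excerpt of the generated output, for orientation only.
{
  "201308": {                 # month, from parse_time()
    "exceptionrules.txt": {   # requested file, from parse_path()
      "hits": 12,
      "bandwidth": 34567,
      "ua": {                 # first-level breakdown by one field
        "Firefox": {
          "hits": 7,
          "bandwidth": 20000,
          "country": {        # second-level breakdown, then only counters
            "de": {"hits": 3, "bandwidth": 9000}
          }
        }
      }
    }
  }
}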