| Left: | ||
| Right: |
| OLD | NEW |
|---|---|
| 1 # coding: utf-8 | 1 # coding: utf-8 |
| 2 | 2 |
| 3 # This file is part of the Adblock Plus web scripts, | 3 # This file is part of the Adblock Plus web scripts, |
| 4 # Copyright (C) 2006-2013 Eyeo GmbH | 4 # Copyright (C) 2006-2013 Eyeo GmbH |
| 5 # | 5 # |
| 6 # Adblock Plus is free software: you can redistribute it and/or modify | 6 # Adblock Plus is free software: you can redistribute it and/or modify |
| 7 # it under the terms of the GNU General Public License version 3 as | 7 # it under the terms of the GNU General Public License version 3 as |
| 8 # published by the Free Software Foundation. | 8 # published by the Free Software Foundation. |
| 9 # | 9 # |
| 10 # Adblock Plus is distributed in the hope that it will be useful, | 10 # Adblock Plus is distributed in the hope that it will be useful, |
| 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 13 # GNU General Public License for more details. | 13 # GNU General Public License for more details. |
| 14 # | 14 # |
| 15 # You should have received a copy of the GNU General Public License | 15 # You should have received a copy of the GNU General Public License |
| 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. | 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| 17 | 17 |
| 18 import sys, re, math, GeoIP | 18 import os, sys, codecs, re, math, GeoIP, urllib, urlparse, socket, simplejson |
| 19 import sitescripts.stats.common as common | |
| 19 from sitescripts.utils import get_config, setupStderr | 20 from sitescripts.utils import get_config, setupStderr |
| 20 from datetime import datetime, timedelta | 21 from datetime import datetime, timedelta |
| 21 from ConfigParser import SafeConfigParser, NoOptionError | 22 |
| 22 | 23 log_regexp = None |
| 23 def parseUA(ua): | 24 mirror_name = None |
| 25 gecko_apps = None | |
| 26 | |
| 27 def parse_ua(ua): | |
| 24 # Opera might disguise itself as other browser so it needs to go first | 28 # Opera might disguise itself as other browser so it needs to go first |
| 25 match = re.search(r'\bOpera/([\d\.]+)', ua) | 29 match = re.search(r"\bOpera/([\d\.]+)", ua) |
| 26 if match: | 30 if match: |
| 27 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA | 31 # Opera 10+ declares itself as Opera 9.80 but adds Version/1x.x to the UA |
| 28 match2 = re.search(r'\bVersion/([\d\.]+)', ua) | 32 match2 = re.search(r"\bVersion/([\d\.]+)", ua) |
| 29 if match2: | 33 if match2: |
| 30 return 'Opera %s' % match2.group(1) | 34 return "Opera", match2.group(1) |
| 31 else: | 35 else: |
| 32 return 'Opera %s' % match.group(1) | 36 return "Opera", match.group(1) |
| 33 | 37 |
| 34 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it | 38 # Opera 15+ has the same UA as Chrome but adds OPR/1x.x to it |
| 35 match = re.search(r'\bOPR/(\d+\.\d+)', ua) | 39 match = re.search(r"\bOPR/(\d+\.\d+)", ua) |
| 36 if match: | 40 if match: |
| 37 return 'Opera %s' % match.group(1) | 41 return "Opera", match.group(1) |
| 38 | 42 |
| 39 for appName in ('Fennec', 'Thunderbird', 'SeaMonkey', 'Songbird', 'K-Meleon', 'Prism', 'Firefox'): | 43 for appName in ("Fennec", "Thunderbird", "SeaMonkey", "Songbird", "K-Meleon", "Prism", "Firefox"): |
| 40 match = re.search(r'\b%s/(\d+\.\d+)' % appName, ua) | 44 match = re.search(r"\b%s/(\d+\.\d+)" % appName, ua) |
|
Sebastian Noack
2013/08/26 16:05:22
Instead of iterating over the list of browsers and
Wladimir Palant
2013/08/27 07:34:28
This was done like that intentionally - quite a fe
| |
| 41 if match: | 45 if match: |
| 42 return '%s %s' % (appName, match.group(1)) | 46 if appName == "Fennec" or (appName == "Firefox" and re.search(r"\bMobile;", ua)): |
| 43 | 47 return "Firefox Mobile", match.group(1) |
| 44 match = re.search(r'\brv:(\d+)\.(\d+)(?:\.(\d+))?', ua) | 48 else: |
| 45 if match and re.search(r'\bGecko/', ua): | 49 return appName, match.group(1) |
| 50 | |
| 51 match = re.search(r"\brv:(\d+)\.(\d+)(?:\.(\d+))?", ua) | |
| 52 if match and re.search(r"\bGecko/", ua): | |
| 46 if match.group(3) and int(match.group(1)) < 2: | 53 if match.group(3) and int(match.group(1)) < 2: |
| 47 return 'Gecko %s.%s.%s' % (match.group(1), match.group(2), match.group(3)) | 54 return "Gecko", "%s.%s.%s" % (match.group(1), match.group(2), match.group(3)) |
|
Sebastian Noack
2013/08/26 16:05:22
You could just call match.groups(), which alr
Wladimir Palant
2013/08/27 07:34:28
I think I rather keep this as is for consistency w
| |
| 48 else: | 55 else: |
| 49 return 'Gecko %s.%s' % (match.group(1), match.group(2)) | 56 return "Gecko", "%s.%s" % (match.group(1), match.group(2)) |
| 50 | 57 |
| 51 match = re.search(r'\bChrome/(\d+\.\d+)', ua) | 58 match = re.search(r"\bCoolNovo/(\d+\.\d+\.\d+)", ua) |
| 52 if match: | 59 if match: |
| 53 return 'Chrome %s' % match.group(1) | 60 return "CoolNovo", match.group(1) |
| 54 | 61 |
| 55 match = re.search(r'\bVersion/(\d+\.\d+)', ua) | 62 match = re.search(r"\bChrome/(\d+\.\d+)", ua) |
| 56 if match and re.search(r'\Safari/', ua): | 63 if match: |
| 57 return 'Safari %s' % match.group(1) | 64 return "Chrome", match.group(1) |
| 58 | 65 |
| 59 if re.search(r'\bAppleWebKit/', ua): | 66 match = re.search(r"\bVersion/(\d+\.\d+)", ua) |
| 60 return 'WebKit' | 67 if match and re.search(r"\bMobile Safari/", ua): |
| 61 | 68 return "Mobile Safari", match.group(1) |
| 62 match = re.search(r'\bMSIE (\d+\.\d+)', ua) | 69 if match and re.search(r"\bSafari/", ua): |
| 63 if match: | 70 return "Safari", match.group(1) |
| 64 return 'MSIE %s' % match.group(1) | 71 |
| 65 | 72 if re.search(r"\bAppleWebKit/", ua): |
| 66 return 'Other' | 73 return "WebKit", "" |
| 67 | 74 |
| 68 def parseStdIn(geo): | 75 match = re.search(r"\bMSIE (\d+\.\d+)", ua) |
| 69 if get_config().has_option('logs', 'subscriptionsSubdir'): | 76 if match: |
| 70 subdir = get_config().get('logs', 'subscriptionsSubdir') | 77 return "MSIE", match.group(1) |
| 71 subdir = re.sub(r'^/+', '', subdir) | 78 |
| 72 subdir = re.sub(r'/+$', '', subdir) | 79 match = re.search(r"\bTrident/(\d+\.\d+)", ua) |
| 73 subdir = re.sub(r'(?=\W)', r'\\', subdir) | 80 if match: |
| 74 subdir = subdir + '/' | 81 return "Trident", match.group(1) |
| 82 | |
| 83 match = re.search(r"\bAndroidDownloadManager(?:/(\d+\.\d+))?", ua) | |
| 84 if match: | |
| 85 return "Android", match.group(1) or "" | |
| 86 | |
| 87 match = re.search(r"\bDalvik/.*\bAndroid (\d+\.\d+)", ua) | |
| 88 if match: | |
| 89 return "Android", match.group(1) | |
| 90 | |
| 91 # ABP/Android downloads use that user agent | |
| 92 if ua.startswith("Apache-HttpClient/UNAVAILABLE"): | |
| 93 return "Android", "" | |
| 94 | |
| 95 # ABP/IE downloads use that user agent | |
| 96 if ua == "Adblock Plus": | |
| 97 return "ABP", "" | |
| 98 | |
| 99 return "Other", "" | |
| 100 | |
| 101 def process_ip(ip, geo): | |
| 102 match = re.search(r"^::ffff:(\d+\.\d+\.\d+\.\d+)$", ip) | |
| 103 if match: | |
| 104 ip = match.group(1) | |
| 105 | |
| 106 country = geo.country_code_by_addr(ip) | |
| 107 if country in (None, "", "--"): | |
| 108 country = "unknown" | |
| 109 country = country.lower() | |
| 110 | |
| 111 return ip, country | |
| 112 | |
| 113 def parse_time(timestr, tz_hours, tz_minutes): | |
| 114 result = datetime.strptime(timestr, "%d/%b/%Y:%H:%M:%S") | |
| 115 result -= timedelta(hours = tz_hours, minutes = math.copysign(tz_minutes, tz_hours)) | |
| 116 return result, result.strftime("%Y%m"), result.day, result.weekday(), result.hour | |
| 117 | |
| 118 def parse_path(path): | |
| 119 urlparts = urlparse.urlparse(path) | |
| 120 try: | |
| 121 path = urllib.unquote(urlparts.path).decode("utf-8") | |
| 122 except: | |
| 123 path = urlparts.path | |
| 124 return path[1:], urlparts.query | |
| 125 | |
| 126 def parse_downloader_query(info): | |
| 127 params = urlparse.parse_qs(info["query"]) | |
| 128 for param in ("addonName", "addonVersion", "application", "applicationVersion" , "platform", "platformVersion"): | |
| 129 info[param] = params.get(param, ["unknown"])[0] | |
| 130 | |
| 131 # Only leave the major and minor release number for application and platform | |
| 132 info["applicationVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["applicationVersion"]) | |
| 133 info["platformVersion"] = re.sub(r"^(\d+\.\d+).*", r"\1", info["platformVersion"]) | |
| 134 | |
| 135 # Chrome Adblock sends an X-Client-ID header instead of URL parameters | |
| 136 match = re.match(r"^adblock/([\d\.]+)$", info["clientid"], re.I) if info["clientid"] else None | |
| 137 if match: | |
| 138 info["addonName"] = "chromeadblock" | |
| 139 info["addonVersion"] = match.group(1) | |
| 140 | |
| 141 last_version = params.get("lastVersion", ["unknown"])[0] | |
| 142 if info["file"] == "notification.json" and last_version == "0" and ( | |
| 143 (info["addonName"] == "adblockplus" and info["addonVersion"] == "2.3.1") or | |
| 144 (info["addonName"] in ("adblockpluschrome", "adblockplusopera") and info["addonVersion"] == "1.5.2") | |
| 145 ): | |
| 146 # Broken notification version number in these releases, treat like unknown | |
| 147 last_version = "unknown" | |
| 148 | |
| 149 if last_version == "unknown": | |
| 150 info["downloadInterval"] = "unknown" | |
| 151 elif last_version == "0": | |
| 152 info["downloadInterval"] = "unknown" | |
| 153 info["firstDownload"] = info["firstInMonth"] = info["firstInWeek"] = info["firstInDay"] = True | |
| 75 else: | 154 else: |
| 76 subdir = '' | 155 try: |
| 77 regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET (?:\w+://[^/]+)?/%s([\w\-\+\.]+\.(?:txt|tpl))(?:\?[^\s"]*)? [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"' % subdir) | 156 last_update = datetime.strptime(last_version, "%Y%m%d%H%M") |
| 78 | 157 diff = info["time"] - last_update |
| 158 if diff.days >= 365: | |
| 159 info["downloadInterval"] = "%i year(s)" % (diff.days / 365) | |
| 160 elif diff.days >= 30: | |
| 161 info["downloadInterval"] = "%i month(s)" % (diff.days / 30) | |
| 162 elif diff.days >= 1: | |
| 163 info["downloadInterval"] = "%i day(s)" % diff.days | |
| 164 else: | |
| 165 info["downloadInterval"] = "%i hour(s)" % (diff.seconds / 3600) | |
| 166 | |
| 167 if last_update.year != info["time"].year or last_update.month != info["time"].month: | |
| 168 info["firstInMonth"] = info["firstInDay"] = True | |
| 169 elif last_update.day != info["time"].day: | |
| 170 info["firstInDay"] = True | |
| 171 | |
| 172 if last_update.isocalendar()[0:2] != info["time"].isocalendar()[0:2]: | |
| 173 info["firstInWeek"] = True | |
| 174 except ValueError: | |
| 175 info["downloadInterval"] = "unknown" | |
| 176 pass | |
| 177 | |
| 178 def parse_addon_name(file): | |
| 179 if "/" in file: | |
| 180 return file.split("/")[-2] | |
| 181 else: | |
| 182 return None | |
| 183 | |
| 184 def parse_gecko_query(query): | |
| 185 params = urlparse.parse_qs(query) | |
| 186 | |
| 187 version = params.get("version", ["unknown"])[0] | |
| 188 | |
| 189 global gecko_apps | |
| 190 if gecko_apps == None: | |
| 191 from buildtools.packagerGecko import KNOWN_APPS | |
| 192 gecko_apps = {v: k for k, v in KNOWN_APPS.iteritems()} | |
| 193 appID = params.get("appID", ["unknown"])[0] | |
| 194 | |
| 195 application = gecko_apps.get(appID, "unknown") | |
| 196 applicationVersion = params.get("appVersion", ["unknown"])[0] | |
| 197 | |
| 198 # Only leave the major and minor release number for application | |
| 199 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) | |
| 200 | |
| 201 return version, application, applicationVersion | |
| 202 | |
| 203 def parse_chrome_query(query): | |
| 204 params = urlparse.parse_qs(query) | |
| 205 | |
| 206 if params.get("prod", ["unknown"])[0] in ("chromecrx", "chromiumcrx"): | |
| 207 application = "chrome" | |
| 208 else: | |
| 209 application = "unknown" | |
| 210 applicationVersion = params.get("prodversion", ["unknown"])[0] | |
| 211 | |
| 212 params2 = urlparse.parse_qs(params.get("x", [""])[0]) | |
| 213 version = params2.get("v", ["unknown"])[0] | |
| 214 | |
| 215 # Only leave the major and minor release number for application | |
| 216 applicationVersion = re.sub(r"^(\d+\.\d+).*", r"\1", applicationVersion) | |
| 217 | |
| 218 return version, application, applicationVersion | |
| 219 | |
| 220 def parse_update_flag(query): | |
| 221 return "update" if query == "update" else "install" | |
| 222 | |
| 223 def parse_record(line, ignored, geo): | |
| 224 global log_regexp, mirror_name | |
| 225 if log_regexp == None: | |
| 226 log_regexp = re.compile(r'(\S+) \S+ \S+ \[([^]\s]+) ([+\-]\d\d)(\d\d)\] "GET ([^"\s]+) [^"]+" (\d+) (\d+) "[^"]*" "([^"]*)"(?: "[^"]*" \S+ "[^"]*" "[^"]*" "([^"]*)")?') | |
| 227 if mirror_name == None: | |
| 228 mirror_name = re.sub(r"\..*", "", socket.gethostname()) | |
| 229 | |
| 230 match = re.search(log_regexp, line) | |
| 231 if not match: | |
| 232 return None | |
| 233 | |
| 234 status = int(match.group(6)) | |
| 235 if status != 200: | |
| 236 return None | |
| 237 | |
| 238 info = { | |
| 239 "mirror": mirror_name, | |
| 240 "size": int(match.group(7)), | |
| 241 } | |
| 242 | |
| 243 info["ip"], info["country"] = process_ip(match.group(1), geo) | |
| 244 info["time"], info["month"], info["day"], info["weekday"], info["hour"] = parse_time(match.group(2), int(match.group(3)), int(match.group(4))) | |
| 245 info["file"], info["query"] = parse_path(match.group(5)) | |
| 246 info["ua"], info["uaversion"] = parse_ua(match.group(8)) | |
| 247 info["fullua"] = "%s %s" % (info["ua"], info["uaversion"]) | |
| 248 info["clientid"] = match.group(9) | |
| 249 | |
| 250 # Additional metadata depends on file type | |
| 251 filename = os.path.basename(info["file"]) | |
| 252 ext = os.path.splitext(filename)[1] | |
| 253 if ext == ".txt" or filename == "update.json" or filename == "notification.json": | |
| 254 # Subscription downloads, libadblockplus update checks and notification | |
| 255 # checks are performed by the downloader | |
| 256 parse_downloader_query(info) | |
| 257 elif ext == ".tpl": | |
| 258 # MSIE TPL download, no additional data here | |
| 259 pass | |
| 260 elif ext in (".xpi", ".crx", ".apk", ".msi", ".exe"): | |
| 261 # Package download, might be an update | |
| 262 info["installType"] = parse_update_flag(info["query"]) | |
| 263 elif filename == "update.rdf": | |
| 264 # Gecko update check or a legacy Android update check. The latter doesn't | |
| 265 # have usable data anyway so trying the Chrome route won't do any harm. | |
| 266 info["addonName"] = parse_addon_name(info["file"]) | |
| 267 info["addonVersion"], info["application"], info["applicationVersion"] = parse_gecko_query(info["query"]) | |
| 268 elif filename == "updates.xml": | |
| 269 # Chrome update check | |
| 270 info["addonName"] = parse_addon_name(info["file"]) | |
| 271 info["addonVersion"], info["application"], info["applicationVersion"] = parse_chrome_query(info["query"]) | |
| 272 else: | |
| 273 ignored.add(info["file"]) | |
| 274 return None | |
| 275 | |
| 276 if "addonName" in info: | |
| 277 info["fullAddon"] = "%s %s" % (info["addonName"], info["addonVersion"]) | |
| 278 if "application" in info: | |
| 279 info["fullApplication"] = "%s %s" % (info["application"], info["applicationVersion"]) | |
| 280 if "platform" in info: | |
| 281 info["fullPlatform"] = "%s %s" % (info["platform"], info["platformVersion"]) | |
| 282 return info | |
| 283 | |
| 284 def add_record(info, section, ignore_fields=()): | |
| 285 section["hits"] = section.get("hits", 0) + 1 | |
| 286 section["bandwidth"] = section.get("bandwidth", 0) + info["size"] | |
| 287 | |
| 288 if len(ignore_fields) < 2: | |
| 289 for field in map(lambda f: f["name"], common.fields): | |
| 290 if field in ignore_fields or field not in info: | |
| 291 continue | |
| 292 | |
| 293 value = info[field] | |
| 294 if field not in section: | |
| 295 section[field] = {} | |
| 296 if value not in section[field]: | |
| 297 section[field][value] = {} | |
| 298 | |
| 299 add_record(info, section[field][value], ignore_fields + (field,)) | |
| 300 | |
| 301 def parse_stdin(geo, verbose): | |
| 79 data = {} | 302 data = {} |
| 303 ignored = set() | |
| 80 for line in sys.stdin: | 304 for line in sys.stdin: |
| 81 match = re.search(regexp, line) | 305 info = parse_record(line, ignored, geo) |
| 82 if not match: | 306 if info == None: |
| 83 continue | 307 continue |
| 84 | 308 |
| 85 ip, time, tzHours, tzMinutes = match.group(1), match.group(2), int(match.gro up(3)), int(match.group(4)) | 309 if info["month"] not in data: |
| 86 file, status, size, ua = match.group(5), int(match.group(6)), int(match.grou p(7)), match.group(8) | 310 data[info["month"]] = {} |
| 87 if status != 200 and status != 302 and status != 304: | 311 section = data[info["month"]] |
| 88 continue | 312 |
| 89 if file.startswith('robots.'): | 313 if info["file"] not in section: |
| 90 continue | 314 section[info["file"]] = {} |
| 91 | 315 section = section[info["file"]] |
| 92 time = datetime.strptime(time, '%d/%b/%Y:%H:%M:%S') | 316 |
| 93 time -= timedelta(hours = tzHours, minutes = math.copysign(tzMinutes, tzHour s)) | 317 add_record(info, section) |
| 94 | 318 |
| 95 match = re.search(r'^::ffff:(\d+\.\d+\.\d+\.\d+)$', ip) | 319 if verbose: |
| 96 if match: | 320 print "Ignored files" |
| 97 ip = match.group(1) | 321 print "=============" |
| 98 country = geo.country_code_by_addr(ip) | 322 print "\n".join(sorted(ignored)) |
| 99 if country == '' or country == '--': | 323 return data |
| 100 country = 'unknown' | 324 |
| 101 | 325 if __name__ == "__main__": |
| 102 ua = parseUA(ua)[:20] | |
| 103 | |
| 104 section = time.strftime('%Y%m') | |
| 105 if not section in data: | |
| 106 data[section] = {} | |
| 107 | |
| 108 def addResultInt(key, value): | |
| 109 if key in data[section]: | |
| 110 data[section][key] += value | |
| 111 else: | |
| 112 data[section][key] = value | |
| 113 | |
| 114 addResultInt('%s hits' % file, 1) | |
| 115 addResultInt('%s bandwidth' % file, size) | |
| 116 addResultInt('%s hits day %i' % (file, time.day), 1) | |
| 117 addResultInt('%s bandwidth day %i' % (file, time.day), size) | |
| 118 addResultInt('%s hits hour %i' % (file, time.hour), 1) | |
| 119 addResultInt('%s bandwidth hour %i' % (file, time.hour), size) | |
| 120 addResultInt('%s hits country %s' % (file, country), 1) | |
| 121 addResultInt('%s bandwidth country %s' % (file, country), size) | |
| 122 addResultInt('%s hits app %s' % (file, ua), 1) | |
| 123 addResultInt('%s bandwidth app %s' % (file, ua), size) | |
| 124 | |
| 125 result = SafeConfigParser() | |
| 126 for section in data.iterkeys(): | |
| 127 result.add_section(section) | |
| 128 for key, value in data[section].iteritems(): | |
| 129 result.set(section, key, str(value)) | |
| 130 return result | |
| 131 | |
| 132 if __name__ == '__main__': | |
| 133 setupStderr() | 326 setupStderr() |
| 134 | 327 |
| 135 geo = GeoIP.open(get_config().get('logs', 'geoip_db'), GeoIP.GEOIP_MEMORY_CACHE) | 328 verbose = (len(sys.argv) >= 2 and sys.argv[1] == "verbose") |
| 136 result = parseStdIn(geo) | 329 geo = GeoIP.open(get_config().get("stats", "geoip_db"), GeoIP.GEOIP_MEMORY_CACHE) |
| 137 | 330 result = parse_stdin(geo, verbose) |
| 138 file = open(get_config().get('subscriptionStats', 'tempFile'), 'wb') | 331 |
| 139 result.write(file) | 332 with codecs.open(get_config().get("stats", "tempFile"), "wb", encoding="utf-8" ) as file: |
| 333 simplejson.dump(result, file, indent=2, sort_keys=True) | |
| OLD | NEW |