sitescripts/stats/bin/pagegenerator.py - Issue 11481051: Update stats processing

Side by Side Diff: sitescripts/stats/bin/pagegenerator.py

Issue 11481051: Update stats processing (Closed)

Patch Set: Fixed two presentation issues Created Aug. 24, 2013, 1:11 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # coding: utf-8	1 # coding: utf-8

2	2

3 # This file is part of the Adblock Plus web scripts,	3 # This file is part of the Adblock Plus web scripts,

4 # Copyright (C) 2006-2013 Eyeo GmbH	4 # Copyright (C) 2006-2013 Eyeo GmbH

5 #	5 #

6 # Adblock Plus is free software: you can redistribute it and/or modify	6 # Adblock Plus is free software: you can redistribute it and/or modify

7 # it under the terms of the GNU General Public License version 3 as	7 # it under the terms of the GNU General Public License version 3 as

8 # published by the Free Software Foundation.	8 # published by the Free Software Foundation.

9 #	9 #

10 # Adblock Plus is distributed in the hope that it will be useful,	10 # Adblock Plus is distributed in the hope that it will be useful,

11 # but WITHOUT ANY WARRANTY; without even the implied warranty of	11 # but WITHOUT ANY WARRANTY; without even the implied warranty of

12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

13 # GNU General Public License for more details.	13 # GNU General Public License for more details.

14 #	14 #

15 # You should have received a copy of the GNU General Public License	15 # You should have received a copy of the GNU General Public License

16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

17	17

18 import os, re, time	18 import os, re, codecs, simplejson, time, itertools

19 from datetime import date, timedelta	19 from datetime import date

20 from sitescripts.utils import get_config, setupStderr, get_template	20 from sitescripts.utils import get_config, setupStderr, get_custom_template_envir onment, cached

21 from sitescripts.logs.countryCodes import countryCodes	21 import sitescripts.stats.common as common

22 from ConfigParser import SafeConfigParser	22 from sitescripts.stats.countrycodes import countrycodes

23	23

24 def getSubscriptionFiles(data, month):	24 @cached(())
	Sebastian Noack 2013/08/26 16:05:22 You passed an empty tuple as the timeout. Obviously you want the timeout to last forever, but this isn't the right way to do it. It does work in at least recent versions of CPython, but that is just luck, since the Python language spec doesn't specify that a tuple is always larger than an int/float, as far as I know. So it might fail with other Python implementations or future versions of CPython. However, why don't you just use float('Inf')? Wladimir Palant 2013/08/27 07:34:28 This behavior is somewhat specified, as described here: http://stackoverflow.com/a/3270689/785541 Anyway, float("inf") was introduced in Python 2.6, but this code is older than that. I followed the advice to use an empty tuple because I liked it better than using an arbitrary number like 1e400. Fixed.
25 result = {}	25 def get_template_environment():

26 if data.has_section(month):	26 return get_custom_template_environment({

27 for option in data.options(month):	27 "monthname": lambda value: date(int(value[0:4]), int(value[4:]), 1).strftime ("%b %Y"),

28 result[option[0:option.index(' ')]] = True	28 "weekday": lambda value: ["Monday", "Tuesday", "Wednesday", "Thursday", "Fri day", "Saturday", "Sunday"][int(value)],

29 return result	29 "countryname": lambda value: countrycodes.get(value, "Unknown"),

	30 "sortfield": lambda value, field: (field["sort"] if "sort" in field else def ault_sort)(value),

	31 "maxhits": lambda items: max(itertools.chain((value["hits"] for key, value i n items), [1])),

	32 "maxbandwidth": lambda items: max(itertools.chain((value["bandwidth"] for ke y, value in items), [1])),

	33 "sumhits": lambda items: max(sum(value["hits"] for key, value in items), 1),

	34 "sumbandwidth": lambda items: max(sum(value["bandwidth"] for key, value in i tems), 1),

	35 "isspecial": lambda name, field: field["isspecial"](name) if "isspecial" in field else False,

	36 })

30	37

31 def generateMainPage(data, outputDir):	38 @cached(())

32 def getDataInt(month, key):	39 def get_main_page_template():

33 if data.has_option(month, key):	40 return get_template_environment().get_template(get_config().get("stats", "main PageTemplate"))

34 return data.getint(month, key)

35 else:

36 return 0

37	41

38 month = date.today().strftime('%Y%m')	42 @cached(())

39 subscriptions = []	43 def get_file_stats_template():

40 for fileName in getSubscriptionFiles(data, month).iterkeys():	44 return get_template_environment().get_template(get_config().get("stats", "file PageTemplate"))

41 subscriptions.append({

42 'fileName': fileName,

43 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month),

44 'hits': getDataInt(month, '%s hits' % fileName),

45 'bandwidth': getDataInt(month, '%s bandwidth' % fileName)

46 })

47 subscriptions = sorted(subscriptions, key=lambda s: s['hits'], reverse=True)

48	45

49 file = os.path.join(outputDir, 'index.html')	46 @cached(())

50 template = get_template(get_config().get('subscriptionStats', 'mainPageTemplat e'))	47 def get_file_overview_template():

51 template.stream({'now': time.time(), 'month': month, 'subscriptions': subscrip tions}).dump(file)	48 return get_template_environment().get_template(get_config().get("stats", "file OverviewTemplate"))

52	49

53 def generateSubscriptionPages(data, outputDir):	50 def default_sort(obj):

54 existingSubscriptions = {}	51 return sorted(obj.items(), key=lambda (k,v): v["hits"], reverse=True)

55 template = get_template(get_config().get('subscriptionStats', 'subscriptionPag eTemplate'))

56 for month in data.sections():

57 subscriptions = {}

58 for option in data.options(month):

59 spaceIndex = option.index(' ')

60 if spaceIndex < 0:

61 continue

62 fileName, key = option[0:spaceIndex], option[spaceIndex+1:]

63 existingSubscriptions[fileName] = True

64 if not fileName in subscriptions:

65 subscriptions[fileName] = {

66 'now': time.time(),

67 'month': month,

68 'daysInMonth': (date(int(month[0:4]), int(month[4:]), 1) - timedelta(d ays=1)).day,

69 'currentMonth': month == date.today().strftime('%Y%m'),

70 'fileName': fileName,

71 'overviewURL': 'overview_%s.html' % re.sub(r'\W', '_', fileName),

72 'hits': 0,

73 'bandwidth': 0,

74 'day': {},

75 'weekday': [{'id': i, 'hits': 0, 'bandwidth': 0, 'count': 0}for i in r ange(7)],

76 'hour': {},

77 'country': {},

78 'app': {},

79 'mirror': {},

80 }

81 if key == 'hits' or key == 'bandwidth':

82 subscriptions[fileName][key] = data.getint(month, option)

83 else:

84 match = re.search(r'^(hits\|bandwidth) (day\|hour\|country\|app\|mirror) (.*) $', key)

85 if match:

86 if not match.group(3) in subscriptions[fileName][match.group(2)]:

87 subscriptions[fileName][match.group(2)][match.group(3)] = {

88 'id': match.group(3),

89 'hits': 0,

90 'bandwidth': 0,

91 }

92 if match.group(2) == 'day':

93 subscriptions[fileName][match.group(2)][match.group(3)]['weekday'] = date(int(month[0:4]), int(month[4:]), int(match.group(3))).weekday()

94 if match.group(2) == 'country':

95 if match.group(3) in countryCodes:

96 subscriptions[fileName][match.group(2)][match.group(3)]['name'] = countryCodes[match.group(3)]

97 subscriptions[fileName][match.group(2)][match.group(3)]['image'] = match.group(3)

98 else:

99 subscriptions[fileName][match.group(2)][match.group(3)]['name'] = 'Unknown'

100 subscriptions[fileName][match.group(2)][match.group(3)]['image'] = 'ip'

101 subscriptions[fileName][match.group(2)][match.group(3)][match.group(1) ] = data.getint(month, option)

102	52

103 for subscription in subscriptions.itervalues():	53 def ensure_dir(path):
	Sebastian Noack 2013/08/26 16:05:22 Instead of checking whether the dir exists and creating it if not, I would rather just create it and catch the OSError. This will be faster, since the FS must be queried only once, and you get around the theoretical race condition where the dir doesn't exist when you check for its presence but has suddenly been created by somebody else when you actually try to create it a few ms later.
104 for key in ('day', 'hour'):	54 dir = os.path.dirname(path)

105 subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: int(s['id']))	55 if not os.path.exists(dir):

106 for key in ('country', 'app', 'mirror'):	56 os.makedirs(dir)
	Wladimir Palant 2013/08/27 07:34:28 Fixed. However, this is a common pattern in our code base and will have to be fixed consistently everywhere.
107 subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: s['hits'], reverse=True)

108 for dayInfo in subscription['day']:

109 weekdayInfo = subscription['weekday'][dayInfo['weekday']]

110 weekdayInfo['hits'] = (weekdayInfo['hits'] * weekdayInfo['count'] + dayI nfo['hits']) / (weekdayInfo['count'] + 1)

111 weekdayInfo['bandwidth'] = (weekdayInfo['bandwidth'] * weekdayInfo['coun t'] + dayInfo['bandwidth']) / (weekdayInfo['count'] + 1)

112 weekdayInfo['count'] += 1

113 fileName = 'subscription_%s_%s.html' % (re.sub(r'\W', '_', subscription['f ileName']), month)

114 template.stream(subscription).dump(os.path.join(outputDir, fileName))

115 return existingSubscriptions

116	57

117 def generateOverviewPage(data, outputDir, fileName):	58 def generate_main_page(outputfile, month, url, data):

118 months = []	59 ensure_dir(outputfile)

119 for month in data.sections():	60 get_main_page_template().stream({

120 if data.has_option(month, '%s hits' % fileName) and data.has_option(month, ' %s bandwidth' % fileName):	61 "now": time.time(),

121 months.append({	62 "month": month,

122 'id': month,	63 "url": url,

123 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month) ,	64 "data": data,

124 'hits': data.getint(month, '%s hits' % fileName),	65 }).dump(outputfile)

125 'bandwidth': data.getint(month, '%s bandwidth' % fileName),

126 })

127 months = sorted(months, key=lambda m: m['id'])

128	66

129 file = os.path.join(outputDir, 'overview_%s.html' % re.sub(r'\W', '_', fileNam e))	67 def generate_file_stats(outputfile, month, url, overview_url, data, filter=None, filtered_urls={}):

130 template = get_template(get_config().get('subscriptionStats', 'subscriptionOve rviewTemplate'))	68 ensure_dir(outputfile)

131 template.stream({'now': time.time(), 'fileName': fileName, 'month': months}).d ump(file)	69 get_file_stats_template().stream({

	70 "now": time.time(),

	71 "month": month,

	72 "url": url,

	73 "overview_url": overview_url,

	74 "data": data,

	75 "fields": common.fields,

	76 "filter": filter,

	77 "filtered_urls": filtered_urls

	78 }).dump(outputfile)

	79

	80 def generate_file_overview(outputfile, url, data):

	81 ensure_dir(outputfile)

	82 get_file_overview_template().stream({

	83 "now": time.time(),

	84 "url": url,

	85 "data": data

	86 }).dump(outputfile)

	87

	88 def get_names(dir, needdirectories):

	89 for file in os.listdir(dir):

	90 path = os.path.join(dir, file)

	91 if (needdirectories and os.path.isdir(path)) or (not needdirectories and os. path.isfile(path)):

	92 yield common.filename_decode(file), path

	93

	94 def generate_pages(datadir, outputdir):

	95 for server_type, server_type_dir in get_names(datadir, True):

	96 baseURL = get_config().get("stats", "baseURL_" + server_type)

	97 filedata = {}

	98 current_month = None

	99 for month, month_dir in get_names(server_type_dir, True):

	100 if current_month == None or month > current_month:

	101 current_month = month

	102

	103 for filename, path in get_names(month_dir, False):

	104 filename = re.sub(r"\.json$", "", filename)

	105 with codecs.open(path, "rb", encoding="utf-8") as file:

	106 data = simplejson.load(file)

	107

	108 overview_url = "../../overview-" + common.filename_encode(filename + ".h tml")

	109 filtered_urls = {}

	110 for field in common.fields:

	111 if field["name"] not in data:

	112 continue

	113 # Create filtered views for the first thirty values of a field if they

	114 # have filtered data.

	115 for name, value in get_template_environment().filters["sortfield"](dat a[field["name"]], field)[0:30]:

	116 if filter(lambda k: k not in ("hits", "bandwidth"), value.keys()):
	Sebastian Noack 2013/08/26 16:05:22 No need to create a new list with the keys first. Just call filter(..., value). If value is a dict, it is iterable and produces its keys on iteration. Also, you might want to use itertools.ifilter instead of filter, which doesn't return a pre-computed list but a generator that is evaluated on the fly. Wladimir Palant 2013/08/27 07:34:28 On 2013/08/26 16:05:22, sebastian wrote: > No need to create a new list with the keys first. Just call filter(..., value). > If value is a dict, it is iterable and produces its keys on iteration. Also, you > might want to use itertools.ifilter instead of filter, which doesn't return a > pre-computed list but a generator that is evaluated on the fly. I would rather use value.iterkeys() here - it's obvious what's meant then; with Python dictionaries I'm frequently confused about what I am iterating over. As to itertools.ifilter(), I would rather not complicate the code here since this spot isn't performance-critical. Wladimir Palant 2013/08/27 11:59:47 Reply by Sebastian: It does the same, but isn't actually necessary, because the dict object itself is iterable and will produce its keys on iteration. Wladimir Palant 2013/08/27 12:42:01 On 2013/08/27 11:59:47, Sebastian wrote: > It does the same, but isn't actually necessary, because the dict object itself > is iterable and will produce its keys on iteration. See my reply - I already know that. But I prefer it to be obvious that we are iterating over keys and not items here.
	117 outputfile = os.path.join(outputdir,

	118 common.filename_encode(server_type),

	119 common.filename_encode(month),

	120 common.filename_encode(filename),

	121 "filtered-%s-%s.html" % (

	122 common.filename_encode(field["name"]),

	123 common.filename_encode(name),

	124 ))

	125 generate_file_stats(outputfile, month, baseURL + filename, overvie w_url,

	126 value, filter={"field": field, "value": name})

	127

	128 if not field["name"] in filtered_urls:

	129 filtered_urls[field["name"]] = {}

	130 filtered_urls[field["name"]][name] = outputfile

	131

	132 outputfile = os.path.join(outputdir,

	133 common.filename_encode(server_type),

	134 common.filename_encode(month),

	135 common.filename_encode(filename),

	136 "index.html")

	137 generate_file_stats(outputfile, month, baseURL + filename, overview_url,

	138 data, filtered_urls=filtered_urls)

	139

	140 if filename not in filedata:

	141 filedata[filename] = {}

	142 month_url = (common.filename_encode(month) + "/" +
	Sebastian Noack 2013/08/26 16:05:22 You should use os.path.join() here as well. Wladimir Palant 2013/08/27 07:34:28 On 2013/08/26 16:05:22, sebastian wrote: > You should use os.path.join() here as well. No, definitely not going to use os.path.join() for URLs - this would produce wrong URLs on Windows, for example. And urlparse.urljoin() won't make the logic here more obvious. Wladimir Palant 2013/08/27 11:59:47 Reply by Sebastian: Oh, I didn't notice it was a URL. In that case hardcoded slashes are OK, of course. But I would rather use a format string here, with all the literal parts already included, instead of the plus operator. This is Python, not JS. ;) I only use the plus operator to concatenate two strings. If you have more strings to concatenate into a single string, it is more readable and faster to use either format strings or str.join().
	143 common.filename_encode(filename) + "/" +

	144 "index.html")

	145 filedata[filename][month] = {"url": month_url, "hits": data["hits"], "ba ndwidth": data["bandwidth"]}

	146

	147 monthdata = {}

	148 for filename, data in filedata.iteritems():

	149 outputfile = os.path.join(outputdir,

	150 common.filename_encode(server_type),

	151 "overview-" + common.filename_encode(filename + ".html"))

	152 generate_file_overview(outputfile, baseURL + filename, data)

	153

	154 if current_month in data:

	155 monthdata[filename] = dict(data[current_month])

	156

	157 outputfile = os.path.join(outputdir, common.filename_encode(server_type), "i ndex.html")

	158 generate_main_page(outputfile, current_month, baseURL, monthdata)

132	159

133 if __name__ == '__main__':	160 if __name__ == '__main__':

134 setupStderr()	161 setupStderr()

135	162

136 data = SafeConfigParser()	163 datadir = get_config().get("stats", "dataDirectory")

137 data.read(get_config().get('subscriptionStats', 'mainFile'))	164 outputdir = get_config().get("stats", "outputDirectory")

138	165 generate_pages(datadir, outputdir)

139 outputDir = get_config().get('subscriptionStats', 'outputDirectory')

140 if not os.path.exists(outputDir):

141 os.makedirs(outputDir)

142 generateMainPage(data, outputDir)

143 subscriptions = generateSubscriptionPages(data, outputDir)

144 for fileName in subscriptions.iterkeys():

145 generateOverviewPage(data, outputDir, fileName)

OLD	NEW

« sitescripts/stats/bin/logprocessor.py ('K') | « sitescripts/stats/bin/logprocessor.py ('k') | sitescripts/stats/common.py » ('j') | sitescripts/utils.py » ('J')