sitescripts/stats/bin/pagegenerator.py - Issue 11481051: Update stats processing

Delta Between Two Patch Sets: sitescripts/stats/bin/pagegenerator.py

Issue 11481051: Update stats processing (Closed)

Left Patch Set: Fixed two presentation issues Created Aug. 24, 2013, 1:11 p.m.

Right Patch Set: Improved performance using memoization Created Aug. 29, 2013, 1:39 p.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

Left: Side by side diff | Download
Right: Side by side diff | Download

LEFT	RIGHT
1 # coding: utf-8	1 # coding: utf-8

2	2

3 # This file is part of the Adblock Plus web scripts,	3 # This file is part of the Adblock Plus web scripts,

4 # Copyright (C) 2006-2013 Eyeo GmbH	4 # Copyright (C) 2006-2013 Eyeo GmbH

5 #	5 #

6 # Adblock Plus is free software: you can redistribute it and/or modify	6 # Adblock Plus is free software: you can redistribute it and/or modify

7 # it under the terms of the GNU General Public License version 3 as	7 # it under the terms of the GNU General Public License version 3 as

8 # published by the Free Software Foundation.	8 # published by the Free Software Foundation.

9 #	9 #

10 # Adblock Plus is distributed in the hope that it will be useful,	10 # Adblock Plus is distributed in the hope that it will be useful,

11 # but WITHOUT ANY WARRANTY; without even the implied warranty of	11 # but WITHOUT ANY WARRANTY; without even the implied warranty of

12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

13 # GNU General Public License for more details.	13 # GNU General Public License for more details.

14 #	14 #

15 # You should have received a copy of the GNU General Public License	15 # You should have received a copy of the GNU General Public License

16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

17	17

18 import os, re, codecs, simplejson, time, itertools	18 import os, re, codecs, simplejson, time, itertools

19 from datetime import date	19 from datetime import date

20 from sitescripts.utils import get_config, setupStderr, get_custom_template_envir onment, cached	20 from sitescripts.utils import get_config, setupStderr, get_custom_template_envir onment, cached

21 import sitescripts.stats.common as common	21 import sitescripts.stats.common as common

22 from sitescripts.stats.countrycodes import countrycodes	22 from sitescripts.stats.countrycodes import countrycodes

23	23

24 @cached(())	24 @cached(float("inf"))
Sebastian Noack 2013/08/26 16:05:22 You passed an empty tuple as the timeout. Obviously you want the timeout to last forever, but this isn't the right way to do it. It does work in at least recent versions of CPython, but that is just luck, since the Python language spec doesn't specify that a tuple always compares larger than an int/float, as far as I know. So it might fail with other Python implementations or future versions of CPython. Why don't you just use float('Inf')?
Wladimir Palant 2013/08/27 07:34:28 This behavior is somewhat specified, as described at http://stackoverflow.com/a/3270689/785541. Anyway, float("inf") was only introduced in Python 2.6, and this code is older than that. I followed the advice to use an empty tuple because I liked it better than using an arbitrary number like 1e400. Fixed.
25 def get_template_environment():	25 def get_template_environment():

26 return get_custom_template_environment({	26 return get_custom_template_environment({

27 "monthname": lambda value: date(int(value[0:4]), int(value[4:]), 1).strftime ("%b %Y"),	27 "monthname": lambda value: date(int(value[0:4]), int(value[4:]), 1).strftime ("%b %Y"),

28 "weekday": lambda value: ["Monday", "Tuesday", "Wednesday", "Thursday", "Fri day", "Saturday", "Sunday"][int(value)],	28 "weekday": lambda value: ["Monday", "Tuesday", "Wednesday", "Thursday", "Fri day", "Saturday", "Sunday"][int(value)],

29 "countryname": lambda value: countrycodes.get(value, "Unknown"),	29 "countryname": lambda value: countrycodes.get(value, "Unknown"),

30 "sortfield": lambda value, field: (field["sort"] if "sort" in field else def ault_sort)(value),	30 "sortfield": lambda value, field: (field["sort"] if "sort" in field else def ault_sort)(value),

31 "maxhits": lambda items: max(itertools.chain((value["hits"] for key, value i n items), [1])),	31 "maxhits": lambda items: max(itertools.chain((value["hits"] for key, value i n items), [1])),

32 "maxbandwidth": lambda items: max(itertools.chain((value["bandwidth"] for ke y, value in items), [1])),	32 "maxbandwidth": lambda items: max(itertools.chain((value["bandwidth"] for ke y, value in items), [1])),

33 "sumhits": lambda items: max(sum(value["hits"] for key, value in items), 1),	33 "sumhits": lambda items: max(sum(value["hits"] for key, value in items), 1),

34 "sumbandwidth": lambda items: max(sum(value["bandwidth"] for key, value in i tems), 1),	34 "sumbandwidth": lambda items: max(sum(value["bandwidth"] for key, value in i tems), 1),

35 "isspecial": lambda name, field: field["isspecial"](name) if "isspecial" in field else False,	35 "isspecial": lambda name, field: field["isspecial"](name) if "isspecial" in field else False,

36 })	36 })

37	37

38 @cached(())	38 @cached(float("inf"))

39 def get_main_page_template():	39 def get_main_page_template():

40 return get_template_environment().get_template(get_config().get("stats", "main PageTemplate"))	40 return get_template_environment().get_template(get_config().get("stats", "main PageTemplate"))

41	41

42 @cached(())	42 @cached(float("inf"))

43 def get_file_stats_template():	43 def get_file_stats_template():

44 return get_template_environment().get_template(get_config().get("stats", "file PageTemplate"))	44 return get_template_environment().get_template(get_config().get("stats", "file PageTemplate"))

45	45

46 @cached(())	46 @cached(float("inf"))

47 def get_file_overview_template():	47 def get_file_overview_template():

48 return get_template_environment().get_template(get_config().get("stats", "file OverviewTemplate"))	48 return get_template_environment().get_template(get_config().get("stats", "file OverviewTemplate"))

49	49

50 def default_sort(obj):	50 def default_sort(obj):

51 return sorted(obj.items(), key=lambda (k,v): v["hits"], reverse=True)	51 return sorted(obj.items(), key=lambda (k,v): v["hits"], reverse=True)

52	52

53 def ensure_dir(path):	53 def ensure_dir(path):
Sebastian Noack 2013/08/26 16:05:22 Instead of checking whether the dir exists and creating it if not, I would rather just create it and catch the OSError. This will be faster, since the FS must be queried only once, and you get around the theoretical race condition where the dir doesn't exist when you check for its presence but has suddenly been created by somebody else when you actually try to create it a few ms later.
54 dir = os.path.dirname(path)	54 dir = os.path.dirname(path)

55 if not os.path.exists(dir):	55 try:

56 os.makedirs(dir)	56 os.makedirs(dir)
Wladimir Palant 2013/08/27 07:34:28 Fixed. However, this is a common pattern in our code base and will have to be fixed consistently everywhere.
	57 except OSError:

	58 pass

57	59

58 def generate_main_page(outputfile, month, url, data):	60 def generate_main_page(outputfile, month, url, data):

59 ensure_dir(outputfile)	61 ensure_dir(outputfile)

60 get_main_page_template().stream({	62 get_main_page_template().stream({

61 "now": time.time(),	63 "now": time.time(),

62 "month": month,	64 "month": month,

63 "url": url,	65 "url": url,

64 "data": data,	66 "data": data,

65 }).dump(outputfile)	67 }).dump(outputfile)

66	68

(...skipping 39 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
106 data = simplejson.load(file)	108 data = simplejson.load(file)

107	109

108 overview_url = "../../overview-" + common.filename_encode(filename + ".h tml")	110 overview_url = "../../overview-" + common.filename_encode(filename + ".h tml")

109 filtered_urls = {}	111 filtered_urls = {}

110 for field in common.fields:	112 for field in common.fields:

111 if field["name"] not in data:	113 if field["name"] not in data:

112 continue	114 continue

113 # Create filtered views for the first thirty values of a field if they	115 # Create filtered views for the first thirty values of a field if they

114 # have filtered data.	116 # have filtered data.

115 for name, value in get_template_environment().filters["sortfield"](dat a[field["name"]], field)[0:30]:	117 for name, value in get_template_environment().filters["sortfield"](dat a[field["name"]], field)[0:30]:

116 if filter(lambda k: k not in ("hits", "bandwidth"), value.keys()):	118 if filter(lambda k: k not in ("hits", "bandwidth"), value.iterkeys() ):
Sebastian Noack 2013/08/26 16:05:22 No need to create a new list with the keys first. No need to create a new list with the keys first. Just call filter(..., value). If value is a dict, it is iterable, and produces its keys on iteration. Also you might want to use itertools.ifilter instead of filter, which doesn't returns a pre-computed list, but a generator that is evaluated on the fly. Wladimir Palant 2013/08/27 07:34:28 I rather use value.iterkeys() here - it's obvious Show quoted text On 2013/08/26 16:05:22, sebastian wrote: > No need to create a new list with the keys first. Just call filter(..., value). > If value is a dict, it is iterable, and produces its keys on iteration. Also you > might want to use itertools.ifilter instead of filter, which doesn't returns a > pre-computed list, but a generator that is evaluated on the fly. I rather use value.iterkeys() here - it's obvious what's meant then, with Python dictionaries I'm frequently confused what I am iterating on. As to itertools.ifilter(), I would rather not complicate the code here since this spot isn't performance-critical. Wladimir Palant 2013/08/27 11:59:47 Reply by Sebastian: It does the same, but isn't a Reply by Sebastian: It does the same, but isn't actually necessary, because the dict object itself is iterable and will produce it keys on iteration. Wladimir Palant 2013/08/27 12:42:01 See my reply - I already know that. But I prefer i Show quoted text On 2013/08/27 11:59:47, Sebasitian wrote: > It does the same, but isn't actually necessary, because the dict object itself > is iterable and will produce it keys on iteration. See my reply - I already know that. But I prefer it to be obvious that we are iterating over keys and not items here.
117 outputfile = os.path.join(outputdir,	119 outputfile = os.path.join(outputdir,

118 common.filename_encode(server_type),	120 common.filename_encode(server_type),

119 common.filename_encode(month),	121 common.filename_encode(month),

120 common.filename_encode(filename),	122 common.filename_encode(filename),

121 "filtered-%s-%s.html" % (	123 "filtered-%s-%s.html" % (

122 common.filename_encode(field["name"]),	124 common.filename_encode(field["name"]),

123 common.filename_encode(name),	125 common.filename_encode(name),

124 ))	126 ))

125 generate_file_stats(outputfile, month, baseURL + filename, overvie w_url,	127 generate_file_stats(outputfile, month, baseURL + filename, overvie w_url,

126 value, filter={"field": field, "value": name})	128 value, filter={"field": field, "value": name})

127	129

128 if not field["name"] in filtered_urls:	130 if not field["name"] in filtered_urls:

129 filtered_urls[field["name"]] = {}	131 filtered_urls[field["name"]] = {}

130 filtered_urls[field["name"]][name] = outputfile	132 filtered_urls[field["name"]][name] = os.path.basename(outputfile)

131	133

132 outputfile = os.path.join(outputdir,	134 outputfile = os.path.join(outputdir,

133 common.filename_encode(server_type),	135 common.filename_encode(server_type),

134 common.filename_encode(month),	136 common.filename_encode(month),

135 common.filename_encode(filename),	137 common.filename_encode(filename),

136 "index.html")	138 "index.html")

137 generate_file_stats(outputfile, month, baseURL + filename, overview_url,	139 generate_file_stats(outputfile, month, baseURL + filename, overview_url,

138 data, filtered_urls=filtered_urls)	140 data, filtered_urls=filtered_urls)

139	141

140 if filename not in filedata:	142 if filename not in filedata:

141 filedata[filename] = {}	143 filedata[filename] = {}

142 month_url = (common.filename_encode(month) + "/" +	144 month_url = "%s/%s/%s" % (common.filename_encode(month),
Sebastian Noack 2013/08/26 16:05:22 You should use os.path.join() here as well. You should use os.path.join() here as well. Wladimir Palant 2013/08/27 07:34:28 No, definitely not going to use os.path.join() for Show quoted text On 2013/08/26 16:05:22, sebastian wrote: > You should use os.path.join() here as well. No, definitely not going to use os.path.join() for URLs - this would produce wrong URLs on Windows for example. And urlparse.urljoin() won't make the logic here more obvious. Wladimir Palant 2013/08/27 11:59:47 Reply by Sebastian: Oh, didn't noted, it was an u Reply by Sebastian: Oh, didn't noted, it was an url. In that case hardcoded slashes are ok, of course. But I would rather use a format string here, with all the literal parts already included, instead the plus operator. This is Python, not JS. ;) I only use the plus operator to concatenate two strings. If you have more strings to concatenate into a single string, it is more readable and faster to either use format strings or str.join().
143 common.filename_encode(filename) + "/" +	145 common.filename_encode(filename),

144 "index.html")	146 "index.html")

145 filedata[filename][month] = {"url": month_url, "hits": data["hits"], "ba ndwidth": data["bandwidth"]}	147 filedata[filename][month] = {"url": month_url, "hits": data["hits"], "ba ndwidth": data["bandwidth"]}

146	148

147 monthdata = {}	149 monthdata = {}

148 for filename, data in filedata.iteritems():	150 for filename, data in filedata.iteritems():

149 outputfile = os.path.join(outputdir,	151 outputfile = os.path.join(outputdir,

150 common.filename_encode(server_type),	152 common.filename_encode(server_type),

151 "overview-" + common.filename_encode(filename + ".html"))	153 "overview-" + common.filename_encode(filename + ".html"))

152 generate_file_overview(outputfile, baseURL + filename, data)	154 generate_file_overview(outputfile, baseURL + filename, data)

153	155

154 if current_month in data:	156 if current_month in data:

155 monthdata[filename] = dict(data[current_month])	157 monthdata[filename] = dict(data[current_month])

156	158

157 outputfile = os.path.join(outputdir, common.filename_encode(server_type), "i ndex.html")	159 outputfile = os.path.join(outputdir, common.filename_encode(server_type), "i ndex.html")

158 generate_main_page(outputfile, current_month, baseURL, monthdata)	160 generate_main_page(outputfile, current_month, baseURL, monthdata)

159	161

160 if __name__ == '__main__':	162 if __name__ == '__main__':

161 setupStderr()	163 setupStderr()

162	164

163 datadir = get_config().get("stats", "dataDirectory")	165 datadir = get_config().get("stats", "dataDirectory")

164 outputdir = get_config().get("stats", "outputDirectory")	166 outputdir = get_config().get("stats", "outputDirectory")

165 generate_pages(datadir, outputdir)	167 generate_pages(datadir, outputdir)

LEFT	RIGHT