Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: sitescripts/stats/bin/pagegenerator.py

Issue 11481051: Update stats processing (Closed)
Patch Set: Fixed two presentation issues Created Aug. 24, 2013, 1:11 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
OLDNEW
1 # coding: utf-8 1 # coding: utf-8
2 2
3 # This file is part of the Adblock Plus web scripts, 3 # This file is part of the Adblock Plus web scripts,
4 # Copyright (C) 2006-2013 Eyeo GmbH 4 # Copyright (C) 2006-2013 Eyeo GmbH
5 # 5 #
6 # Adblock Plus is free software: you can redistribute it and/or modify 6 # Adblock Plus is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License version 3 as 7 # it under the terms of the GNU General Public License version 3 as
8 # published by the Free Software Foundation. 8 # published by the Free Software Foundation.
9 # 9 #
10 # Adblock Plus is distributed in the hope that it will be useful, 10 # Adblock Plus is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details. 13 # GNU General Public License for more details.
14 # 14 #
15 # You should have received a copy of the GNU General Public License 15 # You should have received a copy of the GNU General Public License
16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
17 17
18 import os, re, time 18 import os, re, codecs, simplejson, time, itertools
19 from datetime import date, timedelta 19 from datetime import date
20 from sitescripts.utils import get_config, setupStderr, get_template 20 from sitescripts.utils import get_config, setupStderr, get_custom_template_environment, cached
21 from sitescripts.logs.countryCodes import countryCodes 21 import sitescripts.stats.common as common
22 from ConfigParser import SafeConfigParser 22 from sitescripts.stats.countrycodes import countrycodes
23 23
24 def getSubscriptionFiles(data, month): 24 @cached(())
Sebastian Noack 2013/08/26 16:05:22 You passed an empty tuple as timeout. Obviously yo
Wladimir Palant 2013/08/27 07:34:28 This behavior is somewhat specified as described h
25 result = {} 25 def get_template_environment():
26 if data.has_section(month): 26 return get_custom_template_environment({
27 for option in data.options(month): 27 "monthname": lambda value: date(int(value[0:4]), int(value[4:]), 1).strftime("%b %Y"),
28 result[option[0:option.index(' ')]] = True 28 "weekday": lambda value: ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"][int(value)],
29 return result 29 "countryname": lambda value: countrycodes.get(value, "Unknown"),
30 "sortfield": lambda value, field: (field["sort"] if "sort" in field else default_sort)(value),
31 "maxhits": lambda items: max(itertools.chain((value["hits"] for key, value in items), [1])),
32 "maxbandwidth": lambda items: max(itertools.chain((value["bandwidth"] for key, value in items), [1])),
33 "sumhits": lambda items: max(sum(value["hits"] for key, value in items), 1),
34 "sumbandwidth": lambda items: max(sum(value["bandwidth"] for key, value in items), 1),
35 "isspecial": lambda name, field: field["isspecial"](name) if "isspecial" in field else False,
36 })
30 37
31 def generateMainPage(data, outputDir): 38 @cached(())
32 def getDataInt(month, key): 39 def get_main_page_template():
33 if data.has_option(month, key): 40 return get_template_environment().get_template(get_config().get("stats", "main PageTemplate"))
34 return data.getint(month, key)
35 else:
36 return 0
37 41
38 month = date.today().strftime('%Y%m') 42 @cached(())
39 subscriptions = [] 43 def get_file_stats_template():
40 for fileName in getSubscriptionFiles(data, month).iterkeys(): 44 return get_template_environment().get_template(get_config().get("stats", "file PageTemplate"))
41 subscriptions.append({
42 'fileName': fileName,
43 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month),
44 'hits': getDataInt(month, '%s hits' % fileName),
45 'bandwidth': getDataInt(month, '%s bandwidth' % fileName)
46 })
47 subscriptions = sorted(subscriptions, key=lambda s: s['hits'], reverse=True)
48 45
49 file = os.path.join(outputDir, 'index.html') 46 @cached(())
50 template = get_template(get_config().get('subscriptionStats', 'mainPageTemplat e')) 47 def get_file_overview_template():
51 template.stream({'now': time.time(), 'month': month, 'subscriptions': subscrip tions}).dump(file) 48 return get_template_environment().get_template(get_config().get("stats", "file OverviewTemplate"))
52 49
53 def generateSubscriptionPages(data, outputDir): 50 def default_sort(obj):
54 existingSubscriptions = {} 51 return sorted(obj.items(), key=lambda (k,v): v["hits"], reverse=True)
55 template = get_template(get_config().get('subscriptionStats', 'subscriptionPag eTemplate'))
56 for month in data.sections():
57 subscriptions = {}
58 for option in data.options(month):
59 spaceIndex = option.index(' ')
60 if spaceIndex < 0:
61 continue
62 fileName, key = option[0:spaceIndex], option[spaceIndex+1:]
63 existingSubscriptions[fileName] = True
64 if not fileName in subscriptions:
65 subscriptions[fileName] = {
66 'now': time.time(),
67 'month': month,
68 'daysInMonth': (date(int(month[0:4]), int(month[4:]), 1) - timedelta(d ays=1)).day,
69 'currentMonth': month == date.today().strftime('%Y%m'),
70 'fileName': fileName,
71 'overviewURL': 'overview_%s.html' % re.sub(r'\W', '_', fileName),
72 'hits': 0,
73 'bandwidth': 0,
74 'day': {},
75 'weekday': [{'id': i, 'hits': 0, 'bandwidth': 0, 'count': 0}for i in r ange(7)],
76 'hour': {},
77 'country': {},
78 'app': {},
79 'mirror': {},
80 }
81 if key == 'hits' or key == 'bandwidth':
82 subscriptions[fileName][key] = data.getint(month, option)
83 else:
84 match = re.search(r'^(hits|bandwidth) (day|hour|country|app|mirror) (.*) $', key)
85 if match:
86 if not match.group(3) in subscriptions[fileName][match.group(2)]:
87 subscriptions[fileName][match.group(2)][match.group(3)] = {
88 'id': match.group(3),
89 'hits': 0,
90 'bandwidth': 0,
91 }
92 if match.group(2) == 'day':
93 subscriptions[fileName][match.group(2)][match.group(3)]['weekday'] = date(int(month[0:4]), int(month[4:]), int(match.group(3))).weekday()
94 if match.group(2) == 'country':
95 if match.group(3) in countryCodes:
96 subscriptions[fileName][match.group(2)][match.group(3)]['name'] = countryCodes[match.group(3)]
97 subscriptions[fileName][match.group(2)][match.group(3)]['image'] = match.group(3)
98 else:
99 subscriptions[fileName][match.group(2)][match.group(3)]['name'] = 'Unknown'
100 subscriptions[fileName][match.group(2)][match.group(3)]['image'] = 'ip'
101 subscriptions[fileName][match.group(2)][match.group(3)][match.group(1) ] = data.getint(month, option)
102 52
103 for subscription in subscriptions.itervalues(): 53 def ensure_dir(path):
Sebastian Noack 2013/08/26 16:05:22 Instead of checking if the dir exists and creating
104 for key in ('day', 'hour'): 54 dir = os.path.dirname(path)
105 subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: int(s['id'])) 55 if not os.path.exists(dir):
106 for key in ('country', 'app', 'mirror'): 56 os.makedirs(dir)
Wladimir Palant 2013/08/27 07:34:28 Fixed. However, this is a common pattern in our co
107 subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: s['hits'], reverse=True)
108 for dayInfo in subscription['day']:
109 weekdayInfo = subscription['weekday'][dayInfo['weekday']]
110 weekdayInfo['hits'] = (weekdayInfo['hits'] * weekdayInfo['count'] + dayI nfo['hits']) / (weekdayInfo['count'] + 1)
111 weekdayInfo['bandwidth'] = (weekdayInfo['bandwidth'] * weekdayInfo['coun t'] + dayInfo['bandwidth']) / (weekdayInfo['count'] + 1)
112 weekdayInfo['count'] += 1
113 fileName = 'subscription_%s_%s.html' % (re.sub(r'\W', '_', subscription['f ileName']), month)
114 template.stream(subscription).dump(os.path.join(outputDir, fileName))
115 return existingSubscriptions
116 57
117 def generateOverviewPage(data, outputDir, fileName): 58 def generate_main_page(outputfile, month, url, data):
118 months = [] 59 ensure_dir(outputfile)
119 for month in data.sections(): 60 get_main_page_template().stream({
120 if data.has_option(month, '%s hits' % fileName) and data.has_option(month, ' %s bandwidth' % fileName): 61 "now": time.time(),
121 months.append({ 62 "month": month,
122 'id': month, 63 "url": url,
123 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month) , 64 "data": data,
124 'hits': data.getint(month, '%s hits' % fileName), 65 }).dump(outputfile)
125 'bandwidth': data.getint(month, '%s bandwidth' % fileName),
126 })
127 months = sorted(months, key=lambda m: m['id'])
128 66
129 file = os.path.join(outputDir, 'overview_%s.html' % re.sub(r'\W', '_', fileNam e)) 67 def generate_file_stats(outputfile, month, url, overview_url, data, filter=None, filtered_urls={}):
130 template = get_template(get_config().get('subscriptionStats', 'subscriptionOve rviewTemplate')) 68 ensure_dir(outputfile)
131 template.stream({'now': time.time(), 'fileName': fileName, 'month': months}).d ump(file) 69 get_file_stats_template().stream({
70 "now": time.time(),
71 "month": month,
72 "url": url,
73 "overview_url": overview_url,
74 "data": data,
75 "fields": common.fields,
76 "filter": filter,
77 "filtered_urls": filtered_urls
78 }).dump(outputfile)
79
80 def generate_file_overview(outputfile, url, data):
81 ensure_dir(outputfile)
82 get_file_overview_template().stream({
83 "now": time.time(),
84 "url": url,
85 "data": data
86 }).dump(outputfile)
87
88 def get_names(dir, needdirectories):
89 for file in os.listdir(dir):
90 path = os.path.join(dir, file)
91 if (needdirectories and os.path.isdir(path)) or (not needdirectories and os. path.isfile(path)):
92 yield common.filename_decode(file), path
93
94 def generate_pages(datadir, outputdir):
95 for server_type, server_type_dir in get_names(datadir, True):
96 baseURL = get_config().get("stats", "baseURL_" + server_type)
97 filedata = {}
98 current_month = None
99 for month, month_dir in get_names(server_type_dir, True):
100 if current_month == None or month > current_month:
101 current_month = month
102
103 for filename, path in get_names(month_dir, False):
104 filename = re.sub(r"\.json$", "", filename)
105 with codecs.open(path, "rb", encoding="utf-8") as file:
106 data = simplejson.load(file)
107
108 overview_url = "../../overview-" + common.filename_encode(filename + ".h tml")
109 filtered_urls = {}
110 for field in common.fields:
111 if field["name"] not in data:
112 continue
113 # Create filtered views for the first thirty values of a field if they
114 # have filtered data.
115 for name, value in get_template_environment().filters["sortfield"](dat a[field["name"]], field)[0:30]:
116 if filter(lambda k: k not in ("hits", "bandwidth"), value.keys()):
Sebastian Noack 2013/08/26 16:05:22 No need to create a new list with the keys first.
Wladimir Palant 2013/08/27 07:34:28 I rather use value.iterkeys() here - it's obvious
Wladimir Palant 2013/08/27 11:59:47 Reply by Sebastian: It does the same, but isn't a
Wladimir Palant 2013/08/27 12:42:01 See my reply - I already know that. But I prefer i
117 outputfile = os.path.join(outputdir,
118 common.filename_encode(server_type),
119 common.filename_encode(month),
120 common.filename_encode(filename),
121 "filtered-%s-%s.html" % (
122 common.filename_encode(field["name"]),
123 common.filename_encode(name),
124 ))
125 generate_file_stats(outputfile, month, baseURL + filename, overvie w_url,
126 value, filter={"field": field, "value": name})
127
128 if not field["name"] in filtered_urls:
129 filtered_urls[field["name"]] = {}
130 filtered_urls[field["name"]][name] = outputfile
131
132 outputfile = os.path.join(outputdir,
133 common.filename_encode(server_type),
134 common.filename_encode(month),
135 common.filename_encode(filename),
136 "index.html")
137 generate_file_stats(outputfile, month, baseURL + filename, overview_url,
138 data, filtered_urls=filtered_urls)
139
140 if filename not in filedata:
141 filedata[filename] = {}
142 month_url = (common.filename_encode(month) + "/" +
Sebastian Noack 2013/08/26 16:05:22 You should use os.path.join() here as well.
Wladimir Palant 2013/08/27 07:34:28 No, definitely not going to use os.path.join() for
Wladimir Palant 2013/08/27 11:59:47 Reply by Sebastian: Oh, I didn't notice it was an u
143 common.filename_encode(filename) + "/" +
144 "index.html")
145 filedata[filename][month] = {"url": month_url, "hits": data["hits"], "ba ndwidth": data["bandwidth"]}
146
147 monthdata = {}
148 for filename, data in filedata.iteritems():
149 outputfile = os.path.join(outputdir,
150 common.filename_encode(server_type),
151 "overview-" + common.filename_encode(filename + ".html"))
152 generate_file_overview(outputfile, baseURL + filename, data)
153
154 if current_month in data:
155 monthdata[filename] = dict(data[current_month])
156
157 outputfile = os.path.join(outputdir, common.filename_encode(server_type), "i ndex.html")
158 generate_main_page(outputfile, current_month, baseURL, monthdata)
132 159
133 if __name__ == '__main__': 160 if __name__ == '__main__':
134 setupStderr() 161 setupStderr()
135 162
136 data = SafeConfigParser() 163 datadir = get_config().get("stats", "dataDirectory")
137 data.read(get_config().get('subscriptionStats', 'mainFile')) 164 outputdir = get_config().get("stats", "outputDirectory")
138 165 generate_pages(datadir, outputdir)
139 outputDir = get_config().get('subscriptionStats', 'outputDirectory')
140 if not os.path.exists(outputDir):
141 os.makedirs(outputDir)
142 generateMainPage(data, outputDir)
143 subscriptions = generateSubscriptionPages(data, outputDir)
144 for fileName in subscriptions.iterkeys():
145 generateOverviewPage(data, outputDir, fileName)
OLDNEW

Powered by Google App Engine
This is Rietveld