Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: sitescripts/stats/bin/pagegenerator.py

Issue 11481051: Update stats processing (Closed)
Patch Set: Created Aug. 23, 2013, 3:53 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
OLDNEW
1 # coding: utf-8 1 # coding: utf-8
2 2
3 # This file is part of the Adblock Plus web scripts, 3 # This file is part of the Adblock Plus web scripts,
4 # Copyright (C) 2006-2013 Eyeo GmbH 4 # Copyright (C) 2006-2013 Eyeo GmbH
5 # 5 #
6 # Adblock Plus is free software: you can redistribute it and/or modify 6 # Adblock Plus is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License version 3 as 7 # it under the terms of the GNU General Public License version 3 as
8 # published by the Free Software Foundation. 8 # published by the Free Software Foundation.
9 # 9 #
10 # Adblock Plus is distributed in the hope that it will be useful, 10 # Adblock Plus is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details. 13 # GNU General Public License for more details.
14 # 14 #
15 # You should have received a copy of the GNU General Public License 15 # You should have received a copy of the GNU General Public License
16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
17 17
18 import os, re, time 18 import os, re, codecs, simplejson, time, itertools
19 from datetime import date, timedelta 19 from datetime import date
20 from sitescripts.utils import get_config, setupStderr, get_template 20 from sitescripts.utils import get_config, setupStderr, get_custom_template_envir onment, cached
21 from sitescripts.logs.countryCodes import countryCodes 21 import sitescripts.stats.common as common
22 from ConfigParser import SafeConfigParser 22 from sitescripts.stats.countrycodes import countrycodes
23 23
24 def getSubscriptionFiles(data, month): 24 @cached(())
25 result = {} 25 def get_template_environment():
26 if data.has_section(month): 26 return get_custom_template_environment({
27 for option in data.options(month): 27 "monthname": lambda value: date(int(value[0:4]), int(value[4:]), 1).strftime ("%b %Y"),
28 result[option[0:option.index(' ')]] = True 28 "countryname": lambda value: countrycodes.get(value, "Unknown"),
29 return result 29 "sortfield": lambda value, field: (field["sort"] if "sort" in field else def ault_sort)(value),
30 "maxhits": lambda items: max(itertools.chain((value["hits"] for key, value i n items), [1])),
31 "maxbandwidth": lambda items: max(itertools.chain((value["bandwidth"] for ke y, value in items), [1])),
32 "sumhits": lambda items: max(sum(value["hits"] for key, value in items), 1),
33 "sumbandwidth": lambda items: max(sum(value["bandwidth"] for key, value in i tems), 1),
34 "isspecial": lambda name, field: field["isspecial"](name) if "isspecial" in field else False,
35 })
30 36
31 def generateMainPage(data, outputDir): 37 @cached(())
32 def getDataInt(month, key): 38 def get_main_page_template():
33 if data.has_option(month, key): 39 return get_template_environment().get_template(get_config().get("stats", "main PageTemplate"))
34 return data.getint(month, key)
35 else:
36 return 0
37 40
38 month = date.today().strftime('%Y%m') 41 @cached(())
39 subscriptions = [] 42 def get_file_stats_template():
40 for fileName in getSubscriptionFiles(data, month).iterkeys(): 43 return get_template_environment().get_template(get_config().get("stats", "file PageTemplate"))
41 subscriptions.append({
42 'fileName': fileName,
43 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month),
44 'hits': getDataInt(month, '%s hits' % fileName),
45 'bandwidth': getDataInt(month, '%s bandwidth' % fileName)
46 })
47 subscriptions = sorted(subscriptions, key=lambda s: s['hits'], reverse=True)
48 44
49 file = os.path.join(outputDir, 'index.html') 45 @cached(())
50 template = get_template(get_config().get('subscriptionStats', 'mainPageTemplat e')) 46 def get_file_overview_template():
51 template.stream({'now': time.time(), 'month': month, 'subscriptions': subscrip tions}).dump(file) 47 return get_template_environment().get_template(get_config().get("stats", "file OverviewTemplate"))
52 48
53 def generateSubscriptionPages(data, outputDir): 49 def default_sort(obj):
54 existingSubscriptions = {} 50 return sorted(obj.items(), key=lambda (k,v): v["hits"], reverse=True)
55 template = get_template(get_config().get('subscriptionStats', 'subscriptionPag eTemplate'))
56 for month in data.sections():
57 subscriptions = {}
58 for option in data.options(month):
59 spaceIndex = option.index(' ')
60 if spaceIndex < 0:
61 continue
62 fileName, key = option[0:spaceIndex], option[spaceIndex+1:]
63 existingSubscriptions[fileName] = True
64 if not fileName in subscriptions:
65 subscriptions[fileName] = {
66 'now': time.time(),
67 'month': month,
68 'daysInMonth': (date(int(month[0:4]), int(month[4:]), 1) - timedelta(d ays=1)).day,
69 'currentMonth': month == date.today().strftime('%Y%m'),
70 'fileName': fileName,
71 'overviewURL': 'overview_%s.html' % re.sub(r'\W', '_', fileName),
72 'hits': 0,
73 'bandwidth': 0,
74 'day': {},
75 'weekday': [{'id': i, 'hits': 0, 'bandwidth': 0, 'count': 0}for i in r ange(7)],
76 'hour': {},
77 'country': {},
78 'app': {},
79 'mirror': {},
80 }
81 if key == 'hits' or key == 'bandwidth':
82 subscriptions[fileName][key] = data.getint(month, option)
83 else:
84 match = re.search(r'^(hits|bandwidth) (day|hour|country|app|mirror) (.*) $', key)
85 if match:
86 if not match.group(3) in subscriptions[fileName][match.group(2)]:
87 subscriptions[fileName][match.group(2)][match.group(3)] = {
88 'id': match.group(3),
89 'hits': 0,
90 'bandwidth': 0,
91 }
92 if match.group(2) == 'day':
93 subscriptions[fileName][match.group(2)][match.group(3)]['weekday'] = date(int(month[0:4]), int(month[4:]), int(match.group(3))).weekday()
94 if match.group(2) == 'country':
95 if match.group(3) in countryCodes:
96 subscriptions[fileName][match.group(2)][match.group(3)]['name'] = countryCodes[match.group(3)]
97 subscriptions[fileName][match.group(2)][match.group(3)]['image'] = match.group(3)
98 else:
99 subscriptions[fileName][match.group(2)][match.group(3)]['name'] = 'Unknown'
100 subscriptions[fileName][match.group(2)][match.group(3)]['image'] = 'ip'
101 subscriptions[fileName][match.group(2)][match.group(3)][match.group(1) ] = data.getint(month, option)
102 51
103 for subscription in subscriptions.itervalues(): 52 def ensure_dir(path):
104 for key in ('day', 'hour'): 53 dir = os.path.dirname(path)
105 subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: int(s['id'])) 54 if not os.path.exists(dir):
106 for key in ('country', 'app', 'mirror'): 55 os.makedirs(dir)
107 subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: s['hits'], reverse=True)
108 for dayInfo in subscription['day']:
109 weekdayInfo = subscription['weekday'][dayInfo['weekday']]
110 weekdayInfo['hits'] = (weekdayInfo['hits'] * weekdayInfo['count'] + dayI nfo['hits']) / (weekdayInfo['count'] + 1)
111 weekdayInfo['bandwidth'] = (weekdayInfo['bandwidth'] * weekdayInfo['coun t'] + dayInfo['bandwidth']) / (weekdayInfo['count'] + 1)
112 weekdayInfo['count'] += 1
113 fileName = 'subscription_%s_%s.html' % (re.sub(r'\W', '_', subscription['f ileName']), month)
114 template.stream(subscription).dump(os.path.join(outputDir, fileName))
115 return existingSubscriptions
116 56
117 def generateOverviewPage(data, outputDir, fileName): 57 def generate_main_page(outputfile, month, url, data):
118 months = [] 58 ensure_dir(outputfile)
119 for month in data.sections(): 59 get_main_page_template().stream({
120 if data.has_option(month, '%s hits' % fileName) and data.has_option(month, ' %s bandwidth' % fileName): 60 "now": time.time(),
121 months.append({ 61 "month": month,
122 'id': month, 62 "url": url,
123 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month) , 63 "data": data,
124 'hits': data.getint(month, '%s hits' % fileName), 64 }).dump(outputfile)
125 'bandwidth': data.getint(month, '%s bandwidth' % fileName),
126 })
127 months = sorted(months, key=lambda m: m['id'])
128 65
129 file = os.path.join(outputDir, 'overview_%s.html' % re.sub(r'\W', '_', fileNam e)) 66 def generate_file_stats(outputfile, month, url, overview_url, data, filter=None, filtered_urls={}):
130 template = get_template(get_config().get('subscriptionStats', 'subscriptionOve rviewTemplate')) 67 ensure_dir(outputfile)
131 template.stream({'now': time.time(), 'fileName': fileName, 'month': months}).d ump(file) 68 get_file_stats_template().stream({
69 "now": time.time(),
70 "month": month,
71 "url": url,
72 "overview_url": overview_url,
73 "data": data,
74 "fields": common.fields,
75 "filter": filter,
76 "filtered_urls": filtered_urls
77 }).dump(outputfile)
78
79 def generate_file_overview(outputfile, url, data):
80 ensure_dir(outputfile)
81 get_file_overview_template().stream({
82 "now": time.time(),
83 "url": url,
84 "data": data
85 }).dump(outputfile)
86
87 def get_names(dir, needdirectories):
88 for file in os.listdir(dir):
89 path = os.path.join(dir, file)
90 if (needdirectories and os.path.isdir(path)) or (not needdirectories and os. path.isfile(path)):
91 yield common.filename_decode(file), path
92
93 def generate_pages(datadir, outputdir):
94 for server_type, server_type_dir in get_names(datadir, True):
95 baseURL = get_config().get("stats", "baseURL_" + server_type)
96 filedata = {}
97 current_month = None
98 for month, month_dir in get_names(server_type_dir, True):
99 if current_month == None or month > current_month:
100 current_month = month
101
102 for filename, path in get_names(month_dir, False):
103 filename = re.sub(r"\.json$", "", filename)
104 with codecs.open(path, "rb", encoding="utf-8") as file:
105 data = simplejson.load(file)
106
107 overview_url = "../../overview-" + common.filename_encode(filename + ".h tml")
108 filtered_urls = {}
109 for field in common.fields:
110 if field["name"] not in data:
111 continue
112 # Create filtered views for the first thirty values of a field if they
113 # have filtered data.
114 for name, value in get_template_environment().filters["sortfield"](dat a[field["name"]], field)[0:30]:
115 if filter(lambda k: k not in ("hits", "bandwidth"), value.keys()):
116 outputfile = os.path.join(outputdir,
117 common.filename_encode(server_type),
118 common.filename_encode(month),
119 common.filename_encode(filename),
120 "filtered-%s-%s.html" % (
121 common.filename_encode(field["name"]),
122 common.filename_encode(name),
123 ))
124 generate_file_stats(outputfile, month, baseURL + filename, overvie w_url,
125 value, filter={"field": field, "value": name})
126
127 if not field["name"] in filtered_urls:
128 filtered_urls[field["name"]] = {}
129 filtered_urls[field["name"]][name] = outputfile
130
131 outputfile = os.path.join(outputdir,
132 common.filename_encode(server_type),
133 common.filename_encode(month),
134 common.filename_encode(filename),
135 "index.html")
136 generate_file_stats(outputfile, month, baseURL + filename, overview_url,
137 data, filtered_urls=filtered_urls)
138
139 if filename not in filedata:
140 filedata[filename] = {}
141 month_url = (common.filename_encode(month) + "/" +
142 common.filename_encode(filename) + "/" +
143 "index.html")
144 filedata[filename][month] = {"url": month_url, "hits": data["hits"], "ba ndwidth": data["bandwidth"]}
145
146 monthdata = {}
147 for filename, data in filedata.iteritems():
148 outputfile = os.path.join(outputdir,
149 common.filename_encode(server_type),
150 "overview-" + common.filename_encode(filename + ".html"))
151 generate_file_overview(outputfile, baseURL + filename, data)
152
153 if current_month in data:
154 monthdata[filename] = dict(data[current_month])
155
156 outputfile = os.path.join(outputdir, common.filename_encode(server_type), "i ndex.html")
157 generate_main_page(outputfile, current_month, baseURL, monthdata)
132 158
133 if __name__ == '__main__': 159 if __name__ == '__main__':
134 setupStderr() 160 setupStderr()
135 161
136 data = SafeConfigParser() 162 datadir = get_config().get("stats", "dataDirectory")
137 data.read(get_config().get('subscriptionStats', 'mainFile')) 163 outputdir = get_config().get("stats", "outputDirectory")
138 164 generate_pages(datadir, outputdir)
139 outputDir = get_config().get('subscriptionStats', 'outputDirectory')
140 if not os.path.exists(outputDir):
141 os.makedirs(outputDir)
142 generateMainPage(data, outputDir)
143 subscriptions = generateSubscriptionPages(data, outputDir)
144 for fileName in subscriptions.iterkeys():
145 generateOverviewPage(data, outputDir, fileName)
OLDNEW

Powered by Google App Engine
This is Rietveld