Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: sitescripts/stats/bin/pagegenerator.py

Issue 11481051: Update stats processing (Closed)
Patch Set: Improved performance using memoization Created Aug. 29, 2013, 1:39 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « sitescripts/stats/bin/logprocessor.py ('k') | sitescripts/stats/common.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 # coding: utf-8 1 # coding: utf-8
2 2
3 # This file is part of the Adblock Plus web scripts, 3 # This file is part of the Adblock Plus web scripts,
4 # Copyright (C) 2006-2013 Eyeo GmbH 4 # Copyright (C) 2006-2013 Eyeo GmbH
5 # 5 #
6 # Adblock Plus is free software: you can redistribute it and/or modify 6 # Adblock Plus is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License version 3 as 7 # it under the terms of the GNU General Public License version 3 as
8 # published by the Free Software Foundation. 8 # published by the Free Software Foundation.
9 # 9 #
10 # Adblock Plus is distributed in the hope that it will be useful, 10 # Adblock Plus is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details. 13 # GNU General Public License for more details.
14 # 14 #
15 # You should have received a copy of the GNU General Public License 15 # You should have received a copy of the GNU General Public License
16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
17 17
18 import os, re, time 18 import os, re, codecs, simplejson, time, itertools
19 from datetime import date, timedelta 19 from datetime import date
20 from sitescripts.utils import get_config, setupStderr, get_template 20 from sitescripts.utils import get_config, setupStderr, get_custom_template_envir onment, cached
21 from sitescripts.logs.countryCodes import countryCodes 21 import sitescripts.stats.common as common
22 from ConfigParser import SafeConfigParser 22 from sitescripts.stats.countrycodes import countrycodes
23 23
24 def getSubscriptionFiles(data, month): 24 @cached(float("inf"))
25 result = {} 25 def get_template_environment():
26 if data.has_section(month): 26 return get_custom_template_environment({
27 for option in data.options(month): 27 "monthname": lambda value: date(int(value[0:4]), int(value[4:]), 1).strftime ("%b %Y"),
28 result[option[0:option.index(' ')]] = True 28 "weekday": lambda value: ["Monday", "Tuesday", "Wednesday", "Thursday", "Fri day", "Saturday", "Sunday"][int(value)],
29 return result 29 "countryname": lambda value: countrycodes.get(value, "Unknown"),
30 "sortfield": lambda value, field: (field["sort"] if "sort" in field else def ault_sort)(value),
31 "maxhits": lambda items: max(itertools.chain((value["hits"] for key, value i n items), [1])),
32 "maxbandwidth": lambda items: max(itertools.chain((value["bandwidth"] for ke y, value in items), [1])),
33 "sumhits": lambda items: max(sum(value["hits"] for key, value in items), 1),
34 "sumbandwidth": lambda items: max(sum(value["bandwidth"] for key, value in i tems), 1),
35 "isspecial": lambda name, field: field["isspecial"](name) if "isspecial" in field else False,
36 })
30 37
31 def generateMainPage(data, outputDir): 38 @cached(float("inf"))
32 def getDataInt(month, key): 39 def get_main_page_template():
33 if data.has_option(month, key): 40 return get_template_environment().get_template(get_config().get("stats", "main PageTemplate"))
34 return data.getint(month, key)
35 else:
36 return 0
37 41
38 month = date.today().strftime('%Y%m') 42 @cached(float("inf"))
39 subscriptions = [] 43 def get_file_stats_template():
40 for fileName in getSubscriptionFiles(data, month).iterkeys(): 44 return get_template_environment().get_template(get_config().get("stats", "file PageTemplate"))
41 subscriptions.append({
42 'fileName': fileName,
43 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month),
44 'hits': getDataInt(month, '%s hits' % fileName),
45 'bandwidth': getDataInt(month, '%s bandwidth' % fileName)
46 })
47 subscriptions = sorted(subscriptions, key=lambda s: s['hits'], reverse=True)
48 45
49 file = os.path.join(outputDir, 'index.html') 46 @cached(float("inf"))
50 template = get_template(get_config().get('subscriptionStats', 'mainPageTemplat e')) 47 def get_file_overview_template():
51 template.stream({'now': time.time(), 'month': month, 'subscriptions': subscrip tions}).dump(file) 48 return get_template_environment().get_template(get_config().get("stats", "file OverviewTemplate"))
52 49
53 def generateSubscriptionPages(data, outputDir): 50 def default_sort(obj):
54 existingSubscriptions = {} 51 return sorted(obj.items(), key=lambda (k,v): v["hits"], reverse=True)
55 template = get_template(get_config().get('subscriptionStats', 'subscriptionPag eTemplate'))
56 for month in data.sections():
57 subscriptions = {}
58 for option in data.options(month):
59 spaceIndex = option.index(' ')
60 if spaceIndex < 0:
61 continue
62 fileName, key = option[0:spaceIndex], option[spaceIndex+1:]
63 existingSubscriptions[fileName] = True
64 if not fileName in subscriptions:
65 subscriptions[fileName] = {
66 'now': time.time(),
67 'month': month,
68 'daysInMonth': (date(int(month[0:4]), int(month[4:]), 1) - timedelta(d ays=1)).day,
69 'currentMonth': month == date.today().strftime('%Y%m'),
70 'fileName': fileName,
71 'overviewURL': 'overview_%s.html' % re.sub(r'\W', '_', fileName),
72 'hits': 0,
73 'bandwidth': 0,
74 'day': {},
75 'weekday': [{'id': i, 'hits': 0, 'bandwidth': 0, 'count': 0}for i in r ange(7)],
76 'hour': {},
77 'country': {},
78 'app': {},
79 'mirror': {},
80 }
81 if key == 'hits' or key == 'bandwidth':
82 subscriptions[fileName][key] = data.getint(month, option)
83 else:
84 match = re.search(r'^(hits|bandwidth) (day|hour|country|app|mirror) (.*) $', key)
85 if match:
86 if not match.group(3) in subscriptions[fileName][match.group(2)]:
87 subscriptions[fileName][match.group(2)][match.group(3)] = {
88 'id': match.group(3),
89 'hits': 0,
90 'bandwidth': 0,
91 }
92 if match.group(2) == 'day':
93 subscriptions[fileName][match.group(2)][match.group(3)]['weekday'] = date(int(month[0:4]), int(month[4:]), int(match.group(3))).weekday()
94 if match.group(2) == 'country':
95 if match.group(3) in countryCodes:
96 subscriptions[fileName][match.group(2)][match.group(3)]['name'] = countryCodes[match.group(3)]
97 subscriptions[fileName][match.group(2)][match.group(3)]['image'] = match.group(3)
98 else:
99 subscriptions[fileName][match.group(2)][match.group(3)]['name'] = 'Unknown'
100 subscriptions[fileName][match.group(2)][match.group(3)]['image'] = 'ip'
101 subscriptions[fileName][match.group(2)][match.group(3)][match.group(1) ] = data.getint(month, option)
102 52
103 for subscription in subscriptions.itervalues(): 53 def ensure_dir(path):
104 for key in ('day', 'hour'): 54 dir = os.path.dirname(path)
105 subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: int(s['id'])) 55 try:
106 for key in ('country', 'app', 'mirror'): 56 os.makedirs(dir)
107 subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: s['hits'], reverse=True) 57 except OSError:
108 for dayInfo in subscription['day']: 58 pass
109 weekdayInfo = subscription['weekday'][dayInfo['weekday']]
110 weekdayInfo['hits'] = (weekdayInfo['hits'] * weekdayInfo['count'] + dayI nfo['hits']) / (weekdayInfo['count'] + 1)
111 weekdayInfo['bandwidth'] = (weekdayInfo['bandwidth'] * weekdayInfo['coun t'] + dayInfo['bandwidth']) / (weekdayInfo['count'] + 1)
112 weekdayInfo['count'] += 1
113 fileName = 'subscription_%s_%s.html' % (re.sub(r'\W', '_', subscription['f ileName']), month)
114 template.stream(subscription).dump(os.path.join(outputDir, fileName))
115 return existingSubscriptions
116 59
117 def generateOverviewPage(data, outputDir, fileName): 60 def generate_main_page(outputfile, month, url, data):
118 months = [] 61 ensure_dir(outputfile)
119 for month in data.sections(): 62 get_main_page_template().stream({
120 if data.has_option(month, '%s hits' % fileName) and data.has_option(month, ' %s bandwidth' % fileName): 63 "now": time.time(),
121 months.append({ 64 "month": month,
122 'id': month, 65 "url": url,
123 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month) , 66 "data": data,
124 'hits': data.getint(month, '%s hits' % fileName), 67 }).dump(outputfile)
125 'bandwidth': data.getint(month, '%s bandwidth' % fileName),
126 })
127 months = sorted(months, key=lambda m: m['id'])
128 68
129 file = os.path.join(outputDir, 'overview_%s.html' % re.sub(r'\W', '_', fileNam e)) 69 def generate_file_stats(outputfile, month, url, overview_url, data, filter=None, filtered_urls={}):
130 template = get_template(get_config().get('subscriptionStats', 'subscriptionOve rviewTemplate')) 70 ensure_dir(outputfile)
131 template.stream({'now': time.time(), 'fileName': fileName, 'month': months}).d ump(file) 71 get_file_stats_template().stream({
72 "now": time.time(),
73 "month": month,
74 "url": url,
75 "overview_url": overview_url,
76 "data": data,
77 "fields": common.fields,
78 "filter": filter,
79 "filtered_urls": filtered_urls
80 }).dump(outputfile)
81
82 def generate_file_overview(outputfile, url, data):
83 ensure_dir(outputfile)
84 get_file_overview_template().stream({
85 "now": time.time(),
86 "url": url,
87 "data": data
88 }).dump(outputfile)
89
90 def get_names(dir, needdirectories):
91 for file in os.listdir(dir):
92 path = os.path.join(dir, file)
93 if (needdirectories and os.path.isdir(path)) or (not needdirectories and os. path.isfile(path)):
94 yield common.filename_decode(file), path
95
96 def generate_pages(datadir, outputdir):
97 for server_type, server_type_dir in get_names(datadir, True):
98 baseURL = get_config().get("stats", "baseURL_" + server_type)
99 filedata = {}
100 current_month = None
101 for month, month_dir in get_names(server_type_dir, True):
102 if current_month == None or month > current_month:
103 current_month = month
104
105 for filename, path in get_names(month_dir, False):
106 filename = re.sub(r"\.json$", "", filename)
107 with codecs.open(path, "rb", encoding="utf-8") as file:
108 data = simplejson.load(file)
109
110 overview_url = "../../overview-" + common.filename_encode(filename + ".h tml")
111 filtered_urls = {}
112 for field in common.fields:
113 if field["name"] not in data:
114 continue
115 # Create filtered views for the first thirty values of a field if they
116 # have filtered data.
117 for name, value in get_template_environment().filters["sortfield"](dat a[field["name"]], field)[0:30]:
118 if filter(lambda k: k not in ("hits", "bandwidth"), value.iterkeys() ):
119 outputfile = os.path.join(outputdir,
120 common.filename_encode(server_type),
121 common.filename_encode(month),
122 common.filename_encode(filename),
123 "filtered-%s-%s.html" % (
124 common.filename_encode(field["name"]),
125 common.filename_encode(name),
126 ))
127 generate_file_stats(outputfile, month, baseURL + filename, overvie w_url,
128 value, filter={"field": field, "value": name})
129
130 if not field["name"] in filtered_urls:
131 filtered_urls[field["name"]] = {}
132 filtered_urls[field["name"]][name] = os.path.basename(outputfile)
133
134 outputfile = os.path.join(outputdir,
135 common.filename_encode(server_type),
136 common.filename_encode(month),
137 common.filename_encode(filename),
138 "index.html")
139 generate_file_stats(outputfile, month, baseURL + filename, overview_url,
140 data, filtered_urls=filtered_urls)
141
142 if filename not in filedata:
143 filedata[filename] = {}
144 month_url = "%s/%s/%s" % (common.filename_encode(month),
145 common.filename_encode(filename),
146 "index.html")
147 filedata[filename][month] = {"url": month_url, "hits": data["hits"], "ba ndwidth": data["bandwidth"]}
148
149 monthdata = {}
150 for filename, data in filedata.iteritems():
151 outputfile = os.path.join(outputdir,
152 common.filename_encode(server_type),
153 "overview-" + common.filename_encode(filename + ".html"))
154 generate_file_overview(outputfile, baseURL + filename, data)
155
156 if current_month in data:
157 monthdata[filename] = dict(data[current_month])
158
159 outputfile = os.path.join(outputdir, common.filename_encode(server_type), "i ndex.html")
160 generate_main_page(outputfile, current_month, baseURL, monthdata)
132 161
133 if __name__ == '__main__': 162 if __name__ == '__main__':
134 setupStderr() 163 setupStderr()
135 164
136 data = SafeConfigParser() 165 datadir = get_config().get("stats", "dataDirectory")
137 data.read(get_config().get('subscriptionStats', 'mainFile')) 166 outputdir = get_config().get("stats", "outputDirectory")
138 167 generate_pages(datadir, outputdir)
139 outputDir = get_config().get('subscriptionStats', 'outputDirectory')
140 if not os.path.exists(outputDir):
141 os.makedirs(outputDir)
142 generateMainPage(data, outputDir)
143 subscriptions = generateSubscriptionPages(data, outputDir)
144 for fileName in subscriptions.iterkeys():
145 generateOverviewPage(data, outputDir, fileName)
OLDNEW
« no previous file with comments | « sitescripts/stats/bin/logprocessor.py ('k') | sitescripts/stats/common.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld