Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: sitescripts/stats/bin/pagegenerator.py

Issue 11481051: Update stats processing (Closed)
Patch Set: Fixed two presentation issues Created Aug. 24, 2013, 1:11 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: sitescripts/stats/bin/pagegenerator.py
===================================================================
rename from sitescripts/logs/bin/generateStatsPages.py
rename to sitescripts/stats/bin/pagegenerator.py
--- a/sitescripts/logs/bin/generateStatsPages.py
+++ b/sitescripts/stats/bin/pagegenerator.py
@@ -10,136 +10,156 @@
# Adblock Plus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
-import os, re, time
-from datetime import date, timedelta
-from sitescripts.utils import get_config, setupStderr, get_template
-from sitescripts.logs.countryCodes import countryCodes
-from ConfigParser import SafeConfigParser
+import os, re, codecs, simplejson, time, itertools
+from datetime import date
+from sitescripts.utils import get_config, setupStderr, get_custom_template_environment, cached
+import sitescripts.stats.common as common
+from sitescripts.stats.countrycodes import countrycodes
-def getSubscriptionFiles(data, month):
- result = {}
- if data.has_section(month):
- for option in data.options(month):
- result[option[0:option.index(' ')]] = True
- return result
+@cached(())
Sebastian Noack 2013/08/26 16:05:22 You passed an empty tuple as timeout. Obviously yo
Wladimir Palant 2013/08/27 07:34:28 This behavior is somewhat specified as described h
+def get_template_environment():
+ return get_custom_template_environment({
+ "monthname": lambda value: date(int(value[0:4]), int(value[4:]), 1).strftime("%b %Y"),
+ "weekday": lambda value: ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"][int(value)],
+ "countryname": lambda value: countrycodes.get(value, "Unknown"),
+ "sortfield": lambda value, field: (field["sort"] if "sort" in field else default_sort)(value),
+ "maxhits": lambda items: max(itertools.chain((value["hits"] for key, value in items), [1])),
+ "maxbandwidth": lambda items: max(itertools.chain((value["bandwidth"] for key, value in items), [1])),
+ "sumhits": lambda items: max(sum(value["hits"] for key, value in items), 1),
+ "sumbandwidth": lambda items: max(sum(value["bandwidth"] for key, value in items), 1),
+ "isspecial": lambda name, field: field["isspecial"](name) if "isspecial" in field else False,
+ })
-def generateMainPage(data, outputDir):
- def getDataInt(month, key):
- if data.has_option(month, key):
- return data.getint(month, key)
- else:
- return 0
+@cached(())
+def get_main_page_template():
+ return get_template_environment().get_template(get_config().get("stats", "mainPageTemplate"))
- month = date.today().strftime('%Y%m')
- subscriptions = []
- for fileName in getSubscriptionFiles(data, month).iterkeys():
- subscriptions.append({
- 'fileName': fileName,
- 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month),
- 'hits': getDataInt(month, '%s hits' % fileName),
- 'bandwidth': getDataInt(month, '%s bandwidth' % fileName)
- })
- subscriptions = sorted(subscriptions, key=lambda s: s['hits'], reverse=True)
+@cached(())
+def get_file_stats_template():
+ return get_template_environment().get_template(get_config().get("stats", "filePageTemplate"))
- file = os.path.join(outputDir, 'index.html')
- template = get_template(get_config().get('subscriptionStats', 'mainPageTemplate'))
- template.stream({'now': time.time(), 'month': month, 'subscriptions': subscriptions}).dump(file)
+@cached(())
+def get_file_overview_template():
+ return get_template_environment().get_template(get_config().get("stats", "fileOverviewTemplate"))
-def generateSubscriptionPages(data, outputDir):
- existingSubscriptions = {}
- template = get_template(get_config().get('subscriptionStats', 'subscriptionPageTemplate'))
- for month in data.sections():
- subscriptions = {}
- for option in data.options(month):
- spaceIndex = option.index(' ')
- if spaceIndex < 0:
- continue
- fileName, key = option[0:spaceIndex], option[spaceIndex+1:]
- existingSubscriptions[fileName] = True
- if not fileName in subscriptions:
- subscriptions[fileName] = {
- 'now': time.time(),
- 'month': month,
- 'daysInMonth': (date(int(month[0:4]), int(month[4:]), 1) - timedelta(days=1)).day,
- 'currentMonth': month == date.today().strftime('%Y%m'),
- 'fileName': fileName,
- 'overviewURL': 'overview_%s.html' % re.sub(r'\W', '_', fileName),
- 'hits': 0,
- 'bandwidth': 0,
- 'day': {},
- 'weekday': [{'id': i, 'hits': 0, 'bandwidth': 0, 'count': 0}for i in range(7)],
- 'hour': {},
- 'country': {},
- 'app': {},
- 'mirror': {},
- }
- if key == 'hits' or key == 'bandwidth':
- subscriptions[fileName][key] = data.getint(month, option)
- else:
- match = re.search(r'^(hits|bandwidth) (day|hour|country|app|mirror) (.*)$', key)
- if match:
- if not match.group(3) in subscriptions[fileName][match.group(2)]:
- subscriptions[fileName][match.group(2)][match.group(3)] = {
- 'id': match.group(3),
- 'hits': 0,
- 'bandwidth': 0,
- }
- if match.group(2) == 'day':
- subscriptions[fileName][match.group(2)][match.group(3)]['weekday'] = date(int(month[0:4]), int(month[4:]), int(match.group(3))).weekday()
- if match.group(2) == 'country':
- if match.group(3) in countryCodes:
- subscriptions[fileName][match.group(2)][match.group(3)]['name'] = countryCodes[match.group(3)]
- subscriptions[fileName][match.group(2)][match.group(3)]['image'] = match.group(3)
- else:
- subscriptions[fileName][match.group(2)][match.group(3)]['name'] = 'Unknown'
- subscriptions[fileName][match.group(2)][match.group(3)]['image'] = 'ip'
- subscriptions[fileName][match.group(2)][match.group(3)][match.group(1)] = data.getint(month, option)
+def default_sort(obj):
+ return sorted(obj.items(), key=lambda (k,v): v["hits"], reverse=True)
- for subscription in subscriptions.itervalues():
- for key in ('day', 'hour'):
- subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: int(s['id']))
- for key in ('country', 'app', 'mirror'):
- subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: s['hits'], reverse=True)
- for dayInfo in subscription['day']:
- weekdayInfo = subscription['weekday'][dayInfo['weekday']]
- weekdayInfo['hits'] = (weekdayInfo['hits'] * weekdayInfo['count'] + dayInfo['hits']) / (weekdayInfo['count'] + 1)
- weekdayInfo['bandwidth'] = (weekdayInfo['bandwidth'] * weekdayInfo['count'] + dayInfo['bandwidth']) / (weekdayInfo['count'] + 1)
- weekdayInfo['count'] += 1
- fileName = 'subscription_%s_%s.html' % (re.sub(r'\W', '_', subscription['fileName']), month)
- template.stream(subscription).dump(os.path.join(outputDir, fileName))
- return existingSubscriptions
+def ensure_dir(path):
Sebastian Noack 2013/08/26 16:05:22 Instead of checking if the dir exist and creating
+ dir = os.path.dirname(path)
+ if not os.path.exists(dir):
+ os.makedirs(dir)
Wladimir Palant 2013/08/27 07:34:28 Fixed. However, this is a common pattern in our co
-def generateOverviewPage(data, outputDir, fileName):
- months = []
- for month in data.sections():
- if data.has_option(month, '%s hits' % fileName) and data.has_option(month, '%s bandwidth' % fileName):
- months.append({
- 'id': month,
- 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month),
- 'hits': data.getint(month, '%s hits' % fileName),
- 'bandwidth': data.getint(month, '%s bandwidth' % fileName),
- })
- months = sorted(months, key=lambda m: m['id'])
+def generate_main_page(outputfile, month, url, data):
+ ensure_dir(outputfile)
+ get_main_page_template().stream({
+ "now": time.time(),
+ "month": month,
+ "url": url,
+ "data": data,
+ }).dump(outputfile)
- file = os.path.join(outputDir, 'overview_%s.html' % re.sub(r'\W', '_', fileName))
- template = get_template(get_config().get('subscriptionStats', 'subscriptionOverviewTemplate'))
- template.stream({'now': time.time(), 'fileName': fileName, 'month': months}).dump(file)
+def generate_file_stats(outputfile, month, url, overview_url, data, filter=None, filtered_urls={}):
+ ensure_dir(outputfile)
+ get_file_stats_template().stream({
+ "now": time.time(),
+ "month": month,
+ "url": url,
+ "overview_url": overview_url,
+ "data": data,
+ "fields": common.fields,
+ "filter": filter,
+ "filtered_urls": filtered_urls
+ }).dump(outputfile)
+
+def generate_file_overview(outputfile, url, data):
+ ensure_dir(outputfile)
+ get_file_overview_template().stream({
+ "now": time.time(),
+ "url": url,
+ "data": data
+ }).dump(outputfile)
+
+def get_names(dir, needdirectories):
+ for file in os.listdir(dir):
+ path = os.path.join(dir, file)
+ if (needdirectories and os.path.isdir(path)) or (not needdirectories and os.path.isfile(path)):
+ yield common.filename_decode(file), path
+
+def generate_pages(datadir, outputdir):
+ for server_type, server_type_dir in get_names(datadir, True):
+ baseURL = get_config().get("stats", "baseURL_" + server_type)
+ filedata = {}
+ current_month = None
+ for month, month_dir in get_names(server_type_dir, True):
+ if current_month == None or month > current_month:
+ current_month = month
+
+ for filename, path in get_names(month_dir, False):
+ filename = re.sub(r"\.json$", "", filename)
+ with codecs.open(path, "rb", encoding="utf-8") as file:
+ data = simplejson.load(file)
+
+ overview_url = "../../overview-" + common.filename_encode(filename + ".html")
+ filtered_urls = {}
+ for field in common.fields:
+ if field["name"] not in data:
+ continue
+ # Create filtered views for the first thirty values of a field if they
+ # have filtered data.
+ for name, value in get_template_environment().filters["sortfield"](data[field["name"]], field)[0:30]:
+ if filter(lambda k: k not in ("hits", "bandwidth"), value.keys()):
Sebastian Noack 2013/08/26 16:05:22 No need to create a new list with the keys first.
Wladimir Palant 2013/08/27 07:34:28 I rather use value.iterkeys() here - it's obvious
Wladimir Palant 2013/08/27 11:59:47 Reply by Sebastian: It does the same, but isn't a
Wladimir Palant 2013/08/27 12:42:01 See my reply - I already know that. But I prefer i
+ outputfile = os.path.join(outputdir,
+ common.filename_encode(server_type),
+ common.filename_encode(month),
+ common.filename_encode(filename),
+ "filtered-%s-%s.html" % (
+ common.filename_encode(field["name"]),
+ common.filename_encode(name),
+ ))
+ generate_file_stats(outputfile, month, baseURL + filename, overview_url,
+ value, filter={"field": field, "value": name})
+
+ if not field["name"] in filtered_urls:
+ filtered_urls[field["name"]] = {}
+ filtered_urls[field["name"]][name] = outputfile
+
+ outputfile = os.path.join(outputdir,
+ common.filename_encode(server_type),
+ common.filename_encode(month),
+ common.filename_encode(filename),
+ "index.html")
+ generate_file_stats(outputfile, month, baseURL + filename, overview_url,
+ data, filtered_urls=filtered_urls)
+
+ if filename not in filedata:
+ filedata[filename] = {}
+ month_url = (common.filename_encode(month) + "/" +
Sebastian Noack 2013/08/26 16:05:22 You should use os.path.join() here as well.
Wladimir Palant 2013/08/27 07:34:28 No, definitely not going to use os.path.join() for
Wladimir Palant 2013/08/27 11:59:47 Reply by Sebastian: Oh, didn't notice, it was an u
+ common.filename_encode(filename) + "/" +
+ "index.html")
+ filedata[filename][month] = {"url": month_url, "hits": data["hits"], "bandwidth": data["bandwidth"]}
+
+ monthdata = {}
+ for filename, data in filedata.iteritems():
+ outputfile = os.path.join(outputdir,
+ common.filename_encode(server_type),
+ "overview-" + common.filename_encode(filename + ".html"))
+ generate_file_overview(outputfile, baseURL + filename, data)
+
+ if current_month in data:
+ monthdata[filename] = dict(data[current_month])
+
+ outputfile = os.path.join(outputdir, common.filename_encode(server_type), "index.html")
+ generate_main_page(outputfile, current_month, baseURL, monthdata)
if __name__ == '__main__':
setupStderr()
- data = SafeConfigParser()
- data.read(get_config().get('subscriptionStats', 'mainFile'))
-
- outputDir = get_config().get('subscriptionStats', 'outputDirectory')
- if not os.path.exists(outputDir):
- os.makedirs(outputDir)
- generateMainPage(data, outputDir)
- subscriptions = generateSubscriptionPages(data, outputDir)
- for fileName in subscriptions.iterkeys():
- generateOverviewPage(data, outputDir, fileName)
+ datadir = get_config().get("stats", "dataDirectory")
+ outputdir = get_config().get("stats", "outputDirectory")
+ generate_pages(datadir, outputdir)

Powered by Google App Engine
This is Rietveld