| Index: sitescripts/stats/bin/pagegenerator.py |
| =================================================================== |
| rename from sitescripts/logs/bin/generateStatsPages.py |
| rename to sitescripts/stats/bin/pagegenerator.py |
| --- a/sitescripts/logs/bin/generateStatsPages.py |
| +++ b/sitescripts/stats/bin/pagegenerator.py |
| @@ -10,136 +10,156 @@ |
| # Adblock Plus is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| # GNU General Public License for more details. |
| # |
| # You should have received a copy of the GNU General Public License |
| # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
| -import os, re, time |
| -from datetime import date, timedelta |
| -from sitescripts.utils import get_config, setupStderr, get_template |
| -from sitescripts.logs.countryCodes import countryCodes |
| -from ConfigParser import SafeConfigParser |
| +import os, re, codecs, simplejson, time, itertools |
| +from datetime import date |
| +from sitescripts.utils import get_config, setupStderr, get_custom_template_environment, cached |
| +import sitescripts.stats.common as common |
| +from sitescripts.stats.countrycodes import countrycodes |
| -def getSubscriptionFiles(data, month): |
| - result = {} |
| - if data.has_section(month): |
| - for option in data.options(month): |
| - result[option[0:option.index(' ')]] = True |
| - return result |
| +@cached(()) |
|
Sebastian Noack
2013/08/26 16:05:22
You passed an empty tuple as timeout. Obviously yo
Wladimir Palant
2013/08/27 07:34:28
This behavior is somewhat specified as described h
|
| +def get_template_environment(): |
| + return get_custom_template_environment({ |
| + "monthname": lambda value: date(int(value[0:4]), int(value[4:]), 1).strftime("%b %Y"), |
| + "weekday": lambda value: ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"][int(value)], |
| + "countryname": lambda value: countrycodes.get(value, "Unknown"), |
| + "sortfield": lambda value, field: (field["sort"] if "sort" in field else default_sort)(value), |
| + "maxhits": lambda items: max(itertools.chain((value["hits"] for key, value in items), [1])), |
| + "maxbandwidth": lambda items: max(itertools.chain((value["bandwidth"] for key, value in items), [1])), |
| + "sumhits": lambda items: max(sum(value["hits"] for key, value in items), 1), |
| + "sumbandwidth": lambda items: max(sum(value["bandwidth"] for key, value in items), 1), |
| + "isspecial": lambda name, field: field["isspecial"](name) if "isspecial" in field else False, |
| + }) |
| -def generateMainPage(data, outputDir): |
| - def getDataInt(month, key): |
| - if data.has_option(month, key): |
| - return data.getint(month, key) |
| - else: |
| - return 0 |
| +@cached(()) |
| +def get_main_page_template(): |
| + return get_template_environment().get_template(get_config().get("stats", "mainPageTemplate")) |
| - month = date.today().strftime('%Y%m') |
| - subscriptions = [] |
| - for fileName in getSubscriptionFiles(data, month).iterkeys(): |
| - subscriptions.append({ |
| - 'fileName': fileName, |
| - 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month), |
| - 'hits': getDataInt(month, '%s hits' % fileName), |
| - 'bandwidth': getDataInt(month, '%s bandwidth' % fileName) |
| - }) |
| - subscriptions = sorted(subscriptions, key=lambda s: s['hits'], reverse=True) |
| +@cached(()) |
| +def get_file_stats_template(): |
| + return get_template_environment().get_template(get_config().get("stats", "filePageTemplate")) |
| - file = os.path.join(outputDir, 'index.html') |
| - template = get_template(get_config().get('subscriptionStats', 'mainPageTemplate')) |
| - template.stream({'now': time.time(), 'month': month, 'subscriptions': subscriptions}).dump(file) |
| +@cached(()) |
| +def get_file_overview_template(): |
| + return get_template_environment().get_template(get_config().get("stats", "fileOverviewTemplate")) |
| -def generateSubscriptionPages(data, outputDir): |
| - existingSubscriptions = {} |
| - template = get_template(get_config().get('subscriptionStats', 'subscriptionPageTemplate')) |
| - for month in data.sections(): |
| - subscriptions = {} |
| - for option in data.options(month): |
| - spaceIndex = option.index(' ') |
| - if spaceIndex < 0: |
| - continue |
| - fileName, key = option[0:spaceIndex], option[spaceIndex+1:] |
| - existingSubscriptions[fileName] = True |
| - if not fileName in subscriptions: |
| - subscriptions[fileName] = { |
| - 'now': time.time(), |
| - 'month': month, |
| - 'daysInMonth': (date(int(month[0:4]), int(month[4:]), 1) - timedelta(days=1)).day, |
| - 'currentMonth': month == date.today().strftime('%Y%m'), |
| - 'fileName': fileName, |
| - 'overviewURL': 'overview_%s.html' % re.sub(r'\W', '_', fileName), |
| - 'hits': 0, |
| - 'bandwidth': 0, |
| - 'day': {}, |
| - 'weekday': [{'id': i, 'hits': 0, 'bandwidth': 0, 'count': 0}for i in range(7)], |
| - 'hour': {}, |
| - 'country': {}, |
| - 'app': {}, |
| - 'mirror': {}, |
| - } |
| - if key == 'hits' or key == 'bandwidth': |
| - subscriptions[fileName][key] = data.getint(month, option) |
| - else: |
| - match = re.search(r'^(hits|bandwidth) (day|hour|country|app|mirror) (.*)$', key) |
| - if match: |
| - if not match.group(3) in subscriptions[fileName][match.group(2)]: |
| - subscriptions[fileName][match.group(2)][match.group(3)] = { |
| - 'id': match.group(3), |
| - 'hits': 0, |
| - 'bandwidth': 0, |
| - } |
| - if match.group(2) == 'day': |
| - subscriptions[fileName][match.group(2)][match.group(3)]['weekday'] = date(int(month[0:4]), int(month[4:]), int(match.group(3))).weekday() |
| - if match.group(2) == 'country': |
| - if match.group(3) in countryCodes: |
| - subscriptions[fileName][match.group(2)][match.group(3)]['name'] = countryCodes[match.group(3)] |
| - subscriptions[fileName][match.group(2)][match.group(3)]['image'] = match.group(3) |
| - else: |
| - subscriptions[fileName][match.group(2)][match.group(3)]['name'] = 'Unknown' |
| - subscriptions[fileName][match.group(2)][match.group(3)]['image'] = 'ip' |
| - subscriptions[fileName][match.group(2)][match.group(3)][match.group(1)] = data.getint(month, option) |
| +def default_sort(obj): |
| + return sorted(obj.items(), key=lambda (k,v): v["hits"], reverse=True) |
| - for subscription in subscriptions.itervalues(): |
| - for key in ('day', 'hour'): |
| - subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: int(s['id'])) |
| - for key in ('country', 'app', 'mirror'): |
| - subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: s['hits'], reverse=True) |
| - for dayInfo in subscription['day']: |
| - weekdayInfo = subscription['weekday'][dayInfo['weekday']] |
| - weekdayInfo['hits'] = (weekdayInfo['hits'] * weekdayInfo['count'] + dayInfo['hits']) / (weekdayInfo['count'] + 1) |
| - weekdayInfo['bandwidth'] = (weekdayInfo['bandwidth'] * weekdayInfo['count'] + dayInfo['bandwidth']) / (weekdayInfo['count'] + 1) |
| - weekdayInfo['count'] += 1 |
| - fileName = 'subscription_%s_%s.html' % (re.sub(r'\W', '_', subscription['fileName']), month) |
| - template.stream(subscription).dump(os.path.join(outputDir, fileName)) |
| - return existingSubscriptions |
| +def ensure_dir(path): |
|
Sebastian Noack
2013/08/26 16:05:22
Instead of checking if the dir exists and creating
|
| + dir = os.path.dirname(path) |
| + if not os.path.exists(dir): |
| + os.makedirs(dir) |
|
Wladimir Palant
2013/08/27 07:34:28
Fixed. However, this is a common pattern in our co
|
| -def generateOverviewPage(data, outputDir, fileName): |
| - months = [] |
| - for month in data.sections(): |
| - if data.has_option(month, '%s hits' % fileName) and data.has_option(month, '%s bandwidth' % fileName): |
| - months.append({ |
| - 'id': month, |
| - 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month), |
| - 'hits': data.getint(month, '%s hits' % fileName), |
| - 'bandwidth': data.getint(month, '%s bandwidth' % fileName), |
| - }) |
| - months = sorted(months, key=lambda m: m['id']) |
| +def generate_main_page(outputfile, month, url, data): |
| + ensure_dir(outputfile) |
| + get_main_page_template().stream({ |
| + "now": time.time(), |
| + "month": month, |
| + "url": url, |
| + "data": data, |
| + }).dump(outputfile) |
| - file = os.path.join(outputDir, 'overview_%s.html' % re.sub(r'\W', '_', fileName)) |
| - template = get_template(get_config().get('subscriptionStats', 'subscriptionOverviewTemplate')) |
| - template.stream({'now': time.time(), 'fileName': fileName, 'month': months}).dump(file) |
| +def generate_file_stats(outputfile, month, url, overview_url, data, filter=None, filtered_urls={}): |
| + ensure_dir(outputfile) |
| + get_file_stats_template().stream({ |
| + "now": time.time(), |
| + "month": month, |
| + "url": url, |
| + "overview_url": overview_url, |
| + "data": data, |
| + "fields": common.fields, |
| + "filter": filter, |
| + "filtered_urls": filtered_urls |
| + }).dump(outputfile) |
| + |
| +def generate_file_overview(outputfile, url, data): |
| + ensure_dir(outputfile) |
| + get_file_overview_template().stream({ |
| + "now": time.time(), |
| + "url": url, |
| + "data": data |
| + }).dump(outputfile) |
| + |
| +def get_names(dir, needdirectories): |
| + for file in os.listdir(dir): |
| + path = os.path.join(dir, file) |
| + if (needdirectories and os.path.isdir(path)) or (not needdirectories and os.path.isfile(path)): |
| + yield common.filename_decode(file), path |
| + |
| +def generate_pages(datadir, outputdir): |
| + for server_type, server_type_dir in get_names(datadir, True): |
| + baseURL = get_config().get("stats", "baseURL_" + server_type) |
| + filedata = {} |
| + current_month = None |
| + for month, month_dir in get_names(server_type_dir, True): |
| + if current_month == None or month > current_month: |
| + current_month = month |
| + |
| + for filename, path in get_names(month_dir, False): |
| + filename = re.sub(r"\.json$", "", filename) |
| + with codecs.open(path, "rb", encoding="utf-8") as file: |
| + data = simplejson.load(file) |
| + |
| + overview_url = "../../overview-" + common.filename_encode(filename + ".html") |
| + filtered_urls = {} |
| + for field in common.fields: |
| + if field["name"] not in data: |
| + continue |
| + # Create filtered views for the first thirty values of a field if they |
| + # have filtered data. |
| + for name, value in get_template_environment().filters["sortfield"](data[field["name"]], field)[0:30]: |
| + if filter(lambda k: k not in ("hits", "bandwidth"), value.keys()): |
|
Sebastian Noack
2013/08/26 16:05:22
No need to create a new list with the keys first.
Wladimir Palant
2013/08/27 07:34:28
I'd rather use value.iterkeys() here - it's obvious
Wladimir Palant
2013/08/27 11:59:47
Reply by Sebastian:
It does the same, but isn't a
Wladimir Palant
2013/08/27 12:42:01
See my reply - I already know that. But I prefer i
|
| + outputfile = os.path.join(outputdir, |
| + common.filename_encode(server_type), |
| + common.filename_encode(month), |
| + common.filename_encode(filename), |
| + "filtered-%s-%s.html" % ( |
| + common.filename_encode(field["name"]), |
| + common.filename_encode(name), |
| + )) |
| + generate_file_stats(outputfile, month, baseURL + filename, overview_url, |
| + value, filter={"field": field, "value": name}) |
| + |
| + if not field["name"] in filtered_urls: |
| + filtered_urls[field["name"]] = {} |
| + filtered_urls[field["name"]][name] = outputfile |
| + |
| + outputfile = os.path.join(outputdir, |
| + common.filename_encode(server_type), |
| + common.filename_encode(month), |
| + common.filename_encode(filename), |
| + "index.html") |
| + generate_file_stats(outputfile, month, baseURL + filename, overview_url, |
| + data, filtered_urls=filtered_urls) |
| + |
| + if filename not in filedata: |
| + filedata[filename] = {} |
| + month_url = (common.filename_encode(month) + "/" + |
|
Sebastian Noack
2013/08/26 16:05:22
You should use os.path.join() here as well.
Wladimir Palant
2013/08/27 07:34:28
No, definitely not going to use os.path.join() for
Wladimir Palant
2013/08/27 11:59:47
Reply by Sebastian:
Oh, I didn't notice it was an u
|
| + common.filename_encode(filename) + "/" + |
| + "index.html") |
| + filedata[filename][month] = {"url": month_url, "hits": data["hits"], "bandwidth": data["bandwidth"]} |
| + |
| + monthdata = {} |
| + for filename, data in filedata.iteritems(): |
| + outputfile = os.path.join(outputdir, |
| + common.filename_encode(server_type), |
| + "overview-" + common.filename_encode(filename + ".html")) |
| + generate_file_overview(outputfile, baseURL + filename, data) |
| + |
| + if current_month in data: |
| + monthdata[filename] = dict(data[current_month]) |
| + |
| + outputfile = os.path.join(outputdir, common.filename_encode(server_type), "index.html") |
| + generate_main_page(outputfile, current_month, baseURL, monthdata) |
| if __name__ == '__main__': |
| setupStderr() |
| - data = SafeConfigParser() |
| - data.read(get_config().get('subscriptionStats', 'mainFile')) |
| - |
| - outputDir = get_config().get('subscriptionStats', 'outputDirectory') |
| - if not os.path.exists(outputDir): |
| - os.makedirs(outputDir) |
| - generateMainPage(data, outputDir) |
| - subscriptions = generateSubscriptionPages(data, outputDir) |
| - for fileName in subscriptions.iterkeys(): |
| - generateOverviewPage(data, outputDir, fileName) |
| + datadir = get_config().get("stats", "dataDirectory") |
| + outputdir = get_config().get("stats", "outputDirectory") |
| + generate_pages(datadir, outputdir) |