Index: sitescripts/stats/bin/pagegenerator.py |
=================================================================== |
rename from sitescripts/logs/bin/generateStatsPages.py |
rename to sitescripts/stats/bin/pagegenerator.py |
--- a/sitescripts/logs/bin/generateStatsPages.py |
+++ b/sitescripts/stats/bin/pagegenerator.py |
@@ -10,136 +10,155 @@ |
# Adblock Plus is distributed in the hope that it will be useful, |
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
# GNU General Public License for more details. |
# |
# You should have received a copy of the GNU General Public License |
# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. |
-import os, re, time |
-from datetime import date, timedelta |
-from sitescripts.utils import get_config, setupStderr, get_template |
-from sitescripts.logs.countryCodes import countryCodes |
-from ConfigParser import SafeConfigParser |
+import os, re, codecs, simplejson, time, itertools |
+from datetime import date |
+from sitescripts.utils import get_config, setupStderr, get_custom_template_environment, cached |
+import sitescripts.stats.common as common |
+from sitescripts.stats.countrycodes import countrycodes |
-def getSubscriptionFiles(data, month): |
- result = {} |
- if data.has_section(month): |
- for option in data.options(month): |
- result[option[0:option.index(' ')]] = True |
- return result |
+@cached(()) |
+def get_template_environment(): |
+    """Build the template environment with the filters used by the stats templates. |
+ |
+    The filter mapping is handed to get_custom_template_environment(). |
+    """ |
+    def monthname(value): |
+        # value is sliced as a four-character year followed by the month number. |
+        year, month = int(value[0:4]), int(value[4:]) |
+        return date(year, month, 1).strftime("%b %Y") |
+ |
+    def countryname(value): |
+        return countrycodes.get(value, "Unknown") |
+ |
+    def sortfield(value, field): |
+        # A field may supply its own "sort" callable, default_sort is the fallback. |
+        if "sort" in field: |
+            sorter = field["sort"] |
+        else: |
+            sorter = default_sort |
+        return sorter(value) |
+ |
+    def maxhits(items): |
+        # The trailing 1 keeps the result from dropping below one. |
+        return max(itertools.chain((entry["hits"] for _, entry in items), [1])) |
+ |
+    def maxbandwidth(items): |
+        return max(itertools.chain((entry["bandwidth"] for _, entry in items), [1])) |
+ |
+    def sumhits(items): |
+        total = sum(entry["hits"] for _, entry in items) |
+        return max(total, 1) |
+ |
+    def sumbandwidth(items): |
+        total = sum(entry["bandwidth"] for _, entry in items) |
+        return max(total, 1) |
+ |
+    def isspecial(name, field): |
+        if "isspecial" in field: |
+            return field["isspecial"](name) |
+        return False |
+ |
+    return get_custom_template_environment({ |
+        "monthname": monthname, |
+        "countryname": countryname, |
+        "sortfield": sortfield, |
+        "maxhits": maxhits, |
+        "maxbandwidth": maxbandwidth, |
+        "sumhits": sumhits, |
+        "sumbandwidth": sumbandwidth, |
+        "isspecial": isspecial, |
+    }) |
-def generateMainPage(data, outputDir): |
- def getDataInt(month, key): |
- if data.has_option(month, key): |
- return data.getint(month, key) |
- else: |
- return 0 |
+@cached(()) |
+def get_main_page_template(): |
+    """Load the template named by the "mainPageTemplate" option of the "stats" config section.""" |
+    environment = get_template_environment() |
+    template_name = get_config().get("stats", "mainPageTemplate") |
+    return environment.get_template(template_name) |
- month = date.today().strftime('%Y%m') |
- subscriptions = [] |
- for fileName in getSubscriptionFiles(data, month).iterkeys(): |
- subscriptions.append({ |
- 'fileName': fileName, |
- 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month), |
- 'hits': getDataInt(month, '%s hits' % fileName), |
- 'bandwidth': getDataInt(month, '%s bandwidth' % fileName) |
- }) |
- subscriptions = sorted(subscriptions, key=lambda s: s['hits'], reverse=True) |
+@cached(()) |
+def get_file_stats_template(): |
+    """Load the template named by the "filePageTemplate" option of the "stats" config section.""" |
+    environment = get_template_environment() |
+    template_name = get_config().get("stats", "filePageTemplate") |
+    return environment.get_template(template_name) |
- file = os.path.join(outputDir, 'index.html') |
- template = get_template(get_config().get('subscriptionStats', 'mainPageTemplate')) |
- template.stream({'now': time.time(), 'month': month, 'subscriptions': subscriptions}).dump(file) |
+@cached(()) |
+def get_file_overview_template(): |
+    """Load the template named by the "fileOverviewTemplate" option of the "stats" config section.""" |
+    environment = get_template_environment() |
+    template_name = get_config().get("stats", "fileOverviewTemplate") |
+    return environment.get_template(template_name) |
-def generateSubscriptionPages(data, outputDir): |
- existingSubscriptions = {} |
- template = get_template(get_config().get('subscriptionStats', 'subscriptionPageTemplate')) |
- for month in data.sections(): |
- subscriptions = {} |
- for option in data.options(month): |
- spaceIndex = option.index(' ') |
- if spaceIndex < 0: |
- continue |
- fileName, key = option[0:spaceIndex], option[spaceIndex+1:] |
- existingSubscriptions[fileName] = True |
- if not fileName in subscriptions: |
- subscriptions[fileName] = { |
- 'now': time.time(), |
- 'month': month, |
- 'daysInMonth': (date(int(month[0:4]), int(month[4:]), 1) - timedelta(days=1)).day, |
- 'currentMonth': month == date.today().strftime('%Y%m'), |
- 'fileName': fileName, |
- 'overviewURL': 'overview_%s.html' % re.sub(r'\W', '_', fileName), |
- 'hits': 0, |
- 'bandwidth': 0, |
- 'day': {}, |
- 'weekday': [{'id': i, 'hits': 0, 'bandwidth': 0, 'count': 0}for i in range(7)], |
- 'hour': {}, |
- 'country': {}, |
- 'app': {}, |
- 'mirror': {}, |
- } |
- if key == 'hits' or key == 'bandwidth': |
- subscriptions[fileName][key] = data.getint(month, option) |
- else: |
- match = re.search(r'^(hits|bandwidth) (day|hour|country|app|mirror) (.*)$', key) |
- if match: |
- if not match.group(3) in subscriptions[fileName][match.group(2)]: |
- subscriptions[fileName][match.group(2)][match.group(3)] = { |
- 'id': match.group(3), |
- 'hits': 0, |
- 'bandwidth': 0, |
- } |
- if match.group(2) == 'day': |
- subscriptions[fileName][match.group(2)][match.group(3)]['weekday'] = date(int(month[0:4]), int(month[4:]), int(match.group(3))).weekday() |
- if match.group(2) == 'country': |
- if match.group(3) in countryCodes: |
- subscriptions[fileName][match.group(2)][match.group(3)]['name'] = countryCodes[match.group(3)] |
- subscriptions[fileName][match.group(2)][match.group(3)]['image'] = match.group(3) |
- else: |
- subscriptions[fileName][match.group(2)][match.group(3)]['name'] = 'Unknown' |
- subscriptions[fileName][match.group(2)][match.group(3)]['image'] = 'ip' |
- subscriptions[fileName][match.group(2)][match.group(3)][match.group(1)] = data.getint(month, option) |
+def default_sort(obj): |
+    """Return the (key, value) pairs of obj ordered by value["hits"], highest first. |
+ |
+    The sort key indexes into each pair instead of unpacking it in the lambda |
+    signature, because tuple parameters are Python 2-only syntax. |
+    """ |
+    return sorted(obj.items(), key=lambda item: item[1]["hits"], reverse=True) |
- for subscription in subscriptions.itervalues(): |
- for key in ('day', 'hour'): |
- subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: int(s['id'])) |
- for key in ('country', 'app', 'mirror'): |
- subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: s['hits'], reverse=True) |
- for dayInfo in subscription['day']: |
- weekdayInfo = subscription['weekday'][dayInfo['weekday']] |
- weekdayInfo['hits'] = (weekdayInfo['hits'] * weekdayInfo['count'] + dayInfo['hits']) / (weekdayInfo['count'] + 1) |
- weekdayInfo['bandwidth'] = (weekdayInfo['bandwidth'] * weekdayInfo['count'] + dayInfo['bandwidth']) / (weekdayInfo['count'] + 1) |
- weekdayInfo['count'] += 1 |
- fileName = 'subscription_%s_%s.html' % (re.sub(r'\W', '_', subscription['fileName']), month) |
- template.stream(subscription).dump(os.path.join(outputDir, fileName)) |
- return existingSubscriptions |
+def ensure_dir(path): |
+    """Create the directory that is to contain path if it does not exist yet. |
+ |
+    Nothing is done when path has no directory component. An OSError from |
+    os.makedirs() is re-raised unless the directory exists afterwards. |
+    """ |
+    parent = os.path.dirname(path) |
+    # An empty dirname means the current directory; os.makedirs("") would raise. |
+    if not parent or os.path.isdir(parent): |
+        return |
+    try: |
+        os.makedirs(parent) |
+    except OSError: |
+        # Another process may have created the directory in the meantime. |
+        if not os.path.isdir(parent): |
+            raise |
-def generateOverviewPage(data, outputDir, fileName): |
- months = [] |
- for month in data.sections(): |
- if data.has_option(month, '%s hits' % fileName) and data.has_option(month, '%s bandwidth' % fileName): |
- months.append({ |
- 'id': month, |
- 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month), |
- 'hits': data.getint(month, '%s hits' % fileName), |
- 'bandwidth': data.getint(month, '%s bandwidth' % fileName), |
- }) |
- months = sorted(months, key=lambda m: m['id']) |
+def generate_main_page(outputfile, month, url, data): |
+    """Render the main page template into outputfile, creating its directory first.""" |
+    ensure_dir(outputfile) |
+    template = get_main_page_template() |
+    context = {"now": time.time(), "month": month, "url": url, "data": data} |
+    template.stream(context).dump(outputfile) |
- file = os.path.join(outputDir, 'overview_%s.html' % re.sub(r'\W', '_', fileName)) |
- template = get_template(get_config().get('subscriptionStats', 'subscriptionOverviewTemplate')) |
- template.stream({'now': time.time(), 'fileName': fileName, 'month': months}).dump(file) |
+def generate_file_stats(outputfile, month, url, overview_url, data, filter=None, filtered_urls=None): |
+    """Render the file statistics template into outputfile. |
+ |
+    filter and filtered_urls are passed to the template unchanged; when |
+    filtered_urls is omitted the template receives a fresh empty dict, so |
+    no dict object is shared between calls. |
+    """ |
+    if filtered_urls is None: |
+        filtered_urls = {} |
+    ensure_dir(outputfile) |
+    get_file_stats_template().stream({ |
+        "now": time.time(), |
+        "month": month, |
+        "url": url, |
+        "overview_url": overview_url, |
+        "data": data, |
+        "fields": common.fields, |
+        "filter": filter, |
+        "filtered_urls": filtered_urls, |
+    }).dump(outputfile) |
+ |
+def generate_file_overview(outputfile, url, data): |
+    """Render the file overview template into outputfile, creating its directory first.""" |
+    ensure_dir(outputfile) |
+    template = get_file_overview_template() |
+    context = {"now": time.time(), "url": url, "data": data} |
+    template.stream(context).dump(outputfile) |
+ |
+def get_names(dir, needdirectories): |
+    """Yield (decoded name, path) pairs for the entries directly inside dir. |
+ |
+    Only directories are yielded when needdirectories is true, only regular |
+    files otherwise. Names are passed through common.filename_decode(). |
+    """ |
+    for entry in os.listdir(dir): |
+        full_path = os.path.join(dir, entry) |
+        if needdirectories: |
+            wanted = os.path.isdir(full_path) |
+        else: |
+            wanted = os.path.isfile(full_path) |
+        if wanted: |
+            yield common.filename_decode(entry), full_path |
+ |
+def _has_filtered_data(value): |
+    """Tell whether value has any key besides the "hits" and "bandwidth" totals.""" |
+    return any(key not in ("hits", "bandwidth") for key in value) |
+ |
+def generate_pages(datadir, outputdir): |
+    """Render all statistics pages for the data found below datadir. |
+ |
+    datadir is walked as <server type>/<month>/<data file>. Each data file is |
+    loaded as JSON and produces an index page plus filtered views; each file |
+    also gets an overview page and each server type a main page. All output |
+    is written below outputdir. |
+    """ |
+    for server_type, server_type_dir in get_names(datadir, True): |
+        baseURL = get_config().get("stats", "baseURL_" + server_type) |
+        server_outputdir = os.path.join(outputdir, common.filename_encode(server_type)) |
+        filedata = {} |
+        current_month = None |
+        for month, month_dir in get_names(server_type_dir, True): |
+            # The greatest month name seen is the one shown on the main page. |
+            if current_month is None or month > current_month: |
+                current_month = month |
+ |
+            for filename, path in get_names(month_dir, False): |
+                filename = re.sub(r"\.json$", "", filename) |
+                with codecs.open(path, "rb", encoding="utf-8") as handle: |
+                    data = simplejson.load(handle) |
+ |
+                file_outputdir = os.path.join(server_outputdir, |
+                                              common.filename_encode(month), |
+                                              common.filename_encode(filename)) |
+                overview_url = "../../overview-" + common.filename_encode(filename + ".html") |
+                filtered_urls = {} |
+                for field in common.fields: |
+                    if field["name"] not in data: |
+                        continue |
+                    # Create filtered views for the first thirty values of a field if they |
+                    # have filtered data. |
+                    sortfield = get_template_environment().filters["sortfield"] |
+                    for name, value in sortfield(data[field["name"]], field)[0:30]: |
+                        if not _has_filtered_data(value): |
+                            continue |
+                        outputfile = os.path.join(file_outputdir, |
+                                                  "filtered-%s-%s.html" % ( |
+                                                      common.filename_encode(field["name"]), |
+                                                      common.filename_encode(name), |
+                                                  )) |
+                        generate_file_stats(outputfile, month, baseURL + filename, overview_url, |
+                                            value, filter={"field": field, "value": name}) |
+ |
+                        # NOTE(review): this stores a filesystem path below outputdir under a |
+                        # name suggesting URLs - confirm what the template expects here. |
+                        filtered_urls.setdefault(field["name"], {})[name] = outputfile |
+ |
+                outputfile = os.path.join(file_outputdir, "index.html") |
+                generate_file_stats(outputfile, month, baseURL + filename, overview_url, |
+                                    data, filtered_urls=filtered_urls) |
+ |
+                month_url = (common.filename_encode(month) + "/" + |
+                             common.filename_encode(filename) + "/" + |
+                             "index.html") |
+                filedata.setdefault(filename, {})[month] = { |
+                    "url": month_url, |
+                    "hits": data["hits"], |
+                    "bandwidth": data["bandwidth"], |
+                } |
+ |
+        monthdata = {} |
+        for filename, data in filedata.iteritems(): |
+            outputfile = os.path.join(server_outputdir, |
+                                      "overview-" + common.filename_encode(filename + ".html")) |
+            generate_file_overview(outputfile, baseURL + filename, data) |
+ |
+            # Only files with data for the current month appear on the main page. |
+            if current_month in data: |
+                monthdata[filename] = dict(data[current_month]) |
+ |
+        outputfile = os.path.join(server_outputdir, "index.html") |
+        generate_main_page(outputfile, current_month, baseURL, monthdata) |
if __name__ == '__main__': |
setupStderr() |
- data = SafeConfigParser() |
- data.read(get_config().get('subscriptionStats', 'mainFile')) |
- |
- outputDir = get_config().get('subscriptionStats', 'outputDirectory') |
- if not os.path.exists(outputDir): |
- os.makedirs(outputDir) |
- generateMainPage(data, outputDir) |
- subscriptions = generateSubscriptionPages(data, outputDir) |
- for fileName in subscriptions.iterkeys(): |
- generateOverviewPage(data, outputDir, fileName) |
+ # Input and output directories both come from the "stats" configuration section. |
+ datadir = get_config().get("stats", "dataDirectory") |
+ outputdir = get_config().get("stats", "outputDirectory") |
+ generate_pages(datadir, outputdir) |