sitescripts/stats/bin/pagegenerator.py - Issue 11481051: Update stats processing

Unified Diff: sitescripts/stats/bin/pagegenerator.py

Issue 11481051: Update stats processing (Closed)

Patch Set: Fixed two presentation issues (created Aug. 24, 2013, 1:11 p.m.)

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View side-by-side diff with in-line comments

Index: sitescripts/stats/bin/pagegenerator.py

===================================================================

rename from sitescripts/logs/bin/generateStatsPages.py

rename to sitescripts/stats/bin/pagegenerator.py

--- a/sitescripts/logs/bin/generateStatsPages.py

+++ b/sitescripts/stats/bin/pagegenerator.py

@@ -10,136 +10,156 @@

# Adblock Plus is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

-import os, re, time

-from datetime import date, timedelta

-from sitescripts.utils import get_config, setupStderr, get_template

-from sitescripts.logs.countryCodes import countryCodes

-from ConfigParser import SafeConfigParser

+import os, re, codecs, simplejson, time, itertools

+from datetime import date

+from sitescripts.utils import get_config, setupStderr, get_custom_template_environment, cached

+import sitescripts.stats.common as common

+from sitescripts.stats.countrycodes import countrycodes

-def getSubscriptionFiles(data, month):

- result = {}

- if data.has_section(month):

- for option in data.options(month):

- result[option[0:option.index(' ')]] = True

- return result

+@cached(())

Sebastian Noack 2013/08/26 16:05:22 You passed an empty tuple as timeout. Obviously yo

Wladimir Palant 2013/08/27 07:34:28 This behavior is somewhat specified as described h

+def get_template_environment():

+ return get_custom_template_environment({

+ "monthname": lambda value: date(int(value[0:4]), int(value[4:]), 1).strftime("%b %Y"),

+ "weekday": lambda value: ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"][int(value)],

+ "countryname": lambda value: countrycodes.get(value, "Unknown"),

+ "sortfield": lambda value, field: (field["sort"] if "sort" in field else default_sort)(value),

+ "maxhits": lambda items: max(itertools.chain((value["hits"] for key, value in items), [1])),

+ "maxbandwidth": lambda items: max(itertools.chain((value["bandwidth"] for key, value in items), [1])),

+ "sumhits": lambda items: max(sum(value["hits"] for key, value in items), 1),

+ "sumbandwidth": lambda items: max(sum(value["bandwidth"] for key, value in items), 1),

+ "isspecial": lambda name, field: field["isspecial"](name) if "isspecial" in field else False,

+ })

-def generateMainPage(data, outputDir):

- def getDataInt(month, key):

- if data.has_option(month, key):

- return data.getint(month, key)

- else:

- return 0

+@cached(())

+def get_main_page_template():

+ return get_template_environment().get_template(get_config().get("stats", "mainPageTemplate"))

- month = date.today().strftime('%Y%m')

- subscriptions = []

- for fileName in getSubscriptionFiles(data, month).iterkeys():

- subscriptions.append({

- 'fileName': fileName,

- 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month),

- 'hits': getDataInt(month, '%s hits' % fileName),

- 'bandwidth': getDataInt(month, '%s bandwidth' % fileName)

- })

- subscriptions = sorted(subscriptions, key=lambda s: s['hits'], reverse=True)

+@cached(())

+def get_file_stats_template():

+ return get_template_environment().get_template(get_config().get("stats", "filePageTemplate"))

- file = os.path.join(outputDir, 'index.html')

- template = get_template(get_config().get('subscriptionStats', 'mainPageTemplate'))

- template.stream({'now': time.time(), 'month': month, 'subscriptions': subscriptions}).dump(file)

+@cached(())

+def get_file_overview_template():

+ return get_template_environment().get_template(get_config().get("stats", "fileOverviewTemplate"))

-def generateSubscriptionPages(data, outputDir):

- existingSubscriptions = {}

- template = get_template(get_config().get('subscriptionStats', 'subscriptionPageTemplate'))

- for month in data.sections():

- subscriptions = {}

- for option in data.options(month):

- spaceIndex = option.index(' ')

- if spaceIndex < 0:

- continue

- fileName, key = option[0:spaceIndex], option[spaceIndex+1:]

- existingSubscriptions[fileName] = True

- if not fileName in subscriptions:

- subscriptions[fileName] = {

- 'now': time.time(),

- 'month': month,

- 'daysInMonth': (date(int(month[0:4]), int(month[4:]), 1) - timedelta(days=1)).day,

- 'currentMonth': month == date.today().strftime('%Y%m'),

- 'fileName': fileName,

- 'overviewURL': 'overview_%s.html' % re.sub(r'\W', '_', fileName),

- 'hits': 0,

- 'bandwidth': 0,

- 'day': {},

- 'weekday': [{'id': i, 'hits': 0, 'bandwidth': 0, 'count': 0}for i in range(7)],

- 'hour': {},

- 'country': {},

- 'app': {},

- 'mirror': {},

- }

- if key == 'hits' or key == 'bandwidth':

- subscriptions[fileName][key] = data.getint(month, option)

- else:

- if match:

- if not match.group(3) in subscriptions[fileName][match.group(2)]:

- subscriptions[fileName][match.group(2)][match.group(3)] = {

- 'id': match.group(3),

- 'hits': 0,

- 'bandwidth': 0,

- }

- if match.group(2) == 'day':

- subscriptions[fileName][match.group(2)][match.group(3)]['weekday'] = date(int(month[0:4]), int(month[4:]), int(match.group(3))).weekday()

- if match.group(2) == 'country':

- if match.group(3) in countryCodes:

- subscriptions[fileName][match.group(2)][match.group(3)]['name'] = countryCodes[match.group(3)]

- subscriptions[fileName][match.group(2)][match.group(3)]['image'] = match.group(3)

- else:

- subscriptions[fileName][match.group(2)][match.group(3)]['name'] = 'Unknown'

- subscriptions[fileName][match.group(2)][match.group(3)]['image'] = 'ip'

- subscriptions[fileName][match.group(2)][match.group(3)][match.group(1)] = data.getint(month, option)

+def default_sort(obj):

+ return sorted(obj.items(), key=lambda (k,v): v["hits"], reverse=True)

- for subscription in subscriptions.itervalues():

- for key in ('day', 'hour'):

- subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: int(s['id']))

- for key in ('country', 'app', 'mirror'):

- subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: s['hits'], reverse=True)

- for dayInfo in subscription['day']:

- weekdayInfo = subscription['weekday'][dayInfo['weekday']]

- weekdayInfo['hits'] = (weekdayInfo['hits'] * weekdayInfo['count'] + dayInfo['hits']) / (weekdayInfo['count'] + 1)

- weekdayInfo['bandwidth'] = (weekdayInfo['bandwidth'] * weekdayInfo['count'] + dayInfo['bandwidth']) / (weekdayInfo['count'] + 1)

- weekdayInfo['count'] += 1

- fileName = 'subscription_%s_%s.html' % (re.sub(r'\W', '_', subscription['fileName']), month)

- template.stream(subscription).dump(os.path.join(outputDir, fileName))

- return existingSubscriptions

+def ensure_dir(path):

Sebastian Noack 2013/08/26 16:05:22 Instead of checking if the dir exist and creating

+ dir = os.path.dirname(path)

+ if not os.path.exists(dir):

+ os.makedirs(dir)

Wladimir Palant 2013/08/27 07:34:28 Fixed. However, this is a common pattern in our co

-def generateOverviewPage(data, outputDir, fileName):

- months = []

- for month in data.sections():

- if data.has_option(month, '%s hits' % fileName) and data.has_option(month, '%s bandwidth' % fileName):

- months.append({

- 'id': month,

- 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month),

- 'hits': data.getint(month, '%s hits' % fileName),

- 'bandwidth': data.getint(month, '%s bandwidth' % fileName),

- })

- months = sorted(months, key=lambda m: m['id'])

+def generate_main_page(outputfile, month, url, data):

+ ensure_dir(outputfile)

+ get_main_page_template().stream({

+ "now": time.time(),

+ "month": month,

+ "url": url,

+ "data": data,

+ }).dump(outputfile)

- file = os.path.join(outputDir, 'overview_%s.html' % re.sub(r'\W', '_', fileName))

- template = get_template(get_config().get('subscriptionStats', 'subscriptionOverviewTemplate'))

- template.stream({'now': time.time(), 'fileName': fileName, 'month': months}).dump(file)

+def generate_file_stats(outputfile, month, url, overview_url, data, filter=None, filtered_urls={}):

+ ensure_dir(outputfile)

+ get_file_stats_template().stream({

+ "now": time.time(),

+ "month": month,

+ "url": url,

+ "overview_url": overview_url,

+ "data": data,

+ "fields": common.fields,

+ "filter": filter,

+ "filtered_urls": filtered_urls

+ }).dump(outputfile)

+def generate_file_overview(outputfile, url, data):

+ ensure_dir(outputfile)

+ get_file_overview_template().stream({

+ "now": time.time(),

+ "url": url,

+ "data": data

+ }).dump(outputfile)

+def get_names(dir, needdirectories):

+ for file in os.listdir(dir):

+ path = os.path.join(dir, file)

+ if (needdirectories and os.path.isdir(path)) or (not needdirectories and os.path.isfile(path)):

+ yield common.filename_decode(file), path

+def generate_pages(datadir, outputdir):

+ for server_type, server_type_dir in get_names(datadir, True):

+ baseURL = get_config().get("stats", "baseURL_" + server_type)

+ filedata = {}

+ current_month = None

+ for month, month_dir in get_names(server_type_dir, True):

+ if current_month == None or month > current_month:

+ current_month = month

+ for filename, path in get_names(month_dir, False):

+ filename = re.sub(r"\.json$", "", filename)

+ with codecs.open(path, "rb", encoding="utf-8") as file:

+ data = simplejson.load(file)

+ overview_url = "../../overview-" + common.filename_encode(filename + ".html")

+ filtered_urls = {}

+ for field in common.fields:

+ if field["name"] not in data:

+ continue

+ # Create filtered views for the first thirty values of a field if they

+ # have filtered data.

+ for name, value in get_template_environment().filters["sortfield"](data[field["name"]], field)[0:30]:

+ if filter(lambda k: k not in ("hits", "bandwidth"), value.keys()):

Sebastian Noack 2013/08/26 16:05:22 No need to create a new list with the keys first.

Wladimir Palant 2013/08/27 07:34:28 I rather use value.iterkeys() here - it's obvious

Wladimir Palant 2013/08/27 11:59:47 Reply by Sebastian: It does the same, but isn't a

Wladimir Palant 2013/08/27 12:42:01 See my reply - I already know that. But I prefer i

+ outputfile = os.path.join(outputdir,

+ common.filename_encode(server_type),

+ common.filename_encode(month),

+ common.filename_encode(filename),

+ "filtered-%s-%s.html" % (

+ common.filename_encode(field["name"]),

+ common.filename_encode(name),

+ ))

+ generate_file_stats(outputfile, month, baseURL + filename, overview_url,

+ value, filter={"field": field, "value": name})

+ if not field["name"] in filtered_urls:

+ filtered_urls[field["name"]] = {}

+ filtered_urls[field["name"]][name] = outputfile

+ outputfile = os.path.join(outputdir,

+ common.filename_encode(server_type),

+ common.filename_encode(month),

+ common.filename_encode(filename),

+ "index.html")

+ generate_file_stats(outputfile, month, baseURL + filename, overview_url,

+ data, filtered_urls=filtered_urls)

+ if filename not in filedata:

+ filedata[filename] = {}

+ month_url = (common.filename_encode(month) + "/" +

Sebastian Noack 2013/08/26 16:05:22 You should use os.path.join() here as well.

Wladimir Palant 2013/08/27 07:34:28 No, definitely not going to use os.path.join() for

Wladimir Palant 2013/08/27 11:59:47 Reply by Sebastian: Oh, I didn't notice, it was an u…

+ common.filename_encode(filename) + "/" +

+ "index.html")

+ filedata[filename][month] = {"url": month_url, "hits": data["hits"], "bandwidth": data["bandwidth"]}

+ monthdata = {}

+ for filename, data in filedata.iteritems():

+ outputfile = os.path.join(outputdir,

+ common.filename_encode(server_type),

+ "overview-" + common.filename_encode(filename + ".html"))

+ generate_file_overview(outputfile, baseURL + filename, data)

+ if current_month in data:

+ monthdata[filename] = dict(data[current_month])

+ outputfile = os.path.join(outputdir, common.filename_encode(server_type), "index.html")

+ generate_main_page(outputfile, current_month, baseURL, monthdata)

if __name__ == '__main__':

setupStderr()

- data = SafeConfigParser()

- data.read(get_config().get('subscriptionStats', 'mainFile'))

- outputDir = get_config().get('subscriptionStats', 'outputDirectory')

- if not os.path.exists(outputDir):

- os.makedirs(outputDir)

- generateMainPage(data, outputDir)

- subscriptions = generateSubscriptionPages(data, outputDir)

- for fileName in subscriptions.iterkeys():

- generateOverviewPage(data, outputDir, fileName)

+ datadir = get_config().get("stats", "dataDirectory")

+ outputdir = get_config().get("stats", "outputDirectory")

+ generate_pages(datadir, outputdir)

« Previous file with comments: sitescripts/stats/bin/logprocessor.py ('K') | « Previous file: sitescripts/stats/bin/logprocessor.py ('k') | Next file: sitescripts/stats/common.py » ('j') | Next file with comments: sitescripts/utils.py » ('J')