sitescripts/stats/bin/pagegenerator.py - Issue 11481051: Update stats processing

Side by Side Diff: sitescripts/stats/bin/pagegenerator.py

Issue 11481051: Update stats processing (Closed)

Patch Set: Fixed review issues Created Aug. 27, 2013, 7:29 a.m.

Left:
Right:

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff | Download patch

OLD	NEW
1 # coding: utf-8	1 # coding: utf-8

2	2

3 # This file is part of the Adblock Plus web scripts,	3 # This file is part of the Adblock Plus web scripts,

4 # Copyright (C) 2006-2013 Eyeo GmbH	4 # Copyright (C) 2006-2013 Eyeo GmbH

5 #	5 #

6 # Adblock Plus is free software: you can redistribute it and/or modify	6 # Adblock Plus is free software: you can redistribute it and/or modify

7 # it under the terms of the GNU General Public License version 3 as	7 # it under the terms of the GNU General Public License version 3 as

8 # published by the Free Software Foundation.	8 # published by the Free Software Foundation.

9 #	9 #

10 # Adblock Plus is distributed in the hope that it will be useful,	10 # Adblock Plus is distributed in the hope that it will be useful,

11 # but WITHOUT ANY WARRANTY; without even the implied warranty of	11 # but WITHOUT ANY WARRANTY; without even the implied warranty of

12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

13 # GNU General Public License for more details.	13 # GNU General Public License for more details.

14 #	14 #

15 # You should have received a copy of the GNU General Public License	15 # You should have received a copy of the GNU General Public License

16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.	16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.

17	17

18 import os, re, time	18 import os, re, codecs, simplejson, time, itertools

19 from datetime import date, timedelta	19 from datetime import date

20 from sitescripts.utils import get_config, setupStderr, get_template	20 from sitescripts.utils import get_config, setupStderr, get_custom_template_environment, cached

21 from sitescripts.logs.countryCodes import countryCodes	21 import sitescripts.stats.common as common

22 from ConfigParser import SafeConfigParser	22 from sitescripts.stats.countrycodes import countrycodes

23	23

24 def getSubscriptionFiles(data, month):	24 @cached(float("inf"))

25 result = {}	25 def get_template_environment():

26 if data.has_section(month):	26 return get_custom_template_environment({

27 for option in data.options(month):	27 "monthname": lambda value: date(int(value[0:4]), int(value[4:]), 1).strftime("%b %Y"),

28 result[option[0:option.index(' ')]] = True	28 "weekday": lambda value: ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"][int(value)],

29 return result	29 "countryname": lambda value: countrycodes.get(value, "Unknown"),

	30 "sortfield": lambda value, field: (field["sort"] if "sort" in field else default_sort)(value),

	31 "maxhits": lambda items: max(itertools.chain((value["hits"] for key, value in items), [1])),

	32 "maxbandwidth": lambda items: max(itertools.chain((value["bandwidth"] for key, value in items), [1])),

	33 "sumhits": lambda items: max(sum(value["hits"] for key, value in items), 1),

	34 "sumbandwidth": lambda items: max(sum(value["bandwidth"] for key, value in items), 1),

	35 "isspecial": lambda name, field: field["isspecial"](name) if "isspecial" in field else False,

	36 })

30	37

31 def generateMainPage(data, outputDir):	38 @cached(float("inf"))

32 def getDataInt(month, key):	39 def get_main_page_template():

33 if data.has_option(month, key):	40 return get_template_environment().get_template(get_config().get("stats", "mainPageTemplate"))

34 return data.getint(month, key)

35 else:

36 return 0

37	41

38 month = date.today().strftime('%Y%m')	42 @cached(float("inf"))

39 subscriptions = []	43 def get_file_stats_template():

40 for fileName in getSubscriptionFiles(data, month).iterkeys():	44 return get_template_environment().get_template(get_config().get("stats", "filePageTemplate"))

41 subscriptions.append({

42 'fileName': fileName,

43 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month),

44 'hits': getDataInt(month, '%s hits' % fileName),

45 'bandwidth': getDataInt(month, '%s bandwidth' % fileName)

46 })

47 subscriptions = sorted(subscriptions, key=lambda s: s['hits'], reverse=True)

48	45

49 file = os.path.join(outputDir, 'index.html')	46 @cached(float("inf"))

50 template = get_template(get_config().get('subscriptionStats', 'mainPageTemplate'))	47 def get_file_overview_template():

51 template.stream({'now': time.time(), 'month': month, 'subscriptions': subscriptions}).dump(file)	48 return get_template_environment().get_template(get_config().get("stats", "fileOverviewTemplate"))

52	49

53 def generateSubscriptionPages(data, outputDir):	50 def default_sort(obj):

54 existingSubscriptions = {}	51 return sorted(obj.items(), key=lambda (k,v): v["hits"], reverse=True)

55 template = get_template(get_config().get('subscriptionStats', 'subscriptionPageTemplate'))

56 for month in data.sections():

57 subscriptions = {}

58 for option in data.options(month):

59 spaceIndex = option.index(' ')

60 if spaceIndex < 0:

61 continue

62 fileName, key = option[0:spaceIndex], option[spaceIndex+1:]

63 existingSubscriptions[fileName] = True

64 if not fileName in subscriptions:

65 subscriptions[fileName] = {

66 'now': time.time(),

67 'month': month,

68 'daysInMonth': (date(int(month[0:4]), int(month[4:]), 1) - timedelta(days=1)).day,

69 'currentMonth': month == date.today().strftime('%Y%m'),

70 'fileName': fileName,

71 'overviewURL': 'overview_%s.html' % re.sub(r'\W', '_', fileName),

72 'hits': 0,

73 'bandwidth': 0,

74 'day': {},

75 'weekday': [{'id': i, 'hits': 0, 'bandwidth': 0, 'count': 0}for i in range(7)],

76 'hour': {},

77 'country': {},

78 'app': {},

79 'mirror': {},

80 }

81 if key == 'hits' or key == 'bandwidth':

82 subscriptions[fileName][key] = data.getint(month, option)

83 else:

84 match = re.search(r'^(hits|bandwidth) (day|hour|country|app|mirror) (.*)$', key)

85 if match:

86 if not match.group(3) in subscriptions[fileName][match.group(2)]:

87 subscriptions[fileName][match.group(2)][match.group(3)] = {

88 'id': match.group(3),

89 'hits': 0,

90 'bandwidth': 0,

91 }

92 if match.group(2) == 'day':

93 subscriptions[fileName][match.group(2)][match.group(3)]['weekday'] = date(int(month[0:4]), int(month[4:]), int(match.group(3))).weekday()

94 if match.group(2) == 'country':

95 if match.group(3) in countryCodes:

96 subscriptions[fileName][match.group(2)][match.group(3)]['name'] = countryCodes[match.group(3)]

97 subscriptions[fileName][match.group(2)][match.group(3)]['image'] = match.group(3)

98 else:

99 subscriptions[fileName][match.group(2)][match.group(3)]['name'] = 'Unknown'

100 subscriptions[fileName][match.group(2)][match.group(3)]['image'] = 'ip'

101 subscriptions[fileName][match.group(2)][match.group(3)][match.group(1)] = data.getint(month, option)

102	52

103 for subscription in subscriptions.itervalues():	53 def ensure_dir(path):

104 for key in ('day', 'hour'):	54 dir = os.path.dirname(path)

105 subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: int(s['id']))	55 try:

106 for key in ('country', 'app', 'mirror'):	56 os.makedirs(dir)

107 subscription[key] = sorted(subscription[key].itervalues(), key=lambda s: s['hits'], reverse=True)	57 except OSError:

108 for dayInfo in subscription['day']:	58 pass

109 weekdayInfo = subscription['weekday'][dayInfo['weekday']]

110 weekdayInfo['hits'] = (weekdayInfo['hits'] * weekdayInfo['count'] + dayInfo['hits']) / (weekdayInfo['count'] + 1)

111 weekdayInfo['bandwidth'] = (weekdayInfo['bandwidth'] * weekdayInfo['count'] + dayInfo['bandwidth']) / (weekdayInfo['count'] + 1)

112 weekdayInfo['count'] += 1

113 fileName = 'subscription_%s_%s.html' % (re.sub(r'\W', '_', subscription['fileName']), month)

114 template.stream(subscription).dump(os.path.join(outputDir, fileName))

115 return existingSubscriptions

116	59

117 def generateOverviewPage(data, outputDir, fileName):	60 def generate_main_page(outputfile, month, url, data):

118 months = []	61 ensure_dir(outputfile)

119 for month in data.sections():	62 get_main_page_template().stream({

120 if data.has_option(month, '%s hits' % fileName) and data.has_option(month, '%s bandwidth' % fileName):	63 "now": time.time(),

121 months.append({	64 "month": month,

122 'id': month,	65 "url": url,

123 'url': 'subscription_%s_%s.html' % (re.sub(r'\W', '_', fileName), month),	66 "data": data,

124 'hits': data.getint(month, '%s hits' % fileName),	67 }).dump(outputfile)

125 'bandwidth': data.getint(month, '%s bandwidth' % fileName),

126 })

127 months = sorted(months, key=lambda m: m['id'])

128	68

129 file = os.path.join(outputDir, 'overview_%s.html' % re.sub(r'\W', '_', fileName))	69 def generate_file_stats(outputfile, month, url, overview_url, data, filter=None, filtered_urls={}):

130 template = get_template(get_config().get('subscriptionStats', 'subscriptionOverviewTemplate'))	70 ensure_dir(outputfile)

131 template.stream({'now': time.time(), 'fileName': fileName, 'month': months}).dump(file)	71 get_file_stats_template().stream({

	72 "now": time.time(),

	73 "month": month,

	74 "url": url,

	75 "overview_url": overview_url,

	76 "data": data,

	77 "fields": common.fields,

	78 "filter": filter,

	79 "filtered_urls": filtered_urls

	80 }).dump(outputfile)

	81

	82 def generate_file_overview(outputfile, url, data):

	83 ensure_dir(outputfile)

	84 get_file_overview_template().stream({

	85 "now": time.time(),

	86 "url": url,

	87 "data": data

	88 }).dump(outputfile)

	89

	90 def get_names(dir, needdirectories):

	91 for file in os.listdir(dir):

	92 path = os.path.join(dir, file)

	93 if (needdirectories and os.path.isdir(path)) or (not needdirectories and os.path.isfile(path)):

	94 yield common.filename_decode(file), path

	95

	96 def generate_pages(datadir, outputdir):

	97 for server_type, server_type_dir in get_names(datadir, True):

	98 baseURL = get_config().get("stats", "baseURL_" + server_type)

	99 filedata = {}

	100 current_month = None

	101 for month, month_dir in get_names(server_type_dir, True):

	102 if current_month == None or month > current_month:

	103 current_month = month

	104

	105 for filename, path in get_names(month_dir, False):

	106 filename = re.sub(r"\.json$", "", filename)

	107 with codecs.open(path, "rb", encoding="utf-8") as file:

	108 data = simplejson.load(file)

	109

	110 overview_url = "../../overview-" + common.filename_encode(filename + ".html")

	111 filtered_urls = {}

	112 for field in common.fields:

	113 if field["name"] not in data:

	114 continue

	115 # Create filtered views for the first thirty values of a field if they

	116 # have filtered data.

	117 for name, value in get_template_environment().filters["sortfield"](data[field["name"]], field)[0:30]:

	118 if filter(lambda k: k not in ("hits", "bandwidth"), value.iterkeys()):

	119 outputfile = os.path.join(outputdir,

	120 common.filename_encode(server_type),

	121 common.filename_encode(month),

	122 common.filename_encode(filename),

	123 "filtered-%s-%s.html" % (

	124 common.filename_encode(field["name"]),

	125 common.filename_encode(name),

	126 ))

	127 generate_file_stats(outputfile, month, baseURL + filename, overview_url,

	128 value, filter={"field": field, "value": name})

	129

	130 if not field["name"] in filtered_urls:

	131 filtered_urls[field["name"]] = {}

	132 filtered_urls[field["name"]][name] = os.path.basename(outputfile)

	133

	134 outputfile = os.path.join(outputdir,

	135 common.filename_encode(server_type),

	136 common.filename_encode(month),

	137 common.filename_encode(filename),

	138 "index.html")

	139 generate_file_stats(outputfile, month, baseURL + filename, overview_url,

	140 data, filtered_urls=filtered_urls)

	141

	142 if filename not in filedata:

	143 filedata[filename] = {}

	144 month_url = (common.filename_encode(month) + "/" +

	145 common.filename_encode(filename) + "/" +

	146 "index.html")

	147 filedata[filename][month] = {"url": month_url, "hits": data["hits"], "bandwidth": data["bandwidth"]}

	148

	149 monthdata = {}

	150 for filename, data in filedata.iteritems():

	151 outputfile = os.path.join(outputdir,

	152 common.filename_encode(server_type),

	153 "overview-" + common.filename_encode(filename + ".html"))

	154 generate_file_overview(outputfile, baseURL + filename, data)

	155

	156 if current_month in data:

	157 monthdata[filename] = dict(data[current_month])

	158

	159 outputfile = os.path.join(outputdir, common.filename_encode(server_type), "index.html")

	160 generate_main_page(outputfile, current_month, baseURL, monthdata)

132	161

133 if __name__ == '__main__':	162 if __name__ == '__main__':

134 setupStderr()	163 setupStderr()

135	164

136 data = SafeConfigParser()	165 datadir = get_config().get("stats", "dataDirectory")

137 data.read(get_config().get('subscriptionStats', 'mainFile'))	166 outputdir = get_config().get("stats", "outputDirectory")

138	167 generate_pages(datadir, outputdir)

139 outputDir = get_config().get('subscriptionStats', 'outputDirectory')

140 if not os.path.exists(outputDir):

141 os.makedirs(outputDir)

142 generateMainPage(data, outputDir)

143 subscriptions = generateSubscriptionPages(data, outputDir)

144 for fileName in subscriptions.iterkeys():

145 generateOverviewPage(data, outputDir, fileName)

OLD	NEW

« no previous file with comments | « sitescripts/stats/bin/logprocessor.py ('k') | sitescripts/stats/common.py » ('j') | no next file with comments »