Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: sitescripts/stats/bin/datamerger.py

Issue 11481051: Update stats processing (Closed)
Patch Set: Improved performance using memoization (created Aug. 29, 2013, 1:39 p.m.)
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « sitescripts/stats/bin/__init__.py ('k') | sitescripts/stats/bin/logprocessor.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show/Hide Comments ('s')
Index: sitescripts/stats/bin/datamerger.py
===================================================================
rename from sitescripts/logs/bin/mergeSubscriptionStats.py
rename to sitescripts/stats/bin/datamerger.py
--- a/sitescripts/logs/bin/mergeSubscriptionStats.py
+++ b/sitescripts/stats/bin/datamerger.py
@@ -10,63 +10,88 @@
# Adblock Plus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
-import os, re, subprocess, urllib
+import os, sys, re, codecs, subprocess, urllib, simplejson, traceback
+import sitescripts.stats.common as common
from sitescripts.utils import get_config, setupStderr
-from ConfigParser import SafeConfigParser, NoOptionError
-from StringIO import StringIO
-def readStatsFile(path):
- result = SafeConfigParser()
- match = re.search(r'^ssh://(\w+)@([^/:]+)(?::(\d+))?', path)
+def read_stats_file(path):
+ match = re.search(r"^ssh://(\w+)@([^/:]+)(?::(\d+))?", path)
if match:
- command = ['ssh', '-q', '-o' 'NumberOfPasswordPrompts 0', '-T', '-k', '-l', match.group(1), match.group(2)]
+ command = ["ssh", "-q", "-o", "NumberOfPasswordPrompts 0", "-T", "-k", "-l", match.group(1), match.group(2)]
if match.group(3):
- command[1:1] = ['-P', match.group(3)]
+ command[1:1] = ["-P", match.group(3)]
data = subprocess.check_output(command)
- result.readfp(StringIO(data))
- elif path.startswith('http://') or path.startswith('https://'):
- result.readfp(urllib.urlopen(path))
+ return simplejson.loads(data.decode("utf-8"))
+ elif path.startswith("http://") or path.startswith("https://"):
+ return simplejson.load(urllib.urlopen(path).read().decode("utf-8"))
elif os.path.exists(path):
- result.read(path)
- return result
+ with codecs.open(path, "rb", encoding="utf-8") as file:
+ return simplejson.load(file)
-def getStatsFiles():
+ raise IOError("Path '%s' not recognized" % path)
+
+def get_stats_files(mirrors):
config = get_config()
- for option in config.options('subscriptionStats'):
- match = re.search(r'^mirror_(.*)', option, re.I)
- if match:
- yield (match.group(1), config.get('subscriptionStats', option))
+ if len(mirrors) > 0:
+ options = map(lambda m: "mirror_" + m, mirrors)
+ else:
+ options = filter(lambda o: o.startswith("mirror_"), config.options("stats"))
+ for option in options:
+ if config.has_option("stats", option):
+ value = config.get("stats", option)
+ if " " in value:
+ yield value.split(None, 1)
+ else:
+ print >>sys.stderr, "Option '%s' has invalid value: '%s'" % (option, value)
+ else:
+ print >>sys.stderr, "Option '%s' not found in the configuration" % option
-def mergeStatsFile(mirrorName, config1, config2):
- def increaseOption(section, option, increase):
- if config1.has_option(section, option):
- oldval = config1.getint(section, option)
- config1.set(section, option, str(oldval + increase))
+def merge_objects(object1, object2):
+ for key, value in object2.iteritems():
+ if key in object1:
+ if isinstance(value, int):
+ object1[key] += value
+ else:
+ merge_objects(object1[key], object2[key])
else:
- config1.set(section, option, str(increase))
+ object1[key] = value
- for section in config2.sections():
- if not config1.has_section(section):
- config1.add_section(section)
- for option in config2.options(section):
- increase = config2.getint(section, option)
- increaseOption(section, option, increase)
+def merge_stats_file(server_type, data):
+ base_dir = os.path.join(get_config().get("stats", "dataDirectory"), common.filename_encode(server_type))
+ for month, month_data in data.iteritems():
+ for name, file_data in month_data.iteritems():
+ path = os.path.join(base_dir, common.filename_encode(month), common.filename_encode(name + ".json"))
+ if os.path.exists(path):
+ with codecs.open(path, "rb", encoding="utf-8") as file:
+ existing = simplejson.load(file)
+ else:
+ existing = {}
- match = re.search(r'^(\S+) (hits|bandwidth)$', option, re.I)
- if match:
- increaseOption(section, '%s %s mirror %s' % (match.group(1), match.group(2), mirrorName), increase)
+ merge_objects(existing, file_data)
-if __name__ == '__main__':
+ dir = os.path.dirname(path)
+ try:
+ os.makedirs(dir)
+ except OSError:
+ pass
+
+ with codecs.open(path, "wb", encoding="utf-8") as file:
+ simplejson.dump(existing, file, indent=2, sort_keys=True)
+
+def merge_mirror_stats(mirrors):
+ for server_type, path in get_stats_files(mirrors):
+ try:
+ merge_stats_file(server_type, read_stats_file(path))
+ except:
+ print >>sys.stderr, "Unable to merge stats for '%s'" % path
+ traceback.print_exc()
+
+if __name__ == "__main__":
setupStderr()
-
- result = readStatsFile(get_config().get('subscriptionStats', 'mainFile'))
- for (mirrorName, statsFile) in getStatsFiles():
- mergeStatsFile(mirrorName, result, readStatsFile(statsFile))
- file = open(get_config().get('subscriptionStats', 'mainFile'), 'wb')
- result.write(file)
+ merge_mirror_stats(sys.argv[1:])
« no previous file with comments | « sitescripts/stats/bin/__init__.py ('k') | sitescripts/stats/bin/logprocessor.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld