Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Unified Diff: update-copyright/update_copyright.py

Issue 29459580: Issue 5250 - Add copyright update script (Closed) Base URL: https://hg.adblockplus.org/codingtools
Patch Set: Addressed comments Created June 23, 2017, 2:36 p.m.
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« update-copyright/tox.ini ('K') | « update-copyright/tox.ini ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: update-copyright/update_copyright.py
===================================================================
new file mode 100644
--- /dev/null
+++ b/update-copyright/update_copyright.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+
+import os
+import subprocess
+import re
+import datetime
+import shutil
+import urllib.parse
+import urllib.request
+import html.parser
+import argparse
+from posixpath import dirname
+
+
+CURRENT_YEAR = datetime.datetime.now().year
+
+
+def process_repo(url, hg_upstream):
Vasily Kuznetsov 2017/06/27 16:18:54 Is `hg_upstream` unused in this function? It shoul
rosie 2017/07/03 15:33:46 Done.
+ repo = os.path.basename(os.path.normpath(url))
Vasily Kuznetsov 2017/06/27 16:18:51 Why are we applying `os.path.normpath` to a url he
rosie 2017/07/03 15:33:45 Done.
+ if repo in {
+ # headers are copied from libadblockplus, no need to update separately
+ 'libadblockplus-binaries',
+ # huge and only contains autogenerated builds
+ 'downloads',
+ }:
+ return
+
+ try:
+ subprocess.check_call(['hg', 'clone', url, repo])
+ if repo == 'adblockbrowser':
+ # adblockbrowser is a FF fork with its own changes in a
+ # separate branch
+ subprocess.check_call(['hg', 'up', '--rev', 'adblockbrowser',
+ '--repository', repo])
+ else:
+ # switch to 'master' bookmark if it exists
+ subprocess.call(['hg', 'up', '--rev', 'master',
Vasily Kuznetsov 2017/06/27 16:18:54 Any idea why we're not doing `check_call` here jus
rosie 2017/07/03 15:33:45 This line attempts to switch to the master branch.
Vasily Kuznetsov 2017/07/03 19:25:45 I see. Makes sense.
+ '--repository', repo])
+ for dirpath, dirnames, filenames in os.walk(repo):
+ if dirpath == repo:
+ dirnames.remove('.hg')
+
+ for filename in filenames:
+ text_replace(dirpath, filename)
+ hg_commit(repo, url)
+
+ finally:
+ shutil.rmtree(repo, ignore_errors=True)
+
+
+def text_replace(dirpath, filename):
+ with open(os.path.join(dirpath, filename), 'r+',
+ encoding='utf-8', newline='') as file:
+ try:
+ text = file.read()
+ except UnicodeDecodeError:
+ print("Error: Couldn't read {}{}".format(dirpath, filename))
+ return
+
+ text = re.sub(
+ r'(copyright.*?\d{4})(?:-\d{4})?\s+eyeo gmbh',
+ r'\1-{} eyeo GmbH'.format(CURRENT_YEAR), text, 0, re.I
+ )
+ file.seek(0)
+ file.write(text)
+ file.truncate()
+
+
+def hg_commit(repo, hg_upstream):
+ try:
+ subprocess.check_call(['hg', 'commit', '-m',
+ 'Noissue - Updated copyright year',
+ '--repository', repo])
+ except subprocess.CalledProcessError as e:
+ if e.returncode == 1: # no changes
+ return
+ raise
+
+ # Push changes, or save patch if access denied
+ if 'ssh://hg@hg.adblockplus.org/' in hg_upstream:
Vasily Kuznetsov 2017/06/27 16:18:52 `hg_commit` is called with the original url of the
rosie 2017/07/03 15:33:45 If we're pushing to ssh://hg@hg.adblockplus.org/ s
rosie 2017/07/04 13:38:24 If we're pushing to ssh://hg@hg.adblockplus.org/ s
+ hg_upstream += repo
+ if subprocess.call(['hg', 'push', '--repository', repo, hg_upstream]) != 0:
+ with open(repo + '.patch', 'wb') as file:
+ print('couldnt push, making patch instead')
+ subprocess.check_call(['hg', 'export', '--repository', repo],
+ stdout=file)
+
+
+class Parser(html.parser.HTMLParser):
+ result = []
+ recordtr = False
+ cell = 0
+ current_url = ''
+ dir_path = 'https://hg.adblockplus.org/'
Vasily Kuznetsov 2017/06/27 16:18:52 Is this used anywhere?
rosie 2017/07/03 15:33:44 Done.
+
+ def handle_starttag(self, tag, attrs):
+ if tag == 'tr':
+ self.recordtr = True
+ if tag == 'td':
+ self.cell += 1
+ if tag == 'a':
+ attrs = dict(attrs)
+ if 'list' in attrs.get('class', '').split():
+ self.current_url = attrs['href']
+
+ def handle_endtag(self, tag):
+ if tag == 'tr':
+ self.recordtr = False
+ self.cell = 0
+
+ def handle_data(self, data):
+ if (self.cell == 2) and self.recordtr is True:
Vasily Kuznetsov 2017/06/27 16:18:51 The parentheses around the first operand of `and`
rosie 2017/07/03 15:33:45 Done.
+ self.recordtr = False
+ self.cell = 0
+ # Only process the URL if the description is not Deprecated
+ depr1 = re.search(r'\*DEPRECATED\*', data)
Vasily Kuznetsov 2017/06/27 16:18:52 I think it would make the code more readable and e
rosie 2017/07/03 15:33:44 Done.
+ depr2 = re.search(r'(Deprecated)', data)
+ if not depr1 and not depr2 and len(self.current_url) > 2:
+ self.result += [self.current_url]
+ return self.result
+
+
+def extract_urls(hg_page):
+ dir_path = dirname(hg_page) + '/'
Vasily Kuznetsov 2017/06/27 16:18:52 Two points here: - `hg_page` is an url, could we u
rosie 2017/07/03 15:33:45 Done.
+ parser = Parser()
+ with urllib.request.urlopen(hg_page) as response:
+ parser.feed(response.read().decode('utf-8'))
+ parser.close()
+ my_result = []
Vasily Kuznetsov 2017/06/27 16:18:52 Maybe better call this `repo_urls`?
rosie 2017/07/03 15:33:46 Done.
+ for i in range(len(parser.result)):
Vasily Kuznetsov 2017/06/27 16:18:53 Why not `for url in parser.result:`?
rosie 2017/07/03 15:33:45 Done.
+ my_result.append(urllib.parse.urljoin(dir_path, parser.result[i]))
+ return my_result
+
+
+def main(hg_page, hg_upstream):
+ repo_list = extract_urls(hg_page)
+ for repo in repo_list:
Vasily Kuznetsov 2017/06/27 16:18:52 Could probably just be `for repo in extract_urls(h
rosie 2017/07/03 15:33:45 Done.
+ process_repo(repo, hg_upstream)
+
+
+if __name__ == '__main__':
+ arg_parser = argparse.ArgumentParser()
+ arg_parser.add_argument('-u', '--url',
+ default='https://hg.adblockplus.org/',
+ help='specify which Mercurial URL site to scrape')
+ arg_parser.add_argument('-p', '--push',
+ default='ssh://hg@hg.adblockplus.org/',
Vasily Kuznetsov 2017/06/27 16:18:52 This should probably default to `None` and then if
rosie 2017/07/03 15:33:46 Done.
+ help='specify where to push the repository')
+ args = arg_parser.parse_args()
+ hg_page = args.url
+ hg_upstream = args.push
+ main(hg_page, hg_upstream)
« update-copyright/tox.ini ('K') | « update-copyright/tox.ini ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld