Index: update-copyright/update_copyright.py |
=================================================================== |
new file mode 100644 |
--- /dev/null |
+++ b/update-copyright/update_copyright.py |
@@ -0,0 +1,152 @@ |
+#!/usr/bin/env python3 |
+ |
+import os |
+import subprocess |
+import re |
+import datetime |
+import shutil |
+import urllib.parse |
+import urllib.request |
+import html.parser |
+import argparse |
+from posixpath import dirname |
+ |
+ |
# Calendar year (local time) when the module is loaded; text_replace uses
# it as the closing year of the copyright range.
CURRENT_YEAR = datetime.date.today().year
+ |
+ |
def process_repo(url, hg_upstream):
    """Clone a repository, update its copyright notices, commit and push.

    Args:
        url: URL of the Mercurial repository to clone.
        hg_upstream: Push destination, forwarded unchanged to ``hg_commit``.

    Repositories named in the skip list below are ignored. Otherwise the
    repository is cloned into a directory named after the last path segment
    of ``url`` (relative to the current working directory), every file
    outside ``.hg`` is passed to ``text_replace`` and ``hg_commit`` is
    called. The clone directory is removed afterwards, even on failure.

    Raises:
        ValueError: if no repository name can be derived from ``url``.
        subprocess.CalledProcessError: if ``hg clone`` fails, or if updating
            to the ``adblockbrowser`` branch fails.
    """
    # The name is the last segment of the URL's path. A URL is not a
    # filesystem path, so it is parsed as a URL instead of being run
    # through os.path.normpath.
    repo = urllib.parse.urlparse(url).path.rstrip('/').rsplit('/', 1)[-1]
    if not repo:
        raise ValueError(
            'Cannot derive a repository name from {!r}'.format(url))

    if repo in {
        # headers are copied from libadblockplus, no need to update separately
        'libadblockplus-binaries',
        # huge and only contains autogenerated builds
        'downloads',
    }:
        return

    try:
        subprocess.check_call(['hg', 'clone', url, repo])
        if repo == 'adblockbrowser':
            # adblockbrowser is a FF fork with its own changes in a
            # separate branch
            subprocess.check_call(['hg', 'up', '--rev', 'adblockbrowser',
                                   '--repository', repo])
        else:
            # Switch to the 'master' bookmark if it exists. This is
            # deliberately best-effort (call, not check_call): repositories
            # without that bookmark simply stay on the default revision.
            subprocess.call(['hg', 'up', '--rev', 'master',
                             '--repository', repo])

        for dirpath, dirnames, filenames in os.walk(repo):
            if dirpath == repo and '.hg' in dirnames:
                # Prune in place so os.walk never descends into hg metadata.
                dirnames.remove('.hg')

            for filename in filenames:
                text_replace(dirpath, filename)

        # Push to the configured upstream, not to the URL we cloned from.
        hg_commit(repo, hg_upstream)

    finally:
        shutil.rmtree(repo, ignore_errors=True)
+ |
+ |
def text_replace(dirpath, filename, year=None):
    """Bring the eyeo GmbH copyright year range in one file up to date.

    Matches (case-insensitively) ``copyright ... YYYY[-YYYY] eyeo gmbh`` and
    rewrites it as ``copyright ... YYYY-<year> eyeo GmbH``, keeping the first
    year. A notice whose first year already equals ``year`` is written with
    that single year rather than as ``YYYY-YYYY``.

    Args:
        dirpath: Directory containing the file.
        filename: Name of the file inside ``dirpath``.
        year: Closing year of the range; defaults to ``CURRENT_YEAR``.

    Files that cannot be decoded as UTF-8 are reported on stdout and left
    untouched. The file is only written back when its text actually changed.
    """
    if year is None:
        year = CURRENT_YEAR
    final_year = str(year)
    path = os.path.join(dirpath, filename)

    def refresh(match):
        prefix, first_year = match.group(1), match.group(2)
        if first_year == final_year:
            # Avoid producing a degenerate range such as "2017-2017".
            return '{}{} eyeo GmbH'.format(prefix, first_year)
        return '{}{}-{} eyeo GmbH'.format(prefix, first_year, final_year)

    # newline='' disables newline translation so line endings survive the
    # read/write round trip unchanged.
    with open(path, 'r+', encoding='utf-8', newline='') as file:
        try:
            text = file.read()
        except UnicodeDecodeError:
            print("Error: Couldn't read {}".format(path))
            return

        updated = re.sub(
            r'(copyright.*?)(\d{4})(?:-\d{4})?\s+eyeo gmbh',
            refresh, text, 0, re.I
        )
        if updated != text:
            file.seek(0)
            file.write(updated)
            file.truncate()
+ |
+ |
def hg_commit(repo, hg_upstream):
    """Commit the working copy in ``repo`` and push it, or export a patch.

    Args:
        repo: Path of the local Mercurial repository.
        hg_upstream: Push destination. When it contains
            ``ssh://hg@hg.adblockplus.org/`` the value of ``repo`` is
            appended to it before pushing.

    Returns early when ``hg commit`` exits with status 1 (no changes). If
    ``hg push`` exits non-zero, the output of ``hg export`` is written to
    ``<repo>.patch`` instead.

    Raises:
        subprocess.CalledProcessError: if ``hg commit`` fails with a status
            other than 1, or if ``hg export`` fails.
    """
    commit_cmd = ['hg', 'commit', '-m', 'Noissue - Updated copyright year',
                  '--repository', repo]
    try:
        subprocess.check_call(commit_cmd)
    except subprocess.CalledProcessError as err:
        if err.returncode != 1:
            raise
        # Exit status 1: no changes, so there is nothing to push.
        return

    # Push changes, or save patch if access denied
    destination = hg_upstream
    if 'ssh://hg@hg.adblockplus.org/' in destination:
        destination = destination + repo

    push_status = subprocess.call(
        ['hg', 'push', '--repository', repo, destination])
    if push_status == 0:
        return

    with open(repo + '.patch', 'wb') as patch_file:
        print('couldnt push, making patch instead')
        subprocess.check_call(['hg', 'export', '--repository', repo],
                              stdout=patch_file)
+ |
+ |
class Parser(html.parser.HTMLParser):
    """Collect repository links from the rows of an HTML table.

    Within each ``<tr>``, the ``href`` of an ``<a>`` whose ``class`` contains
    ``list`` is remembered. When the first text inside the row's second
    ``<td>`` is seen, that href is appended to ``result`` unless the text
    contains ``*DEPRECATED*`` or ``Deprecated``, or the href is two
    characters or shorter.

    Attributes:
        result: hrefs collected so far, in document order (per instance).
        recordtr: True while inside a row that has not been evaluated yet.
        cell: Number of ``<td>`` tags opened in the current row.
        current_url: href remembered for the current row, or ``''``.
    """

    # Not read anywhere in this class; kept so existing references to the
    # attribute keep working.
    dir_path = 'https://hg.adblockplus.org/'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # State lives on the instance: a class-level list would be shared
        # (and keep growing) across every Parser ever created.
        self.result = []
        self.recordtr = False
        self.cell = 0
        self.current_url = ''

    def handle_starttag(self, tag, attrs):
        """Track row/cell position and remember the row's ``list`` link."""
        if tag == 'tr':
            self.recordtr = True
            # Start every row clean so a row without a link (or a missing
            # </tr>) cannot reuse state left over from the previous row.
            self.cell = 0
            self.current_url = ''
        elif tag == 'td':
            self.cell += 1
        elif tag == 'a':
            attributes = dict(attrs)
            # Valueless attributes are reported as None by HTMLParser.
            if 'list' in (attributes.get('class') or '').split():
                self.current_url = attributes.get('href') or ''

    def handle_endtag(self, tag):
        """Reset the row state when a row closes."""
        if tag == 'tr':
            self.recordtr = False
            self.cell = 0

    def handle_data(self, data):
        """Evaluate the row when the text of its second cell is reached.

        Returns ``self.result`` when a row was evaluated, otherwise ``None``;
        ``HTMLParser`` itself ignores the return value.
        """
        if self.cell != 2 or not self.recordtr:
            return None

        # Evaluate each row once only.
        self.recordtr = False
        self.cell = 0
        # Only process the URL if the description is not Deprecated
        deprecated = '*DEPRECATED*' in data or 'Deprecated' in data
        if not deprecated and len(self.current_url) > 2:
            self.result.append(self.current_url)
        return self.result
+ |
+ |
def extract_urls(hg_page):
    """Return absolute URLs for the repository links found on ``hg_page``.

    Args:
        hg_page: URL of the index page to download and scan.

    Returns:
        A list with one absolute URL per href collected by ``Parser``, in
        page order.

    The page is decoded as UTF-8. Errors raised by ``urlopen`` or by the
    decoding step propagate to the caller.
    """
    parser = Parser()
    with urllib.request.urlopen(hg_page) as response:
        parser.feed(response.read().decode('utf-8'))
    parser.close()

    # Resolve each href against the page URL itself. urljoin applies the
    # standard relative-reference rules, so this works whether or not
    # hg_page ends in a slash (taking dirname() of a URL does not).
    return [urllib.parse.urljoin(hg_page, href) for href in parser.result]
+ |
+ |
def main(hg_page, hg_upstream):
    """Process every repository listed on ``hg_page``.

    Args:
        hg_page: URL of the index page handed to ``extract_urls``.
        hg_upstream: Push destination handed to ``process_repo`` for each
            repository.
    """
    for repo_url in extract_urls(hg_page):
        process_repo(repo_url, hg_upstream)
+ |
+ |
if __name__ == '__main__':
    # Command-line entry point: both options are optional and fall back to
    # the hg.adblockplus.org defaults below.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-u', '--url',
        default='https://hg.adblockplus.org/',
        help='specify which Mercurial URL site to scrape',
    )
    cli.add_argument(
        '-p', '--push',
        default='ssh://hg@hg.adblockplus.org/',
        help='specify where to push the repository',
    )
    options = cli.parse_args()
    main(options.url, options.push)