| Left: | ||
| Right: |
| OLD | NEW |
|---|---|
| (Empty) | |
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 import os | |
| 4 import subprocess | |
| 5 import re | |
| 6 import datetime | |
| 7 import shutil | |
| 8 import urllib.parse | |
| 9 import urllib.request | |
| 10 import html.parser | |
| 11 import argparse | |
| 12 from posixpath import dirname | |
| 13 | |
| 14 | |
# Year (local time, evaluated once at import) used as the end of every
# rewritten copyright range.
CURRENT_YEAR = datetime.date.today().year
| 16 | |
| 17 | |
def process_repo(url, hg_upstream):
    """Clone one repository, refresh its copyright years and publish them.

    The repository is cloned into a directory named after the last path
    segment of ``url``. Every file outside ``.hg`` is passed to
    ``text_replace`` and the result is handed to ``hg_commit``. The local
    clone is removed afterwards, whether or not an error occurred.

    Args:
        url: URL of the Mercurial repository to clone.
        hg_upstream: Push destination, forwarded to ``hg_commit``.

    Raises:
        subprocess.CalledProcessError: If cloning fails, or if updating to
            the ``adblockbrowser`` revision fails for that repository.
    """
    # A URL is not a filesystem path: split on '/' directly rather than
    # going through the platform-dependent os.path helpers.
    repo = url.rstrip('/').rsplit('/', 1)[-1]
    if repo in {
        # headers are copied from libadblockplus, no need to update separately
        'libadblockplus-binaries',
        # huge and only contains autogenerated builds
        'downloads',
    }:
        return

    try:
        subprocess.check_call(['hg', 'clone', url, repo])
        if repo == 'adblockbrowser':
            # adblockbrowser is a FF fork with its own changes in a
            # separate branch
            subprocess.check_call(['hg', 'up', '--rev', 'adblockbrowser',
                                   '--repository', repo])
        else:
            # Best effort: switch to the 'master' bookmark if it exists. The
            # exit status is deliberately ignored because not every
            # repository has that bookmark.
            subprocess.call(['hg', 'up', '--rev', 'master',
                             '--repository', repo])

        for dirpath, dirnames, filenames in os.walk(repo):
            # Prune Mercurial's own metadata so os.walk never descends
            # into it.
            if dirpath == repo and '.hg' in dirnames:
                dirnames.remove('.hg')

            for filename in filenames:
                text_replace(dirpath, filename)

        # Push to the configured upstream, not to the URL we cloned from.
        hg_commit(repo, hg_upstream)
    finally:
        shutil.rmtree(repo, ignore_errors=True)
| 49 | |
| 50 | |
def text_replace(dirpath, filename, year=None):
    """Bring the eyeo GmbH copyright notice in one file up to date.

    Every occurrence of ``copyright ... YYYY[-YYYY] eyeo gmbh`` (matched
    case-insensitively) is rewritten so that the range ends in ``year``. A
    notice whose first year is already ``year`` (or later) keeps a single
    year instead of becoming ``YYYY-YYYY``. Files that are not valid UTF-8
    are reported and left untouched, and files without anything to change
    are not rewritten.

    Args:
        dirpath: Directory containing the file.
        filename: Name of the file inside ``dirpath``.
        year: Final year of the copyright range; defaults to
            ``CURRENT_YEAR``.
    """
    end_year = CURRENT_YEAR if year is None else year
    path = os.path.join(dirpath, filename)

    def bump(match):
        # Group 1 is everything from "copyright" up to the first year,
        # group 2 is that first year; any previous end year is discarded.
        prefix, first_year = match.group(1), match.group(2)
        if int(first_year) >= end_year:
            return '{}{} eyeo GmbH'.format(prefix, first_year)
        return '{}{}-{} eyeo GmbH'.format(prefix, first_year, end_year)

    # newline='' keeps the file's original line endings intact.
    with open(path, 'r+', encoding='utf-8', newline='') as file:
        try:
            text = file.read()
        except UnicodeDecodeError:
            print("Error: Couldn't read {}".format(path))
            return

        updated = re.sub(
            r'(copyright.*?)(\d{4})(?:-\d{4})?\s+eyeo gmbh',
            bump, text, flags=re.I
        )
        if updated == text:
            return

        file.seek(0)
        file.write(updated)
        file.truncate()
| 67 | |
| 68 | |
def hg_commit(repo, hg_upstream):
    """Commit the copyright update in ``repo`` and try to push it.

    Args:
        repo: Path of the local clone; also used as the repository name
            appended to the default upstream and as the patch file stem.
        hg_upstream: Destination passed to ``hg push``.

    Raises:
        subprocess.CalledProcessError: If ``hg commit`` exits with a status
            other than 0 or 1, or if ``hg export`` fails.
    """
    commit_command = ['hg', 'commit', '-m',
                      'Noissue - Updated copyright year',
                      '--repository', repo]
    try:
        subprocess.check_call(commit_command)
    except subprocess.CalledProcessError as error:
        if error.returncode != 1:
            raise
        # Exit status 1 means there were no changes to commit.
        return

    # The default upstream is only a base URL and needs the repository name.
    destination = hg_upstream
    if 'ssh://hg@hg.adblockplus.org/' in destination:
        destination = destination + repo

    push_status = subprocess.call(
        ['hg', 'push', '--repository', repo, destination]
    )
    if push_status == 0:
        return

    # Pushing failed (e.g. access denied): save the change as a patch file.
    with open(repo + '.patch', 'wb') as patch_file:
        print('couldnt push, making patch instead')
        subprocess.check_call(['hg', 'export', '--repository', repo],
                              stdout=patch_file)
| 87 | |
| 88 | |
class Parser(html.parser.HTMLParser):
    """Collect repository links from an HTML table of repositories.

    For every table row the parser remembers the ``href`` of an ``<a>`` tag
    whose ``class`` contains ``list``. When text is seen inside the row's
    second ``<td>`` cell, that link is appended to ``result`` unless the
    text marks the repository as deprecated.

    Attributes:
        result: Collected ``href`` values, in document order.
    """

    # Not read by this class; kept so existing attribute access still works.
    dir_path = 'https://hg.adblockplus.org/'

    # Text markers that flag a repository as deprecated.
    _DEPRECATED = re.compile(r'\*DEPRECATED\*|Deprecated')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance state: a class-level list would be shared (and keep
        # growing) across every Parser instance.
        self.result = []
        self.recordtr = False   # True while inside a row not yet handled
        self.cell = 0           # number of <td> tags seen in the current row
        self.current_url = ''   # href of the current row's repository link

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            self.recordtr = True
        if tag == 'td':
            self.cell += 1
        if tag == 'a':
            attrs = dict(attrs)
            # Valueless attributes are reported as None, hence the `or ''`.
            if 'list' in (attrs.get('class') or '').split():
                self.current_url = attrs.get('href') or ''

    def handle_endtag(self, tag):
        if tag == 'tr':
            self.recordtr = False
            self.cell = 0
            # Do not let this row's link leak into a later row without one.
            self.current_url = ''

    def handle_data(self, data):
        if self.cell == 2 and self.recordtr:
            # Only the first text chunk of the second cell is considered.
            self.recordtr = False
            self.cell = 0
            # Only record the URL if the description is not deprecated
            deprecated = self._DEPRECATED.search(data)
            if not deprecated and len(self.current_url) > 2:
                self.result.append(self.current_url)
        return self.result
| 121 | |
| 122 | |
def extract_urls(hg_page):
    """Return absolute URLs of the repositories listed on ``hg_page``.

    The page is downloaded, decoded as UTF-8 and fed to ``Parser``; each
    link it collects is resolved against the page URL.

    Args:
        hg_page: URL of the HTML page listing the repositories.

    Returns:
        A list of absolute repository URLs, in page order.
    """
    parser = Parser()
    with urllib.request.urlopen(hg_page) as response:
        parser.feed(response.read().decode('utf-8'))
    parser.close()

    # urljoin resolves relative links against the page URL itself, which
    # also works when hg_page has no trailing slash (path-style dirname()
    # would reduce 'https://host' to 'https:').
    return [urllib.parse.urljoin(hg_page, href) for href in parser.result]
| 133 | |
| 134 | |
def main(hg_page, hg_upstream):
    """Process every repository listed on ``hg_page``.

    Args:
        hg_page: URL of the page passed to ``extract_urls``.
        hg_upstream: Push destination passed to ``process_repo`` for each
            repository URL.
    """
    for repo_url in extract_urls(hg_page):
        process_repo(repo_url, hg_upstream)
| 139 | |
| 140 | |
if __name__ == '__main__':
    # Command-line entry point: read the page to scrape and the push
    # destination, then process every listed repository.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-u', '--url',
        default='https://hg.adblockplus.org/',
        help='specify which Mercurial URL site to scrape',
    )
    cli.add_argument(
        '-p', '--push',
        default='ssh://hg@hg.adblockplus.org/',
        help='specify where to push the repository',
    )
    options = cli.parse_args()
    hg_page, hg_upstream = options.url, options.push
    main(hg_page, hg_upstream)
| OLD | NEW |