Left: | ||
Right: |
LEFT | RIGHT |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
2 | 2 |
3 import os | 3 import os |
4 import subprocess | 4 import subprocess |
5 import re | 5 import re |
6 import datetime | 6 import datetime |
7 import shutil | 7 import shutil |
8 import urllib.parse | 8 import urllib.parse |
9 import urllib.request | 9 import urllib.request |
10 import html.parser | 10 import html.parser |
11 import argparse | 11 import argparse |
12 from posixpath import dirname | |
13 | 12 |
14 | 13 |
15 CURRENT_YEAR = datetime.datetime.now().year | 14 CURRENT_YEAR = datetime.datetime.now().year |
16 | 15 |
17 | 16 |
18 def process_repo(url, hg_upstream): | 17 def process_repo(url, hg_upstream): |
Vasily Kuznetsov
2017/06/27 16:18:54
Is `hg_upstream` unused in this function? It shoul
rosie
2017/07/03 15:33:46
Done.
| |
19 repo = os.path.basename(os.path.normpath(url)) | 18 repo = url.rstrip('/').split('/')[-1] |
Vasily Kuznetsov
2017/06/27 16:18:51
Why are we applying `os.path.normpath` to a url he
rosie
2017/07/03 15:33:45
Done.
| |
19 | |
20 if repo in { | 20 if repo in { |
21 # headers are copied from libadblockplus, no need to update seperately | 21 # headers are copied from libadblockplus, no need to update seperately |
22 'libadblockplus-binaries', | 22 'libadblockplus-binaries', |
23 # huge and only contains autogenerated builds | 23 # huge and only contains autogenerated builds |
24 'downloads', | 24 'downloads', |
25 }: | 25 }: |
26 return | 26 return |
27 | 27 |
28 try: | 28 try: |
29 subprocess.check_call(['hg', 'clone', url, repo]) | 29 subprocess.check_call(['hg', 'clone', url, repo]) |
30 if repo == 'adblockbrowser': | 30 if repo == 'adblockbrowser': |
31 # adblockbrowser is a FF fork with its own changes in a | 31 # adblockbrowser is a FF fork with its own changes in a |
32 # seperate branch | 32 # seperate branch |
33 subprocess.check_call(['hg', 'up', '--rev', 'adblockbrowser', | 33 subprocess.check_call(['hg', 'up', '--rev', 'adblockbrowser', |
34 '--repository', repo]) | 34 '--repository', repo]) |
35 else: | 35 else: |
36 # switch to 'master' bookmark if it exists | 36 # switch to 'master' bookmark if it exists |
37 subprocess.call(['hg', 'up', '--rev', 'master', | 37 subprocess.call(['hg', 'up', '--rev', 'master', |
Vasily Kuznetsov
2017/06/27 16:18:54
Any idea why we're not doing `check_call` here jus
rosie
2017/07/03 15:33:45
This line attempts to switch to the master branch.
Vasily Kuznetsov
2017/07/03 19:25:45
I see. Makes sense.
| |
38 '--repository', repo]) | 38 '--repository', repo]) |
39 for dirpath, dirnames, filenames in os.walk(repo): | 39 for dirpath, dirnames, filenames in os.walk(repo): |
40 if dirpath == repo: | 40 if dirpath == repo: |
41 dirnames.remove('.hg') | 41 dirnames.remove('.hg') |
42 | 42 |
43 for filename in filenames: | 43 for filename in filenames: |
44 text_replace(dirpath, filename) | 44 text_replace(dirpath, filename) |
45 hg_commit(repo, url) | 45 if hg_upstream is None: |
46 hg_upstream = url | |
47 else: | |
48 hg_upstream += '/' + repo | |
49 hg_commit(repo, hg_upstream) | |
46 | 50 |
47 finally: | 51 finally: |
48 shutil.rmtree(repo, ignore_errors=True) | 52 shutil.rmtree(repo, ignore_errors=True) |
49 | 53 |
50 | 54 |
51 def text_replace(dirpath, filename): | 55 def text_replace(dirpath, filename): |
52 with open(os.path.join(dirpath, filename), 'r+', | 56 with open(os.path.join(dirpath, filename), 'r+', |
53 encoding='utf-8', newline='') as file: | 57 encoding='utf-8', newline='') as file: |
54 try: | 58 try: |
55 text = file.read() | 59 text = file.read() |
56 except UnicodeDecodeError: | 60 except UnicodeDecodeError: |
57 print("Error: Couldn't read {}{}".format(dirpath, filename)) | |
58 return | 61 return |
59 | 62 |
60 text = re.sub( | 63 text = re.sub( |
61 r'(copyright.*?\d{4})(?:-\d{4})?\s+eyeo gmbh', | 64 r'(copyright.*?\d{4})(?:-\d{4})?\s+eyeo gmbh', |
62 r'\1-{} eyeo GmbH'.format(CURRENT_YEAR), text, 0, re.I | 65 r'\1-{} eyeo GmbH'.format(CURRENT_YEAR), text, 0, re.I |
63 ) | 66 ) |
64 file.seek(0) | 67 file.seek(0) |
65 file.write(text) | 68 file.write(text) |
66 file.truncate() | 69 file.truncate() |
67 | 70 |
68 | 71 |
69 def hg_commit(repo, hg_upstream): | 72 def hg_commit(repo, hg_upstream): |
70 try: | 73 try: |
71 subprocess.check_call(['hg', 'commit', '-m', | 74 subprocess.check_call(['hg', 'commit', '-m', |
72 'Noissue - Updated copyright year', | 75 'Noissue - Updated copyright year', |
73 '--repository', repo]) | 76 '--repository', repo]) |
74 except subprocess.CalledProcessError as e: | 77 except subprocess.CalledProcessError as e: |
75 if e.returncode == 1: # no changes | 78 if e.returncode == 1: # no changes |
76 return | 79 return |
77 raise | 80 raise |
78 | 81 |
79 # Push changes, or save patch if access denied | 82 # Push changes, or save patch if access denied |
80 if 'ssh://hg@hg.adblockplus.org/' in hg_upstream: | |
Vasily Kuznetsov
2017/06/27 16:18:52
`hg_commit` is called with the original url of the
rosie
2017/07/03 15:33:45
If we're pushing to ssh://hg@hg.adblockplus.org/ s
rosie
2017/07/04 13:38:24
If we're pushing to ssh://hg@hg.adblockplus.org/ s
| |
81 hg_upstream += repo | |
82 if subprocess.call(['hg', 'push', '--repository', repo, hg_upstream]) != 0: | 83 if subprocess.call(['hg', 'push', '--repository', repo, hg_upstream]) != 0: |
83 with open(repo + '.patch', 'wb') as file: | 84 with open(repo + '.patch', 'wb') as file: |
84 print('couldnt push, making patch instead') | 85 print('couldnt push, making patch instead') |
85 subprocess.check_call(['hg', 'export', '--repository', repo], | 86 subprocess.check_call(['hg', 'export', '--repository', repo], |
86 stdout=file) | 87 stdout=file) |
87 | 88 |
88 | 89 |
89 class Parser(html.parser.HTMLParser): | 90 class Parser(html.parser.HTMLParser): |
90 result = [] | 91 result = [] |
91 recordtr = False | 92 recordtr = False |
92 cell = 0 | 93 cell = 0 |
93 current_url = '' | 94 current_url = '' |
94 dir_path = 'https://hg.adblockplus.org/' | |
Vasily Kuznetsov
2017/06/27 16:18:52
Is this used anywhere?
rosie
2017/07/03 15:33:44
Done.
| |
95 | 95 |
96 def handle_starttag(self, tag, attrs): | 96 def handle_starttag(self, tag, attrs): |
97 if tag == 'tr': | 97 if tag == 'tr': |
98 self.recordtr = True | 98 self.recordtr = True |
99 if tag == 'td': | 99 if tag == 'td': |
100 self.cell += 1 | 100 self.cell += 1 |
101 if tag == 'a': | 101 if tag == 'a': |
102 attrs = dict(attrs) | 102 attrs = dict(attrs) |
103 if 'list' in attrs.get('class', '').split(): | 103 if 'list' in attrs.get('class', '').split(): |
104 self.current_url = attrs['href'] | 104 self.current_url = attrs['href'] |
105 | 105 |
106 def handle_endtag(self, tag): | 106 def handle_endtag(self, tag): |
107 if tag == 'tr': | 107 if tag == 'tr': |
108 self.recordtr = False | 108 self.recordtr = False |
109 self.cell = 0 | 109 self.cell = 0 |
110 | 110 |
111 def handle_data(self, data): | 111 def handle_data(self, data): |
112 if (self.cell == 2) and self.recordtr is True: | 112 if self.cell == 2 and self.recordtr is True: |
Vasily Kuznetsov
2017/06/27 16:18:51
The parentheses around the first operand of `and`
rosie
2017/07/03 15:33:45
Done.
| |
113 self.recordtr = False | 113 self.recordtr = False |
114 self.cell = 0 | 114 self.cell = 0 |
115 # Only process the URL if the description is not Deprecated | 115 # Only process the URL if the description is not Deprecated |
116 depr1 = re.search(r'\*DEPRECATED\*', data) | 116 if ('*DEPRECATED*' not in data and |
Vasily Kuznetsov
2017/06/27 16:18:52
I think it would make the code more readable and e
rosie
2017/07/03 15:33:44
Done.
| |
117 depr2 = re.search(r'(Deprecated)', data) | 117 '(Deprecated)' not in data and |
118 if not depr1 and not depr2 and len(self.current_url) > 2: | 118 len(self.current_url) > 2): |
119 self.result += [self.current_url] | 119 self.result += [self.current_url] |
120 return self.result | 120 return self.result |
121 | 121 |
122 | 122 |
123 def extract_urls(hg_page): | 123 def extract_urls(hg_page): |
124 dir_path = dirname(hg_page) + '/' | 124 base_url = os.path.dirname(hg_page) + '/' |
Vasily Kuznetsov
2017/06/27 16:18:52
Two points here:
- `hg_page` is an url, could we u
rosie
2017/07/03 15:33:45
Done.
| |
125 parser = Parser() | 125 parser = Parser() |
126 with urllib.request.urlopen(hg_page) as response: | 126 with urllib.request.urlopen(hg_page) as response: |
127 parser.feed(response.read().decode('utf-8')) | 127 parser.feed(response.read().decode('utf-8')) |
128 parser.close() | 128 parser.close() |
129 my_result = [] | 129 repo_urls = [] |
Vasily Kuznetsov
2017/06/27 16:18:52
Maybe better call this `repo_urls`?
rosie
2017/07/03 15:33:46
Done.
| |
130 for i in range(len(parser.result)): | 130 for url in parser.result: |
Vasily Kuznetsov
2017/06/27 16:18:53
Why not `for url in parser.result:`?
rosie
2017/07/03 15:33:45
Done.
| |
131 my_result.append(urllib.parse.urljoin(dir_path, parser.result[i])) | 131 repo_urls.append(urllib.parse.urljoin(base_url, url)) |
132 return my_result | 132 return repo_urls |
133 | 133 |
134 | 134 |
135 def main(hg_page, hg_upstream): | 135 def main(hg_page, hg_upstream): |
136 repo_list = extract_urls(hg_page) | 136 for repo in extract_urls(hg_page): |
137 for repo in repo_list: | |
Vasily Kuznetsov
2017/06/27 16:18:52
Could probably just be `for repo in extract_urls(h
rosie
2017/07/03 15:33:45
Done.
| |
138 process_repo(repo, hg_upstream) | 137 process_repo(repo, hg_upstream) |
139 | 138 |
140 | 139 |
141 if __name__ == '__main__': | 140 if __name__ == '__main__': |
142 arg_parser = argparse.ArgumentParser() | 141 arg_parser = argparse.ArgumentParser() |
143 arg_parser.add_argument('-u', '--url', | 142 arg_parser.add_argument('-u', '--hg-url', |
144 default='https://hg.adblockplus.org/', | |
145 help='specify which Mercurial URL site to scrape') | 143 help='specify which Mercurial URL site to scrape') |
146 arg_parser.add_argument('-p', '--push', | 144 arg_parser.add_argument('-p', '--push-url', |
147 default='ssh://hg@hg.adblockplus.org/', | |
Vasily Kuznetsov
2017/06/27 16:18:52
This should probably default to `None` and then if
rosie
2017/07/03 15:33:46
Done.
| |
148 help='specify where to push the repository') | 145 help='specify where to push the repository') |
149 args = arg_parser.parse_args() | 146 args = arg_parser.parse_args() |
150 hg_page = args.url | 147 if args.hg_url and not args.push_url: |
151 hg_upstream = args.push | 148 arg_parser.error('If -u is provided, -p is mandatory') |
152 main(hg_page, hg_upstream) | 149 main(args.hg_url or 'https://hg.adblockplus.org/', |
150 args.push_url or 'ssh://hg@hg.adblockplus.org/') | |
LEFT | RIGHT |