OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env python3 |
| 2 |
| 3 import os |
| 4 import subprocess |
| 5 import re |
| 6 import datetime |
| 7 import shutil |
| 8 import urllib.parse |
| 9 import urllib.request |
| 10 import html.parser |
| 11 import argparse |
| 12 |
| 13 |
# Year written as the end of every updated copyright range; taken from the
# local clock once, when the module is loaded.
CURRENT_YEAR = datetime.date.today().year
| 15 |
| 16 |
def process_repo(url, hg_upstream):
    """Clone one repository, update its copyright years and publish them.

    The repository at *url* is cloned into a directory named after the last
    path component of the URL (relative to the current working directory).
    Every file outside ``.hg`` is passed to ``text_replace`` and the result
    is handed to ``hg_commit``. The clone is always removed afterwards.

    Args:
        url: URL of the Mercurial repository to clone.
        hg_upstream: Base URL to push to; the repository name is appended
            to it. When ``None``, changes are pushed back to *url*.

    Raises:
        subprocess.CalledProcessError: If cloning fails, or if switching to
            the ``adblockbrowser`` revision fails.
    """
    # Repositories that are deliberately left alone.
    skipped = (
        # its headers are copied from libadblockplus, so there is no need
        # to update them separately
        'libadblockplus-binaries',
        # huge, and only contains autogenerated builds
        'downloads',
    )

    repo = url.rstrip('/').rsplit('/', 1)[-1]
    if repo in skipped:
        return

    try:
        subprocess.check_call(['hg', 'clone', url, repo])

        if repo != 'adblockbrowser':
            # Switch to the 'master' bookmark if it exists; the exit status
            # is ignored on purpose because not every repository has one.
            subprocess.call(['hg', 'up', '--rev', 'master',
                             '--repository', repo])
        else:
            # adblockbrowser is a FF fork that keeps its own changes in a
            # separate branch, which must exist.
            subprocess.check_call(['hg', 'up', '--rev', 'adblockbrowser',
                                   '--repository', repo])

        for current_dir, subdirs, files in os.walk(repo):
            if current_dir == repo:
                # Pruning in place keeps os.walk out of the hg metadata.
                subdirs.remove('.hg')

            for name in files:
                text_replace(current_dir, name)

        if hg_upstream is None:
            push_target = url
        else:
            # NOTE(review): if hg_upstream already ends with '/', this
            # produces a double slash, which Mercurial's ssh:// URL syntax
            # reads as an absolute remote path - confirm this is intended.
            push_target = hg_upstream + '/' + repo
        hg_commit(repo, push_target)
    finally:
        shutil.rmtree(repo, ignore_errors=True)
| 53 |
| 54 |
def text_replace(dirpath, filename, year=None):
    """Extend the eyeo GmbH copyright range in one file up to *year*.

    Every occurrence of ``copyright ... YYYY[-YYYY] eyeo gmbh`` (matched
    case-insensitively) is rewritten so that the range ends in *year* and
    the company name is spelled ``eyeo GmbH``. Files that cannot be decoded
    as UTF-8 are skipped, and files without a match are left untouched.

    Args:
        dirpath: Directory containing the file.
        filename: Name of the file inside *dirpath*.
        year: Final year of the copyright range. Defaults to the module's
            ``CURRENT_YEAR``.

    Raises:
        OSError: If the file cannot be opened for reading and writing.
    """
    if year is None:
        year = CURRENT_YEAR

    path = os.path.join(dirpath, filename)
    # newline='' disables newline translation so the file's original line
    # endings are written back unchanged.
    with open(path, 'r+', encoding='utf-8', newline='') as file:
        try:
            text = file.read()
        except UnicodeDecodeError:
            # Not UTF-8 text (most likely a binary file): leave it alone.
            return

        # count/flags are passed by keyword; passing them positionally is
        # deprecated and easy to mix up.
        updated = re.sub(
            r'(copyright.*?\d{4})(?:-\d{4})?\s+eyeo gmbh',
            r'\1-{} eyeo GmbH'.format(year),
            text, count=0, flags=re.I
        )
        if updated == text:
            # Nothing matched, so do not rewrite the file needlessly.
            return

        file.seek(0)
        file.write(updated)
        file.truncate()
| 70 |
| 71 |
def hg_commit(repo, hg_upstream):
    """Commit the changes in *repo* and push them to *hg_upstream*.

    If there is nothing to commit the function returns silently. If the
    push does not succeed, the commit is exported to ``<repo>.patch`` in
    the current working directory instead.

    Args:
        repo: Path of the local clone.
        hg_upstream: URL to push the new commit to.

    Raises:
        subprocess.CalledProcessError: If ``hg commit`` fails with a status
            other than 1, or if ``hg export`` fails.
    """
    commit_cmd = ['hg', 'commit', '-m', 'Noissue - Updated copyright year',
                  '--repository', repo]
    try:
        subprocess.check_call(commit_cmd)
    except subprocess.CalledProcessError as error:
        if error.returncode != 1:
            raise
        # Exit status 1 means there were no changes to commit.
        return

    push_status = subprocess.call(
        ['hg', 'push', '--repository', repo, hg_upstream]
    )
    if push_status == 0:
        return

    # The push was rejected (e.g. access denied): save a patch instead.
    with open(repo + '.patch', 'wb') as patch_file:
        print('couldnt push, making patch instead')
        subprocess.check_call(['hg', 'export', '--repository', repo],
                              stdout=patch_file)
| 88 |
| 89 |
class Parser(html.parser.HTMLParser):
    """Collect repository links from the table rows of an HTML index page.

    For each ``<tr>``, the ``href`` of an ``<a>`` element carrying the
    ``list`` class is remembered. When the first text of the second
    ``<td>`` (the description) is seen, the link is appended to ``result``
    unless that text marks the repository as deprecated.

    Attributes:
        result: Links collected so far, in document order.
        recordtr: True while inside a row whose description has not been
            examined yet.
        cell: Number of ``<td>`` elements opened in the current row.
        current_url: Link found in the current row, or ``''``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # State lives on the instance: a class-level list would be shared
        # by (and accumulate across) every Parser ever created.
        self.result = []
        self.recordtr = False
        self.cell = 0
        self.current_url = ''

    def handle_starttag(self, tag, attrs):
        """Track row and cell boundaries and remember the row's link."""
        if tag == 'tr':
            self.recordtr = True
            # Forget the previous row's link so that a row without one
            # cannot re-add it.
            self.current_url = ''
        elif tag == 'td':
            self.cell += 1
        elif tag == 'a':
            attrs = dict(attrs)
            # Attributes given without a value are reported as None.
            if 'list' in (attrs.get('class') or '').split():
                self.current_url = attrs.get('href') or ''

    def handle_endtag(self, tag):
        """Reset the per-row state when a row is closed."""
        if tag == 'tr':
            self.recordtr = False
            self.cell = 0

    def handle_data(self, data):
        """Record the row's link once its description text is seen.

        Returns the list of links collected so far (or ``None`` when
        *data* is not the description of a row).
        """
        if self.cell == 2 and self.recordtr is True:
            # Only the first text chunk of the description is examined.
            self.recordtr = False
            self.cell = 0
            # Only keep the URL if the description is not deprecated
            if ('*DEPRECATED*' not in data and '(Deprecated)' not in data and
                    len(self.current_url) > 2):
                self.result.append(self.current_url)
            return self.result
| 120 |
| 121 |
def extract_urls(hg_page):
    """Return the absolute URLs of the repositories listed on *hg_page*.

    The page is downloaded, decoded as UTF-8 and fed to ``Parser``; every
    link it collects is resolved against the page URL.

    Args:
        hg_page: URL of the index page to scrape.

    Returns:
        A list of absolute repository URLs, in page order.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the page is not valid UTF-8.
    """
    parser = Parser()
    with urllib.request.urlopen(hg_page) as response:
        parser.feed(response.read().decode('utf-8'))
    parser.close()

    # Resolve links against the page URL itself rather than a base built
    # with os.path.dirname: filesystem path functions mangle URLs such as
    # 'https://host' (no trailing slash) and behave differently per OS.
    return [urllib.parse.urljoin(hg_page, link) for link in parser.result]
| 132 |
| 133 |
def main(hg_page, hg_upstream):
    """Update the copyright year in every repository listed on *hg_page*.

    Args:
        hg_page: URL of the index page listing the repositories.
        hg_upstream: Base URL the updated repositories are pushed to.
    """
    repo_urls = extract_urls(hg_page)
    for repo_url in repo_urls:
        process_repo(repo_url, hg_upstream)
| 137 |
| 138 |
if __name__ == '__main__':
    # Defaults for the site that is scraped and the server that is pushed to.
    default_hg_url = 'https://hg.adblockplus.org/'
    default_push_url = 'ssh://hg@hg.adblockplus.org/'

    cli = argparse.ArgumentParser()
    cli.add_argument('-u', '--hg-url', default=default_hg_url,
                     help='specify which Mercurial URL site to scrape')
    cli.add_argument('-p', '--push-url', default=default_push_url,
                     help='specify where to push the repository')
    options = cli.parse_args()

    # Repositories scraped from a non-default site must not be pushed to
    # the default server, so an explicit push URL is required in that case.
    scrapes_custom_site = options.hg_url != default_hg_url
    pushes_to_default = options.push_url == default_push_url
    if scrapes_custom_site and pushes_to_default:
        cli.error('If -u is provided, -p is mandatory')

    main(options.hg_url, options.push_url)
OLD | NEW |