update-copyright/update_copyright.py - Issue 29459580: Issue 5250 - Add copyright update script

Unified Diff: update-copyright/update_copyright.py

Issue 29459580: Issue 5250 - Add copyright update script (Closed) Base URL: https://hg.adblockplus.org/codingtools

Patch Set: Addressed comments Created June 23, 2017, 2:36 p.m.

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View side-by-side diff with in-line comments

Index: update-copyright/update_copyright.py

===================================================================

new file mode 100644

--- /dev/null

+++ b/update-copyright/update_copyright.py

@@ -0,0 +1,152 @@

+#!/usr/bin/env python3

+import os

+import subprocess

+import re

+import datetime

+import shutil

+import urllib.parse

+import urllib.request

+import html.parser

+import argparse

+from posixpath import dirname

+CURRENT_YEAR = datetime.datetime.now().year

+def process_repo(url, hg_upstream):

Vasily Kuznetsov 2017/06/27 16:18:54 Is `hg_upstream` unused in this function? It shoul

rosie 2017/07/03 15:33:46 Done.

+ repo = os.path.basename(os.path.normpath(url))

Vasily Kuznetsov 2017/06/27 16:18:51 Why are we applying `os.path.normpath` to a url he

rosie 2017/07/03 15:33:45 Done.

+ if repo in {

+ # headers are copied from libadblockplus, no need to update separately

+ 'libadblockplus-binaries',

+ # huge and only contains autogenerated builds

+ 'downloads',

+ }:

+ return

+ try:

+ subprocess.check_call(['hg', 'clone', url, repo])

+ if repo == 'adblockbrowser':

+ # adblockbrowser is a FF fork with its own changes in a

+ # separate branch

+ subprocess.check_call(['hg', 'up', '--rev', 'adblockbrowser',

+ '--repository', repo])

+ else:

+ # switch to 'master' bookmark if it exists

+ subprocess.call(['hg', 'up', '--rev', 'master',

Vasily Kuznetsov 2017/06/27 16:18:54 Any idea why we're not doing `check_call` here jus

rosie 2017/07/03 15:33:45 This line attempts to switch to the master branch.

Vasily Kuznetsov 2017/07/03 19:25:45 I see. Makes sense.

+ '--repository', repo])

+ for dirpath, dirnames, filenames in os.walk(repo):

+ if dirpath == repo:

+ dirnames.remove('.hg')

+ for filename in filenames:

+ text_replace(dirpath, filename)

+ hg_commit(repo, url)

+ finally:

+ shutil.rmtree(repo, ignore_errors=True)

+def text_replace(dirpath, filename):

+ with open(os.path.join(dirpath, filename), 'r+',

+ encoding='utf-8', newline='') as file:

+ try:

+ text = file.read()

+ except UnicodeDecodeError:

+ print("Error: Couldn't read {}{}".format(dirpath, filename))

+ return

+ text = re.sub(

+ r'(copyright.*?\d{4})(?:-\d{4})?\s+eyeo gmbh',

+ r'\1-{} eyeo GmbH'.format(CURRENT_YEAR), text, 0, re.I

+ )

+ file.seek(0)

+ file.write(text)

+ file.truncate()

+def hg_commit(repo, hg_upstream):

+ try:

+ subprocess.check_call(['hg', 'commit', '-m',

+ 'Noissue - Updated copyright year',

+ '--repository', repo])

+ except subprocess.CalledProcessError as e:

+ if e.returncode == 1: # no changes

+ return

+ raise

+ # Push changes, or save patch if access denied

+ if 'ssh://hg@hg.adblockplus.org/' in hg_upstream:

Vasily Kuznetsov 2017/06/27 16:18:52 `hg_commit` is called with the original url of the

rosie 2017/07/03 15:33:45 If we're pushing to ssh://hg@hg.adblockplus.org/ s

rosie 2017/07/04 13:38:24 If we're pushing to ssh://hg@hg.adblockplus.org/ s

+ hg_upstream += repo

+ if subprocess.call(['hg', 'push', '--repository', repo, hg_upstream]) != 0:

+ with open(repo + '.patch', 'wb') as file:

+ print('couldnt push, making patch instead')

+ subprocess.check_call(['hg', 'export', '--repository', repo],

+ stdout=file)

+class Parser(html.parser.HTMLParser):

+ result = []

+ recordtr = False

+ cell = 0

+ current_url = ''

+ dir_path = 'https://hg.adblockplus.org/'

Vasily Kuznetsov 2017/06/27 16:18:52 Is this used anywhere?

rosie 2017/07/03 15:33:44 Done.

+ def handle_starttag(self, tag, attrs):

+ if tag == 'tr':

+ self.recordtr = True

+ if tag == 'td':

+ self.cell += 1

+ if tag == 'a':

+ attrs = dict(attrs)

+ if 'list' in attrs.get('class', '').split():

+ self.current_url = attrs['href']

+ def handle_endtag(self, tag):

+ if tag == 'tr':

+ self.recordtr = False

+ self.cell = 0

+ def handle_data(self, data):

+ if (self.cell == 2) and self.recordtr is True:

Vasily Kuznetsov 2017/06/27 16:18:51 The parentheses around the first operand of `and`

rosie 2017/07/03 15:33:45 Done.

+ self.recordtr = False

+ self.cell = 0

+ # Only process the URL if the description is not Deprecated

+ depr1 = re.search(r'\*DEPRECATED\*', data)

Vasily Kuznetsov 2017/06/27 16:18:52 I think it would make the code more readable and e

rosie 2017/07/03 15:33:44 Done.

+ depr2 = re.search(r'(Deprecated)', data)

+ if not depr1 and not depr2 and len(self.current_url) > 2:

+ self.result += [self.current_url]

+ return self.result

+def extract_urls(hg_page):

+ dir_path = dirname(hg_page) + '/'

Vasily Kuznetsov 2017/06/27 16:18:52 Two points here: - `hg_page` is an url, could we u

rosie 2017/07/03 15:33:45 Done.

+ parser = Parser()

+ with urllib.request.urlopen(hg_page) as response:

+ parser.feed(response.read().decode('utf-8'))

+ parser.close()

+ my_result = []

Vasily Kuznetsov 2017/06/27 16:18:52 Maybe better call this `repo_urls`?

rosie 2017/07/03 15:33:46 Done.

+ for i in range(len(parser.result)):

Vasily Kuznetsov 2017/06/27 16:18:53 Why not `for url in parser.result:`?

rosie 2017/07/03 15:33:45 Done.

+ my_result.append(urllib.parse.urljoin(dir_path, parser.result[i]))

+ return my_result

+def main(hg_page, hg_upstream):

+ repo_list = extract_urls(hg_page)

+ for repo in repo_list:

Vasily Kuznetsov 2017/06/27 16:18:52 Could probably just be `for repo in extract_urls(h

rosie 2017/07/03 15:33:45 Done.

+ process_repo(repo, hg_upstream)

+if __name__ == '__main__':

+ arg_parser = argparse.ArgumentParser()

+ arg_parser.add_argument('-u', '--url',

+ default='https://hg.adblockplus.org/',

+ help='specify which Mercurial URL site to scrape')

+ arg_parser.add_argument('-p', '--push',

+ default='ssh://hg@hg.adblockplus.org/',

Vasily Kuznetsov 2017/06/27 16:18:52 This should probably default to `None` and then if

rosie 2017/07/03 15:33:46 Done.

+ help='specify where to push the repository')

+ args = arg_parser.parse_args()

+ hg_page = args.url

+ hg_upstream = args.push

+ main(hg_page, hg_upstream)

« update-copyright/tox.ini ('K') | « update-copyright/tox.ini ('k') | no next file » | no next file with comments »