Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Delta Between Two Patch Sets: sitescripts/subscriptions/bin/updateMalwareDomainsList.py

Issue 29338216: Issue 3774 - Support multiple mirrors for the Malware Domains List (Closed)
Left Patch Set: Created March 14, 2016, 1:23 p.m.
Right Patch Set: Addressed review comments 2 Created March 15, 2016, 10:51 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « .sitescripts.example ('k') | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFT | RIGHT
1 # coding: utf-8 1 # coding: utf-8
2 2
3 # This file is part of the Adblock Plus web scripts, 3 # This file is part of the Adblock Plus web scripts,
4 # Copyright (C) 2006-2016 Eyeo GmbH 4 # Copyright (C) 2006-2016 Eyeo GmbH
5 # 5 #
6 # Adblock Plus is free software: you can redistribute it and/or modify 6 # Adblock Plus is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License version 3 as 7 # it under the terms of the GNU General Public License version 3 as
8 # published by the Free Software Foundation. 8 # published by the Free Software Foundation.
9 # 9 #
10 # Adblock Plus is distributed in the hope that it will be useful, 10 # Adblock Plus is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details. 13 # GNU General Public License for more details.
14 # 14 #
15 # You should have received a copy of the GNU General Public License 15 # You should have received a copy of the GNU General Public License
16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 16 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
17 17
18 import os, subprocess, codecs, urllib2, zipfile, tempfile, shutil, sys 18 import os, subprocess, codecs, urllib2, zipfile, tempfile, shutil, sys
19 import ConfigParser 19 import ConfigParser
20 from StringIO import StringIO 20 from StringIO import StringIO
21 from sitescripts.utils import get_config 21 from sitescripts.utils import get_config
22 22
23 23
24 filterlist_header = '''[Adblock Plus 1.1] 24 FILTERLIST_HEADER = '''[Adblock Plus 1.1]
25 ! This is a list of malware domains generated from malwaredomains.com data. 25 ! This is a list of malware domains generated from malwaredomains.com data.
26 ! Homepage: http://malwaredomains.com/?page_id=2 26 ! Homepage: http://malwaredomains.com/?page_id=2
27 ! Last modified: %timestamp% 27 ! Last modified: %timestamp%
28 ! Expires: 1d 28 ! Expires: 1d
29 !''' 29 !'''
30 30
31 # Default value for malwaredomains_mirrors in the config. 31 DEFAULT_MIRRORS_LIST = [
32 # Equivalent to the following in the config:
33 # malwaredomains_mirrors=
Sebastian Noack 2016/03/14 13:36:07 Example configuration should be documented in .sit
Vasily Kuznetsov 2016/03/14 14:18:12 Acknowledged.
Vasily Kuznetsov 2016/03/14 14:47:58 Done.
34 # http://mirror3.malwaredomains.com
35 # http://mirror1.malwaredomains.com
36 # http://mirror2.malwaredomains.com
37 default_mirrors_list = [
Sebastian Noack 2016/03/14 13:36:07 I'd rather wait until sitescripts.ini got updated
Vasily Kuznetsov 2016/03/14 14:18:11 I'm cool with this, but I'm not very sure how to u
Sebastian Noack 2016/03/14 15:40:12 Matze, Felix or Wladimir can change the config man
38 'http://mirror3.malwaredomains.com', 32 'http://mirror3.malwaredomains.com',
39 'http://mirror1.malwaredomains.com', 33 'http://mirror1.malwaredomains.com',
40 'http://mirror2.malwaredomains.com' 34 'http://mirror2.malwaredomains.com'
41 ] 35 ]
42 36
43 malwaredomains_path = '/files/justdomains.zip' 37 MALWAREDOMAINS_PATH = '/files/justdomains.zip'
Sebastian Noack 2016/03/14 13:36:07 Nit: Please use upper case for constant-like varia
Vasily Kuznetsov 2016/03/14 14:18:11 Acknowledged.
Vasily Kuznetsov 2016/03/14 14:47:58 Done.
44 38
45 39
46 def try_mirror(mirror, path): 40 def try_mirror(mirror):
47 try: 41 try:
48 response = urllib2.urlopen(mirror + path) 42 response = urllib2.urlopen(mirror + MALWAREDOMAINS_PATH)
49 return response.read() 43 return response.read()
50 except urllib2.HTTPError, err: 44 except urllib2.HTTPError:
51 print >>sys.stderr, '{}: {}'.format(mirror, err)
Sebastian Noack 2016/03/14 13:36:07 Any output we generate will result into an email s
Vasily Kuznetsov 2016/03/14 14:18:11 Acknowledged.
Vasily Kuznetsov 2016/03/14 14:47:58 Done.
52 return None 45 return None
53 46
54 47
55 if __name__ == '__main__': 48 if __name__ == '__main__':
56 config = get_config() 49 config = get_config()
57 repository = config.get('subscriptionDownloads', 'malwaredomains_repository') 50 repository = config.get('subscriptionDownloads', 'malwaredomains_repository')
58 try: 51 try:
59 mirrors = config.get('subscriptionDownloads', 'malwaredomains_mirrors') 52 mirrors = config.get('subscriptionDownloads', 'malwaredomains_mirrors')
60 mirrors_list = filter(None, [mirror.strip() for mirror in mirrors.split()]) 53 mirrors_list = mirrors.split()
61 except ConfigParser.NoOptionError: 54 except ConfigParser.NoOptionError:
62 mirrors_list = default_mirrors_list 55 mirrors_list = DEFAULT_MIRRORS_LIST
63 56
64 tempdir = tempfile.mkdtemp(prefix='malwaredomains') 57 tempdir = tempfile.mkdtemp(prefix='malwaredomains')
65 try: 58 try:
66 subprocess.check_call(['hg', '-q', 'clone', '-U', repository, tempdir]) 59 subprocess.check_call(['hg', '-q', 'clone', '-U', repository, tempdir])
67 subprocess.check_call(['hg', '-q', 'up', '-R', tempdir, '-r', 'default']) 60 subprocess.check_call(['hg', '-q', 'up', '-R', tempdir, '-r', 'default'])
68 61
69 path = os.path.join(tempdir, 'malwaredomains_full.txt') 62 path = os.path.join(tempdir, 'malwaredomains_full.txt')
70 file = codecs.open(path, 'wb', encoding='utf-8') 63 file = codecs.open(path, 'wb', encoding='utf-8')
71 64
72 print >>file, filterlist_header 65 print >>file, FILTERLIST_HEADER
73 66
74 for mirror in mirrors_list: 67 for mirror in mirrors_list:
75 data = try_mirror(mirror, malwaredomains_path) 68 data = try_mirror(mirror)
76 if data is not None: 69 if data is not None:
77 break 70 break
78 else: 71 else:
79 print >>sys.stderr, 'Unable to fetch malware domains list.' 72 sys.exit('Unable to fetch malware domains list.')
80 sys.exit(1)
81 73
82 zip = zipfile.ZipFile(StringIO(data), 'r') 74 zip = zipfile.ZipFile(StringIO(data), 'r')
83 info = zip.infolist()[0] 75 info = zip.infolist()[0]
84 for line in str(zip.read(info.filename)).splitlines(): 76 for line in str(zip.read(info.filename)).splitlines():
85 domain = line.strip() 77 domain = line.strip()
86 if not domain: 78 if not domain:
87 continue 79 continue
88 80
89 print >>file, '||%s^' % domain.decode('idna') 81 print >>file, '||%s^' % domain.decode('idna')
90 file.close(); 82 file.close();
91 83
92 if subprocess.check_output(['hg', 'stat', '-R', tempdir]) != '': 84 if subprocess.check_output(['hg', 'stat', '-R', tempdir]) != '':
93 subprocess.check_call(['hg', '-q', 'commit', '-R', tempdir, '-A', '-u', 'hgbot', '-m', 'Updated malwaredomains.com data']) 85 subprocess.check_call(['hg', '-q', 'commit', '-R', tempdir, '-A', '-u', 'hgbot', '-m', 'Updated malwaredomains.com data'])
94 subprocess.check_call(['hg', '-q', 'push', '-R', tempdir]) 86 subprocess.check_call(['hg', '-q', 'push', '-R', tempdir])
95 finally: 87 finally:
96 shutil.rmtree(tempdir, ignore_errors=True) 88 shutil.rmtree(tempdir, ignore_errors=True)
LEFT | RIGHT

Powered by Google App Engine
This is Rietveld