Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: sitescripts/crawler/bin/import_sites.py

Issue 29345242: Noissue - Adapt quotes for compliance with our coding style in sitescripts (Closed)
Patch Set: Fixed raw string Created May 30, 2016, 8:47 a.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
OLD NEW
1 # This file is part of the Adblock Plus web scripts, 1 # This file is part of the Adblock Plus web scripts,
2 # Copyright (C) 2006-2016 Eyeo GmbH 2 # Copyright (C) 2006-2016 Eyeo GmbH
3 # 3 #
4 # Adblock Plus is free software: you can redistribute it and/or modify 4 # Adblock Plus is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License version 3 as 5 # it under the terms of the GNU General Public License version 3 as
6 # published by the Free Software Foundation. 6 # published by the Free Software Foundation.
7 # 7 #
8 # Adblock Plus is distributed in the hope that it will be useful, 8 # Adblock Plus is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details. 11 # GNU General Public License for more details.
12 # 12 #
13 # You should have received a copy of the GNU General Public License 13 # You should have received a copy of the GNU General Public License
14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>. 14 # along with Adblock Plus. If not, see <http://www.gnu.org/licenses/>.
15 15
16 import MySQLdb 16 import MySQLdb
17 import os 17 import os
18 import re 18 import re
19 import subprocess 19 import subprocess
20 from sitescripts.utils import cached, get_config 20 from sitescripts.utils import cached, get_config
21 21
22 22
@cached(600)
def _get_db():
    """Open a MySQL connection described by the [crawler] config section.

    Reads the 'database', 'dbuser' and 'dbpassword' options and connects
    with unicode results and the utf8 charset. On Windows (os.name == 'nt')
    the connection is additionally made over a named pipe.

    NOTE(review): the result is wrapped by cached(600) from
    sitescripts.utils - presumably a 600 second cache; confirm there.
    """
    def crawler_option(name):
        return get_config().get('crawler', name)

    connect_args = {
        'db': crawler_option('database'),
        'user': crawler_option('dbuser'),
        'passwd': crawler_option('dbpassword'),
        'use_unicode': True,
        'charset': 'utf8',
    }
    if os.name == 'nt':
        connect_args['named_pipe'] = True
    return MySQLdb.connect(**connect_args)
34 34
35 35
def _get_cursor():
    """Return a new DictCursor on the connection provided by _get_db()."""
    connection = _get_db()
    return connection.cursor(MySQLdb.cursors.DictCursor)
38 38
39 39
def _hg(args):
    """Run ``hg`` with the given argument list and return its captured output.

    The command runs in the current working directory; a non-zero exit
    status surfaces as subprocess.CalledProcessError.
    """
    command = ['hg'] + args
    return subprocess.check_output(command)
42 42
43 43
def _extract_sites(easylist_dir):
    """Collect the URLs mentioned in a repository's commit descriptions.

    Runs ``hg log`` inside easylist_dir and scans every line of the commit
    descriptions for an http:// or https:// URL.

    Args:
        easylist_dir: path of the Mercurial repository to inspect.

    Returns:
        A set containing the first URL found on each matching line.
    """
    # NOTE: this changes the working directory of the whole process; hg is
    # then run against whatever repository contains the current directory.
    os.chdir(easylist_dir)
    log_output = _hg(['log', '--template', '{desc}\n'])

    # _hg() returns the captured output itself (subprocess.check_output),
    # not a process object, so there is no ``.stdout`` to iterate over.
    # Decode byte output so the text pattern below can be applied to it.
    if isinstance(log_output, bytes):
        log_output = log_output.decode('utf-8', 'replace')

    url_pattern = re.compile(r'\b(https?://\S*)')
    urls = set()
    for line in log_output.splitlines():
        match = url_pattern.search(line)
        if match:
            urls.add(match.group(1))

    return urls
58 58
59 59
def _insert_sites(site_urls):
    """Store the given URLs in the crawler_sites table and commit.

    Uses INSERT IGNORE, so rows that MySQL would reject (for example
    because of a duplicate key) are skipped instead of raising an error.

    Args:
        site_urls: iterable of URL strings to insert.
    """
    cursor = _get_cursor()
    try:
        for url in site_urls:
            # Query parameters must be passed as a sequence; a bare string
            # is not a valid DB-API parameter set and fails to bind to the
            # single placeholder on current MySQLdb releases.
            cursor.execute(
                'INSERT IGNORE INTO crawler_sites (url) VALUES (%s)', (url,))
    finally:
        cursor.close()
    _get_db().commit()
65 65
if __name__ == '__main__':
    # Script entry point: read the repository location from the [crawler]
    # config section, extract the URLs from its history and store them.
    repository_path = get_config().get('crawler', 'easylist_repository')
    _insert_sites(_extract_sites(repository_path))
OLD NEW

Powered by Google App Engine
This is Rietveld