Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: sitescripts/crawler/bin/extract_filters.py

Issue 8432110: sitescripts: Script to extract domain-specific filters (Closed)
Patch Set: Created Sept. 28, 2012, 2:32 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « sitescripts/crawler/README.md ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 # coding: utf-8
2
3 # This Source Code is subject to the terms of the Mozilla Public License
4 # version 2.0 (the "License"). You can obtain a copy of the License at
5 # http://mozilla.org/MPL/2.0/.
6
7 import MySQLdb, os, re, sys
8 from sitescripts.utils import get_config
9
def parse_hide_filter(line):
    """Map an element hiding filter to the text preceding its "##" marker.

    Returns a one-entry dict ``{line: prefix}``, where ``prefix`` is the
    (possibly empty) text in front of the first "##", or an empty dict when
    the pattern does not match.
    """
    found = re.search(r"^(.*?)##", line)
    if not found:
        return {}
    # The prefix is stored verbatim; it is not split any further here.
    return {line: found.group(1)}
13
def parse_block_filter(line):
    """Map a blocking filter to the domain it applies to.

    Two forms are recognised, in this order:

    * a ``domain=`` option: its value runs up to the next comma (the option
      separator) and is split on "|"; the last entry is used.
    * a leading ``||`` anchor: the text between "||" and the first "/" or
      "^" is used.

    Returns a one-entry dict ``{line: domain}``, or an empty dict when
    neither form matches.
    """
    # Stop at the first comma so that options following the domain list
    # (e.g. "$domain=a.com,third-party") do not leak into the domain.
    match = re.search(r"domain=([^,]*)", line)
    if match:
        # NOTE(review): the result is keyed by the filter text, so it can hold
        # only one domain per filter. Assigning every domain in turn to that
        # key leaves just the last one, which is what is returned here so the
        # result shape stays the same for callers. Recording all domains
        # would need a different return structure.
        return {line: match.group(1).split("|")[-1]}

    match = re.search(r"^\|\|(.*?)[/\^]", line)
    return {line: match.group(1)} if match else {}
25
def remove_comment(line):
    """Return ``line`` unchanged, or an empty string if it is a comment.

    A comment is a line whose first non-blank character is "!". An
    exclamation mark further into the line belongs to the filter itself
    (for example inside a URL or a CSS selector), so it must not cut the
    filter short.
    """
    if line.lstrip().startswith("!"):
        return ""
    return line
31
def extract_filters(filter_list_dir, domain_filter_files):
    """Collect the filters found in the given files of a filter directory.

    filter_list_dir     -- directory containing the filter files
    domain_filter_files -- iterable of file names relative to that directory

    Files whose name contains "hide" are parsed with parse_hide_filter, all
    others with parse_block_filter. Returns a single dict merging the result
    of every parsed line; a later entry replaces an earlier one that has the
    same filter text. A file that cannot be read is reported on stderr and
    skipped, so the result may be partial.
    """
    filters = {}

    for filter_file in domain_filter_files:
        if "hide" in filter_file:
            parse_filter = parse_hide_filter
        else:
            parse_filter = parse_block_filter
        file_path = filter_list_dir + "/" + filter_file

        try:
            # The with-block closes the file even if reading fails half-way.
            with open(file_path) as filter_lines:
                for line in filter_lines:
                    line = remove_comment(line.strip())
                    filters.update(parse_filter(line))
        except IOError:
            # Best effort: keep whatever was gathered so far and move on to
            # the next file instead of aborting the whole run.
            sys.stderr.write("Unable to read filters from '%s'\n" % file_path)

    return filters
49
def print_statements(filters):
    """Write one INSERT statement per (filter, domain) pair to stdout.

    filters -- dict mapping filter text to domain

    Both values are passed through MySQLdb.escape_string before they are
    placed inside the quoted SQL literals.
    """
    template = "INSERT INTO crawler_filters (filter, domain) VALUES ('%s', '%s');"
    for filter_line, domain in filters.items():
        values = (
            MySQLdb.escape_string(filter_line),
            MySQLdb.escape_string(domain),
        )
        # One statement per line, exactly as a print of the string would do.
        sys.stdout.write(template % values + "\n")
56
if __name__ == "__main__":
    # Script entry point: read the crawler settings, gather the filters from
    # the configured files and emit the INSERT statements on stdout.
    config = get_config()
    section = "crawler"
    filter_list_dir = config.get(section, "filter_list_repository")
    domain_filter_files = config.get(section, "domain_specific_filter_files")
    # The file names come as one comma-separated value; whitespace around
    # the commas is consumed by the split pattern.
    domain_filter_file_list = re.split(r"\s*,\s*", domain_filter_files)
    print_statements(extract_filters(filter_list_dir, domain_filter_file_list))
OLDNEW
« no previous file with comments | « sitescripts/crawler/README.md ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld