Index: sitescripts/crawler/bin/import_filters.py
===================================================================
new file mode 100644
--- /dev/null
+++ b/sitescripts/crawler/bin/import_filters.py
@@ -0,0 +1,114 @@
+# coding: utf-8
+
+# This Source Code is subject to the terms of the Mozilla Public License
+# version 2.0 (the "License"). You can obtain a copy of the License at
+# http://mozilla.org/MPL/2.0/.
+
+import MySQLdb, os, re, sys
+from sitescripts.utils import cached, get_config
+
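+# Cache the connection for ten minutes so repeated calls reuse it; on
+# Windows, MySQL is reached through a named pipe rather than a socket.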
+@cached(600)
+def _get_db():
+  database = get_config().get("crawler", "database")
+  dbuser = get_config().get("crawler", "dbuser")
+  dbpasswd = get_config().get("crawler", "dbpassword")
+  if os.name == "nt":
+    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
+                           use_unicode=True, charset="utf8", named_pipe=True)
+  else:
+    return MySQLdb.connect(user=dbuser, passwd=dbpasswd, db=database,
+                           use_unicode=True, charset="utf8")
+
+def _get_cursor():
+  return _get_db().cursor(MySQLdb.cursors.DictCursor)
+
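+# Element hiding filters take the form "domain1,domain2##selector"; the
+# domain list before "##" is what we want, so split it on commas.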
+def _parse_hide_filter(line):
+  match = re.search(r"^(.*?)##", line)
+  return match.group(1).split(",") if match else []
+
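+# Blocking filters either list their domains in a "domain=..." option
+# (separated by "|") or anchor a single domain with the "||" prefix.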
+def _parse_block_filter(line):
+  match = re.search(r"domain=(.*)", line)
+  if match:
+    return match.group(1).split("|")
+
+  match = re.search(r"^\|\|(.*?)[/\^]", line)
+  return [match.group(1)] if match else []
+
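+# Filter list comments start with "!"; drop everything from the first
+# exclamation mark onwards.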
+def _remove_comment(line):
+  exclamation_index = line.find("!")
+  if exclamation_index != -1:
+    return line[:exclamation_index]
+  return line
+
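+# Map each filter line to the list of domains it applies to, using the
+# given parse function for the file's filter type.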
+def _parse_filters(filter_path, parse_filter):
+  filters = {}
+
+  try:
+    for line in open(filter_path):
+      line = _remove_comment(line.strip())
+
+      domains = parse_filter(line)
+      if domains:
+        filters[line] = domains
+
+  except IOError:
+    print >>sys.stderr, "Unable to read filters from '%s'" % filter_path
+
+  return filters
+
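+# The easylist checkout contains per-type filter files; combine the
+# domain mappings from the site-specific blocking and hiding lists.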
+def _extract_filters(easylist_dir):
+  filter_files = {"easylist_specific_block.txt": _parse_block_filter,
+                  "easylist_specific_hide.txt": _parse_hide_filter}
+  filters = {}
+  for filter_file, parse_filter in filter_files.iteritems():
+    filter_path = os.path.join(easylist_dir, "easylist", filter_file)
+    filters.update(_parse_filters(filter_path, parse_filter))
+  return filters
+
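+# Store each filter, create any domains it references that are not in
+# crawler_domains yet, and link the two through crawler_domain_filters.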
+def _insert_filters(filters):
+  cursor = _get_cursor()
+  filter_insert = """
+INSERT INTO crawler_filters (filter, filter_hash) VALUES (%s, sha1(filter))"""
+  domain_select = "SELECT id FROM crawler_domains WHERE domain = %s"
+  domain_insert = "INSERT INTO crawler_domains (domain) VALUES (%s)"
+  domain_filter_insert = """
+INSERT INTO crawler_domain_filters (filter, domain) VALUES (%s, %s)"""
+
+  for filter_line, domains in filters.iteritems():
+    cursor.execute(filter_insert, (filter_line,))
+    filter_id = cursor.lastrowid
+
+    for domain in domains:
+      cursor.execute(domain_select, (domain,))
+      result = cursor.fetchone()
+      if result:
+        domain_id = result["id"]
+      else:
+        cursor.execute(domain_insert, (domain,))
+        domain_id = cursor.lastrowid
+
+      cursor.execute(domain_filter_insert, (filter_id, domain_id))
+
+  # MySQLdb leaves autocommit off by default, so commit explicitly
+  _get_db().commit()
+
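+# Reads the easylist checkout location from the [crawler] section of the
+# sitescripts configuration.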
+if __name__ == "__main__":
+  easylist_dir = get_config().get("crawler", "easylist_repository")
+  filters = _extract_filters(easylist_dir)
+  _insert_filters(filters)