Left: | ||
Right: |
OLD | NEW |
---|---|
(Empty) | |
1 #!/usr/bin/env python | |
2 """Anonymize data in access log lines. | |
3 | |
4 Read a line from stdin, write it to stdout with the following changes: | |
5 1. IP (v4 or v6) replaced with a salted hash of the IP and the date | |
6 2. Country information (extracted from IP) added after the salted hash. | |
7 | |
8 If the country information is unavailable in the database, '-' is added instead | |
9 of ISO 3166-1 alpha-2 country code (like 'DE'). | |
10 | |
11 Salt and the country information database are taken as command line options and | |
12 default to environment variables. | |
13 | |
14 Malformed lines are passed on as is, based on the assumption that they don't | |
15 contain sensitive information. Malformed here means the line couldn't be split | |
16 on space character. If it could be split, and an error occurs afterwards | |
17 (e.g. while trying to parse out the date), the script will fail and exit in | |
18 order to bring attention to the fact that something might not be getting | |
19 anonymized. | |
20 """ | |
21 | |
22 from __future__ import print_function | |
23 from __future__ import unicode_literals | |
24 | |
25 import argparse | |
26 import hashlib | |
27 import hmac | |
28 import os | |
29 import sys | |
30 | |
31 import geoip2.database | |
32 | |
33 | |
def main(salt, country_db):
    """Anonymize access log lines read from stdin and write them to stdout.

    Each line is split on the first space. The leading field is treated as
    the client IP and is replaced by an HMAC-SHA1 token computed over the IP
    concatenated with the date portion of the log timestamp, followed by the
    country code looked up for that IP. The rest of the line is written out
    unchanged.

    Lines that cannot be split on a space are passed through as is. Any
    other error (for example a missing '[' timestamp marker) is allowed to
    propagate so that lines which cannot be anonymized are noticed.

    Args:
        salt: Text used as the HMAC key; encoded as UTF-8 before use.
        country_db: Path handed to ``geoip2.database.Reader``.

    Raises:
        ValueError: If a splittable line has no '[' or no ':' after it.
    """
    salt = salt.encode('utf-8')
    reader = geoip2.database.Reader(country_db)

    # Close the database even when a line makes the script fail on purpose.
    try:
        for line in sys.stdin:
            try:
                ip, non_sensitive_info = line.split(' ', 1)
            except ValueError:
                # No space at all: nothing that looks like an IP field.
                print(line, end='')
                continue

            # http://geoip2.readthedocs.io/en/latest/#geoip2.database.Reader.country
            try:
                record = reader.country(ip)
            except geoip2.errors.AddressNotFoundError:
                country = '-'
            else:
                # The record may exist without a country code; fall back to
                # '-' instead of printing the literal 'None'.
                country = record.country.iso_code or '-'

            # 218.215.212.209 - - [04/May/2018:05:20:48 +0000] "GET /...
            date_start = line.index('[') + 1
            # IP might be v4 or v6, so only look for ':' after the '['.
            date_end = line.index(':', date_start)
            date = line[date_start:date_end]

            # https://docs.python.org/2/library/hmac.html
            to_hash = (ip + date).encode('utf-8')
            token = hmac.HMAC(salt, to_hash, hashlib.sha1).hexdigest()

            # non_sensitive_info still carries the original line ending.
            print(token, country, non_sensitive_info, end='')
    finally:
        reader.close()
66 | |
67 | |
if __name__ == '__main__':
    # Command-line entry point: read the salt and the GeoLite2 database path
    # from options (falling back to environment variables), then run main().
    # NOTE(review): description=__doc__ was suggested during review; the
    # short description appears to have been kept deliberately.
    parser = argparse.ArgumentParser(
        description='Filter out sensitive data from access logs',
    )

    parser.add_argument(
        '--salt',
        dest='salt',
        default=os.getenv('ANONYMIZE_SALT'),
        help='Salt for hashing sensitive data, defaults to $ANONYMIZE_SALT'
    )

    # https://dev.maxmind.com/geoip/geoip2/geolite2/
    parser.add_argument(
        '--geolite2-db',
        dest='country_db',
        default=os.getenv('ANONYMIZE_GEOLITE2_DB'),
        help='Path to MaxMind DB file with GeoLite2 Country data, defaults '
             'to $ANONYMIZE_GEOLITE2_DB'
    )

    args = parser.parse_args()

    # Treat empty values like missing ones: an empty salt would make the
    # HMAC key empty, and an empty path cannot name a database file.
    if not args.salt or not args.country_db:
        parser.print_help()
        sys.exit(1)

    main(args.salt, args.country_db)
96 | |
tlucas
2018/05/08 16:08:21
Nit: this blank line is redundant.
l.kryvonos
2018/05/08 16:26:19
Do all of our repositories follow the 'no blank li
tlucas
2018/05/08 16:40:55
Yes - or at least they should (this is also a buil
| |
OLD | NEW |