Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: anonymized.py

Issue 29817563: #12025 - Add city information to anonymized.py (Closed)
Patch Set: #12025 - Add city information to anonymized.py Created July 2, 2018, 2:42 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 """Anonymize data in access log lines.
3
4 Read a line from stdin, write it to stdout with the following changes:
5 1. IP (v4 or v6) replaced with a salted hash of the IP and the date
6 2. Country and city information (extracted from IP) added after the salted hash.
7
8 If the country or city information is unavailable in the database,
9 '-' is added instead of ISO 3166-1 alpha-2 country code (like DE) &
10 city name (like 'Berlin').
11
12 Salt and the country/city information database are taken
13 as command line options and default to environment variables.
14
15 Malformed lines are passed on as is, based on the assumption that they don't
16 contain sensitive information. Malformed here means the line couldn't be split
17 on a space character. If it could be split, and an error occurs afterwards
18 (e.g. while trying to parse out the date), the script will fail and exit in
19 order to bring attention to the fact that something might not be getting
20 anonymized.
21 """
22
23 from __future__ import print_function
24 from __future__ import unicode_literals
25
26 import argparse
27 import hashlib
28 import hmac
29 import os
30 import sys
31
32 import geoip2.database
33
def _extract_date(line):
    """Return the date part of the bracketed timestamp in *line*.

    For ``... [04/May/2018:05:20:48 +0000] "GET /...`` this is
    ``04/May/2018``. Raises ValueError if the line has no ``[`` or no
    ``:`` after it.
    """
    date_start = line.index('[') + 1
    # Only look for ':' after the '[': the IP at the start of the line
    # may be v6 and contain colons itself.
    date_end = line.index(':', date_start)
    return line[date_start:date_end]


def _anonymize_line(line, salt, country_reader, city_reader):
    """Return the output fields for one access log line.

    A line that cannot be split on a space is returned unchanged as a
    one-element tuple. Otherwise the result is
    ``(token, country, city, non_sensitive_info)``, where ``token`` is the
    salted hash of the IP and the date.
    """
    try:
        ip, non_sensitive_info = line.split(' ', 1)
    except ValueError:
        return (line,)

    # http://geoip2.readthedocs.io/en/latest/#geoip2.database.Reader.country
    try:
        record = country_reader.country(ip)
    except geoip2.errors.AddressNotFoundError:
        country = None
    else:
        country = record.country.iso_code

    try:
        record = city_reader.city(ip)
    except geoip2.errors.AddressNotFoundError:
        city = None
    else:
        city = record.city.name

    # The attributes are None when the database knows the address but has
    # no value for the field; emit '-' in that case too instead of letting
    # print() write the literal string 'None'.
    country = country or '-'
    # NOTE(review): city names may contain spaces while the output is
    # space-separated -- confirm that downstream parsers can cope.
    city = city or '-'

    # https://docs.python.org/2/library/hmac.html
    to_hash = (ip + _extract_date(line)).encode('utf-8')
    token = hmac.new(salt, to_hash, hashlib.sha1).hexdigest()

    return (token, country, city, non_sensitive_info)


def main(salt, country_db, city_db):
    """Anonymize access log lines from stdin and write them to stdout.

    The leading IP of every line is replaced with an HMAC-SHA1 of the IP
    and the request date, keyed with *salt*, followed by the country code
    and the city name looked up for that IP ('-' when unavailable).

    salt       -- text used as the HMAC key
    country_db -- path to the MaxMind DB file used for country lookups
    city_db    -- path to the MaxMind DB file used for city lookups

    Lines that cannot be split on a space are passed through unchanged.
    Any other error (e.g. ValueError when the date cannot be located, or
    lookup errors other than AddressNotFoundError) propagates on purpose,
    so that lines which might not be getting anonymized are noticed.
    """
    salt = salt.encode('utf-8')

    country_reader = geoip2.database.Reader(country_db)
    try:
        city_reader = geoip2.database.Reader(city_db)
        try:
            for line in sys.stdin:
                # Fields are printed as separate arguments rather than
                # joined, so byte and text strings are never concatenated
                # under Python 2.
                fields = _anonymize_line(
                    line, salt, country_reader, city_reader,
                )
                print(*fields, end='')
        finally:
            # Release the database files even when a line makes us fail.
            city_reader.close()
    finally:
        country_reader.close()
75
76
if __name__ == '__main__':
    # Command line entry point. Every option falls back to an environment
    # variable; all three values must be known before any input is read.
    cli = argparse.ArgumentParser(
        description='Filter out sensitive data from access logs',
    )

    cli.add_argument(
        '--salt',
        dest='salt',
        default=os.getenv('ANONYMIZE_SALT'),
        help='Salt for hashing sensitive data, defaults to $ANONYMIZE_SALT'
    )

    # Both databases come from https://dev.maxmind.com/geoip/geoip2/geolite2/
    # NOTE(review): the country and the city option both default to the
    # same variable, $ANONYMIZE_GEOLITE2_DB -- confirm this is intended.
    cli.add_argument(
        '--geolite2-country-db',
        dest='country_db',
        default=os.getenv('ANONYMIZE_GEOLITE2_DB'),
        help='Path to MaxMind DB file with GeoLite2 Country data, defaults '
             'to $ANONYMIZE_GEOLITE2_DB'
    )
    cli.add_argument(
        '--geolite2-city-db',
        dest='city_db',
        default=os.getenv('ANONYMIZE_GEOLITE2_DB'),
        help='Path to MaxMind DB file with GeoLite2 City data, defaults '
             'to $ANONYMIZE_GEOLITE2_DB'
    )

    options = cli.parse_args()

    # Without a salt or either database nothing can be anonymized: show the
    # usage text and exit with a failure status.
    required = (options.salt, options.country_db, options.city_db)
    if any(value is None for value in required):
        cli.print_help()
        sys.exit(1)

    main(*required)
114
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld