Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code

Side by Side Diff: anonymized.py

Issue 29817563: #12025 - Add city information to anonymized.py (Closed)
Patch Set: Created June 27, 2018, 1:23 p.m.
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 """Anonymize data in access log lines.
3
4 Read a line from stdin, write it to stdout with the following changes:
5 1. IP (v4 or v6) replaced with a salted hash of the IP and the date
6 2. Country and city information (extracted from IP) added after the salted hash.
7
8 If the country or city information is unavailable in the database, '-' is added instead
9 of the ISO 3166-1 alpha-2 country code (like 'DE') or the city name, respectively.
10
11 The salt and the paths to the country and city information databases are taken as
12 command line options and default to environment variables.
13
14 Malformed lines are passed on as is, based on the assumption that they don't
15 contain sensitive information. Malformed here means the line couldn't be split
16 on space character. If it could be split, and an error occurs afterwards
17 (e.g. while trying to parse out the date), the script will fail and exit in
18 order to bring attention to the fact that something might not be getting
19 anonymized.
20 """
21
22 from __future__ import print_function
23 from __future__ import unicode_literals
24
25 import argparse
26 import hashlib
27 import hmac
28 import os
29 import sys
30
31 import geoip2.database
32
def _anonymize_lines(lines, salt, country_reader, city_reader):
    """Write the anonymized form of every line in `lines` to stdout.

    Args:
        lines: Iterable of access log lines, each starting with an IP.
        salt: HMAC key as bytes.
        country_reader: Open reader whose `country(ip)` is used for lookups.
        city_reader: Open reader whose `city(ip)` is used for lookups.
    """
    for line in lines:
        try:
            ip, non_sensitive_info = line.split(' ', 1)
        except ValueError:
            # Line has no space at all: treated as malformed and passed
            # through untouched.
            print(line, end='')
            continue

        # http://geoip2.readthedocs.io/en/latest/#geoip2.database.Reader.country
        try:
            record = country_reader.country(ip)
        except geoip2.errors.AddressNotFoundError:
            country = '-'
        else:
            # A record can exist without an ISO code; fall back to '-'
            # instead of printing the text "None".
            country = record.country.iso_code or '-'

        try:
            record = city_reader.city(ip)
        except geoip2.errors.AddressNotFoundError:
            city = '-'
        else:
            # Same fallback when the record carries no city name.
            # NOTE(review): city names may contain spaces, which would shift
            # the space-separated output columns - confirm downstream
            # consumers cope with that.
            city = record.city.name or '-'

        # 218.215.212.209 - - [04/May/2018:05:20:48 +0000] "GET /...
        # The date is everything between '[' and the first ':' after it.
        # Searching from '[' (not from the line start) matters because an
        # IPv6 address contains ':' itself. A missing '[' or ':' raises
        # ValueError on purpose, see main().
        date_start = line.index('[') + 1
        date_end = line.index(':', date_start)
        date = line[date_start:date_end]

        # https://docs.python.org/2/library/hmac.html
        to_hash = (ip + date).encode('utf-8')
        token = hmac.HMAC(salt, to_hash, hashlib.sha1).hexdigest()

        # non_sensitive_info still ends with the original newline.
        print(token, country, city, non_sensitive_info, end='')


def main(salt, country_db, city_db):
    """Anonymize access log lines read from stdin and write them to stdout.

    The leading IP of each line is replaced with an HMAC-SHA1 token of the
    IP and the request date, keyed with `salt`, followed by the country code
    and the city name looked up for that IP ('-' when unavailable). Lines
    that cannot be split on a space are passed through unchanged.

    Args:
        salt: Text used as the HMAC key; it is UTF-8 encoded before use.
        country_db: Path to the MaxMind DB file used for country lookups.
        city_db: Path to the MaxMind DB file used for city lookups.

    Raises:
        ValueError: If a line could be split but contains no '[' or no ':'
            after it, so the date cannot be extracted. This is not caught,
            so that the script stops instead of emitting data that might
            not be anonymized. Lookup errors other than
            AddressNotFoundError propagate for the same reason.
    """
    salt = salt.encode('utf-8')

    country_reader = geoip2.database.Reader(country_db)
    try:
        city_reader = geoip2.database.Reader(city_db)
        try:
            _anonymize_lines(sys.stdin, salt, country_reader, city_reader)
        finally:
            # Release the database handles even when a line makes us fail.
            city_reader.close()
    finally:
        country_reader.close()
74
75
if __name__ == '__main__':
    # Command line entry point: collect the salt and both database paths
    # from options (falling back to environment variables) and run main().
    arg_parser = argparse.ArgumentParser(
        description='Filter out sensitive data from access logs',
    )

    arg_parser.add_argument(
        '--salt',
        dest='salt',
        default=os.environ.get('ANONYMIZE_SALT'),
        help='Salt for hashing sensitive data, defaults to $ANONYMIZE_SALT',
    )

    # https://dev.maxmind.com/geoip/geoip2/geolite2/
    # NOTE(review): both database options below default to the same
    # environment variable, $ANONYMIZE_GEOLITE2_DB - confirm that one file
    # is really meant to serve both the country and the city lookups.
    arg_parser.add_argument(
        '--geolite2-country-db',
        dest='country_db',
        default=os.environ.get('ANONYMIZE_GEOLITE2_DB'),
        help='Path to MaxMind DB file with GeoLite2 Country data, '
             'defaults to $ANONYMIZE_GEOLITE2_DB',
    )
    arg_parser.add_argument(
        '--geolite2-city-db',
        dest='city_db',
        default=os.environ.get('ANONYMIZE_GEOLITE2_DB'),
        help='Path to MaxMind DB file with GeoLite2 City data, '
             'defaults to $ANONYMIZE_GEOLITE2_DB',
    )

    options = arg_parser.parse_args()

    # All three settings are mandatory; without them show usage and fail.
    required = (options.salt, options.country_db, options.city_db)
    if any(value is None for value in required):
        arg_parser.print_help()
        sys.exit(1)

    main(options.salt, options.country_db, options.city_db)
113
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld