MaxMind BigQuery GeoIP

Idea here is to have maxmind ip lookup table in bigquery for further logs analysis

So, first of all we need clean everything up

bq rm -f -t rualogs:data.maxmind
rm *.txt *.csv *.zip

Download and unzip fresh data

wget http://geolite.maxmind.com/download/geoip/database/GeoLite2-Country-CSV.zip
wget http://geolite.maxmind.com/download/geoip/database/GeoLite2-ASN-CSV.zip

unzip -jo GeoLite2-Country-CSV.zip
unzip -jo GeoLite2-ASN-CSV.zip

Load data into BigQuery

bq load --skip_leading_rows=1 rualogs:data.tmp_country_ip GeoLite2-Country-Blocks-IPv4.csv "network:string,geoname_id:integer,registered_country_geoname_id:integer,represented_country_geoname_id:integer,is_anonymous_proxy:integer,is_satellite_provider:integer"

bq load --skip_leading_rows=1 rualogs:data.tmp_country_labels GeoLite2-Country-Locations-en.csv "geoname_id:integer,locale_code:string,continent_code:string,continent_name:string,country_iso_code:string,country_name:string"

bq load --skip_leading_rows=1 rualogs:data.tmp_asn GeoLite2-ASN-Blocks-IPv4.csv "network:string,autonomous_system_number:integer,autonomous_system_organization:string"

Now is most important stuff - we are going to process uploaded data and save result as new table

bq query --use_legacy_sql=false --allow_large_results --destination_table=rualogs:data.maxmind "$(cat maxmind.sql)"

After this step is done we may cleanup temp tables

bq rm -f -t rualogs:data.tmp_country_ip
bq rm -f -t rualogs:data.tmp_country_labels
bq rm -f -t rualogs:data.tmp_asn

From now on anywhere needed we may join our maxmind table like so

SELECT l.*, m.country_name, m.autonomous_system_organization
FROM data.log AS l
LEFT JOIN data.maxmind AS m ON CAST(NET.IPV4_TO_INT64(NET.IP_FROM_STRING(l.ip))/(256*256*256) AS INT64) = class_a AND NET.IPV4_TO_INT64(NET.IP_FROM_STRING(l.ip)) BETWEEN start_num AND end_num
WHERE (_PARTITIONTIME = TIMESTAMP(CURRENT_DATE()) OR _PARTITIONTIME IS NULL)
LIMIT 100

maxmind.sql

WITH raw AS (

SELECT
REGEXP_REPLACE(network, r'/\d+$', '') as net,
CAST(REGEXP_REPLACE(network, r'^\d+\.\d+\.\d+\.\d+/', '') AS int64) as mask,
CAST(CASE WHEN (-2 + POW(2, 32 - CAST(REGEXP_REPLACE(network, r'^\d+\.\d+\.\d+\.\d+/', '') AS int64))) < 1 THEN 1 ELSE (-2 + POW(2, 32 - CAST(REGEXP_REPLACE(network, r'^\d+\.\d+\.\d+\.\d+/', '') AS int64))) END AS INT64) as hosts,
* FROM `rualogs.data.tmp_country_ip`

), num AS (

select
NET.IPV4_TO_INT64(NET.IP_FROM_STRING(net)) + 1 as start_num,
NET.IPV4_TO_INT64(NET.IP_FROM_STRING(net)) + hosts as end_num,
* from raw

), ip AS (

select
cast(start_num/(256*256*256) as int64) as class_a,
NET.IP_TO_STRING(NET.IPV4_FROM_INT64(start_num)) as start_ip,
NET.IP_TO_STRING(NET.IPV4_FROM_INT64(end_num)) as end_ip,
* from num

), maxmind AS (

select
ip.*,
l.locale_code, l.continent_code, l.continent_name, l.country_iso_code, l.country_name,
a.autonomous_system_number, a.autonomous_system_organization
from ip
left join rualogs.data.tmp_country_labels l on ip.geoname_id = l.geoname_id
left join rualogs.data.tmp_asn a on ip.network = a.network

)
select * from maxmind