#!/usr/bin/python

import collections
import struct
import itertools
import bisect

def encodeInt(x):
    """Encode a non-negative integer as a variable-length string.

    The value is emitted seven bits at a time, least-significant group
    first, one character per group.  The last character has its high bit
    (0x80) set to mark the end of the number; every earlier character has
    the high bit clear.

    Raises ValueError if x is negative: right-shifting a negative integer
    never yields zero, so the encoding loop would never terminate.
    """
    if x < 0:
        raise ValueError(
            "encodeInt requires a non-negative integer, got %r" % (x,))

    chunks = []
    while True:
        lowbits = x & 0x7f
        x >>= 7
        if not x:
            # No higher bits remain: flag this character as the terminator.
            chunks.append(chr(lowbits | 0x80))
            return "".join(chunks)
        chunks.append(chr(lowbits))

def encodeU32(x):
    """Pack x as a 4-byte unsigned integer in network (big-endian) order."""
    packed = struct.pack("!L", x)
    return packed
def decodeU32(x):
    """Unpack a 4-byte network-order (big-endian) unsigned integer from x."""
    (value,) = struct.unpack("!L", x)
    return value

class GeoIP:
    """Builds a compact encoding of an address-range -> country-code table.

    Typical use: readCSV() loads the ranges (filling gaps so the whole
    32-bit address space is covered), generateCodings() assigns small
    integer codes to country codes and range lengths, and the write*
    methods serialise the tables.
    """

    def __init__(self):
        # Number of ranges seen per country code / per range length.  These
        # frequencies decide which values receive the smallest codes.
        self.rangesPerCountry = collections.defaultdict(int)
        self.rangeSizes = collections.defaultdict(int)
        self.ranges = [] # Tuples of lo, hi, countryname

        self.countryCodes = {} # cc -> integer
        self.rangeLenCodes = {} # range length -> integer
        # The reverse tables are lists indexed by code (generateCodings
        # replaces them with lists), so they start out as empty lists.
        self.countryByCode = [] # integer -> cc
        self.rangeLenByCode = [] # integer -> range length

    def _addRange(self, lo, hi, cc):
        """Record the inclusive range lo..hi for cc and update the counts."""
        rangeLen = hi - lo + 1
        self.rangesPerCountry[cc] += 1
        self.rangeSizes[rangeLen] += 1
        self.ranges.append( (lo, hi, cc) )

    def readCSV(self, f):
        """Load ranges from f, an iterable of CSV lines.

        Double quotes are stripped and each line is split on commas; the
        fields at indices 2, 3 and 4 are taken as the integer start of the
        range, the integer end of the range (inclusive) and the country
        code.  Blank lines are skipped.

        Any gap before the first range, between consecutive ranges, or
        after the last range up to 2**32-1 is recorded as a range with the
        country code "??".

        Raises ValueError if a line has too few fields, a bound is not an
        integer, a range ends before it starts or beyond 2**32-1, or a
        range starts at or before the end of the previous one.
        """
        maxAddr = (1<<32)-1
        lasthi = -1
        for line in f:
            if not line.strip():
                continue
            v = line.replace('"', "").split(",")
            lo, hi, cc = v[2:5]
            lo = int(lo)
            hi = int(hi)
            cc = cc.strip()

            # Reject bad input here; otherwise it would become a filler
            # range of negative length that only fails when it is packed.
            if hi < lo:
                raise ValueError(
                    "range end %d precedes range start %d" % (hi, lo))
            if hi > maxAddr:
                raise ValueError(
                    "range end %d is beyond the 32-bit address space" % hi)
            if lo <= lasthi:
                raise ValueError(
                    "range starting at %d is out of order or overlaps the "
                    "previous range ending at %d" % (lo, lasthi))

            if lo != lasthi + 1:
                self._addRange(lasthi + 1, lo - 1, "??")

            self._addRange(lo, hi, cc)
            lasthi = hi

        if lasthi != maxAddr:
            self._addRange(lasthi + 1, maxAddr, "??")

    def _mktab(self, d):
        """Assign the codes 0..len(d)-1 to the keys of d.

        d maps a value to its frequency.  The most frequent value gets
        code 0; values with equal frequency are ordered by the value itself
        so the result does not depend on dict iteration order.

        Returns (codeByValue, valueByCode): a dict from value to code and a
        list in which the item at index `code` is the value.
        """
        ordered = sorted(d.items(), key=lambda item: (-item[1], item[0]))
        codeByValue = {}
        valueByCode = []
        for code, (val, _) in enumerate(ordered):
            codeByValue[val] = code
            valueByCode.append(val)
        return codeByValue, valueByCode

    def generateCodings(self):
        """Build the code tables for country codes and range lengths."""
        self.countryCodes,self.countryByCode = self._mktab(self.rangesPerCountry)
        self.rangeLenCodes,self.rangeLenByCode = self._mktab(self.rangeSizes)

    def writeCountryCodes(self, f):
        """Write the country-code table to f.

        Layout: the tag "@CC@", the number of entries as a U32, each
        country code in code order, then the tag "@cc@".  The codes are
        written back to back with no separator or length prefix.
        NOTE(review): presumably the reader relies on every code having
        the same length -- confirm against the consumer.
        """
        f.write("@CC@")
        f.write(encodeU32(len(self.countryByCode)))
        for cc in self.countryByCode:
            f.write(cc)
        f.write("@cc@")

    def writeRunlenList(self, f):
        """Write the range-length table to f.

        Layout: the tag "@RL@", the number of entries as a U32, each range
        length in code order as a U32, then the tag "@rl@".
        """
        f.write("@RL@")
        f.write(encodeU32(len(self.rangeLenByCode)))
        for rl in self.rangeLenByCode:
            f.write(encodeU32(rl))
        f.write("@rl@")

    def writeRanges(self, f_ranges, f_idx):
        """Write the encoded ranges to f_ranges and an index to f_idx.

        f_ranges receives the tag "@IP@", the number of ranges as a U32,
        and then, for every range, the encodeInt() form of its range-length
        code followed by the encodeInt() form of its country code, and
        finally the tag "@ip@".  The position bookkeeping assumes f_ranges
        is at offset 0 on entry.

        f_idx receives the tag "@IX@", the number of index entries as a
        U32, one entry before each of the ranges numbered 256, 512, ...,
        and finally the tag "@ix@".  An entry is two U32s: the sum of the
        lengths of all preceding ranges (the start address of the range
        when the ranges are contiguous from 0, as readCSV produces them)
        and the byte offset of that range's record in f_ranges.
        """
        numRanges = len(self.ranges)
        # Entries are written before ranges 256, 512, ... < numRanges, so
        # there are (numRanges-1)//256 of them.  numRanges//256 would be
        # one too many whenever numRanges is an exact multiple of 256.
        numIdxEntries = max(numRanges - 1, 0) // 256

        f_ranges.write("@IP@")
        f_ranges.write(encodeU32(numRanges))
        rangePos = 8 # 4-byte tag plus 4-byte count written so far

        f_idx.write("@IX@")
        f_idx.write(encodeU32(numIdxEntries))
        idxEntries = 0
        ip = 0

        for rangenum, (lo, hi, cc) in enumerate(self.ranges):

            if rangenum and (rangenum % 256) == 0:
                f_idx.write(encodeU32(ip))
                f_idx.write(encodeU32(rangePos))
                idxEntries += 1

            rangelen = hi - lo + 1

            lenCode = encodeInt(self.rangeLenCodes[rangelen])
            countryCode = encodeInt(self.countryCodes[cc])

            ip += rangelen
            rangePos += len(lenCode)+len(countryCode)
            f_ranges.write(lenCode)
            f_ranges.write(countryCode)
            # Sanity check: the tracked offset must match the real one.
            assert f_ranges.tell() == rangePos

        f_ranges.write("@ip@")
        assert idxEntries == numIdxEntries
        f_idx.write("@ix@")

def realignFile(f):
    """Pad f with NUL characters until its position is a multiple of four.

    Nothing is written when the current position is already aligned.
    """
    misalignment = f.tell() & 3
    if misalignment == 0:
        return
    padding = "\0\0\0\0"[:4 - misalignment]
    f.write(padding)

def generate():
    """Convert GeoIPCountryWhois.csv into geoip.idx and geoip.dat.

    geoip.idx receives the country-code table, the range-length table and
    the range index, each section starting on a 4-byte boundary.
    geoip.dat receives the encoded ranges.  All three files are named
    relative to the current working directory.

    Every file is closed even if a step raises.  The output files are
    opened only after the CSV has been read and the codings generated.
    """
    G = GeoIP()
    with open("GeoIPCountryWhois.csv") as csvFile:
        G.readCSV(csvFile)
    G.generateCodings()
    with open("geoip.idx", 'wb') as meta:
        with open("geoip.dat", 'wb') as data:
            G.writeCountryCodes(meta)
            realignFile(meta)
            G.writeRunlenList(meta)
            realignFile(meta)
            G.writeRanges(data, meta)

# Script entry point: build the tables only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    generate()
