Commit 942e8d24 authored by Rasmus Dahlberg
Browse files

Add another digest script

$ ./scripts/digest2.py -i /home/rgdd/Downloads/2023-04-03-ct-sans/au-mel/*.stdout /home/rgdd/Downloads/2023-04-03-ct-sans/us-nyc/*.stdout /home/rgdd/Downloads/2023-04-03-ct-sans/de-fra/*.stdout 2>&1
digest2.py:26 INFO: found 3330 onions via Onion-Location
digest2.py:27 INFO: found 3077 via HTTP headers
digest2.py:28 INFO: found 281 via HTML tags
digest2.py:29 INFO: found 28 via both HTTP and HTML
parent fc6866e2
Loading
Loading
Loading
Loading

scripts/digest2.py

0 → 100755
+161 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3

# Help text for the command line interface: passed to argparse.ArgumentParser
# as the program description in the __main__ block below.
__program_description ='''
A script that digests the output of onion-grab.  Meant to be used for sorting
out the number of onion addresses and how they were discovered via O-L.  It
is digest "2" because this was added after discovering a redirect bug.   So,
this output gives a better view of how common HTTP and HTML config really is.
'''

import sys
import argparse
import logging

log = logging.getLogger(__name__)

import base64
import hashlib

def main(args):
    '''
    Reads all lines from the files listed in args.input_file, digests them
    with parse_input(), and logs the resulting counts at info level.
    '''
    input_lines = []
    for path in args.input_file:
        with open(path) as handle:
            input_lines.extend(handle)

    onions, via_http, via_html = parse_input(input_lines)

    # Every counted onion was seen via HTTP, HTML, or both, so the overlap
    # follows from inclusion-exclusion.
    via_both = via_http + via_html - onions

    log.info(f'found {onions} onions via Onion-Location')
    log.info(f'found {via_http} via HTTP headers')
    log.info(f'found {via_html} via HTML tags')
    log.info(f'found {via_both} via both HTTP and HTML')

def parse_input(lines):
    '''
    Counts the unique onion addresses found in lines and how they were found.

    Each line is parsed with parse_line(), and every value it yields is reduced
    to an onion address with trimScheme(), trimPath(), and trim_onion().  When
    any of these steps raises, the error is logged at debug level and the rest
    of that line is skipped; values already recorded from the line are kept.

    Returns a tuple (numOnion, numHTTP, numHTML): the number of unique onion
    addresses, the number of them seen in an HTTP value, and the number of them
    seen in an HTML value.  An address seen both ways counts in both.
    '''
    via_http = set()
    via_html = set()
    for line in lines:
        try:
            # Strip only the line terminator.  Unconditionally dropping the
            # last character corrupts a final line that has no newline.
            line = line.rstrip("\r\n")
            for value, isHTTP in parse_line(line):
                addr = trim_onion(trimPath(trimScheme(value)))
                if isHTTP:
                    via_http.add(addr)
                else:
                    via_html.add(addr)
        except Exception as e:
            # Best effort: malformed lines are expected in collected data.
            log.debug(f'"{line}": {e}')

    return len(via_http | via_html), len(via_http), len(via_html)

def parse_line(line):
    '''
    Parses a single line of onion-grab output.

    Line format is:

    <domain> http=[value] html=[value]

    where at least one of http or html should have a value.  Note: there has
    been no vetting of what <value> is.

    Returns a list of (value, isHTTP) tuples, one for each non-empty value.
    isHTTP is True for the http field and False for the html field.  The
    domain is not part of the output.

    Raises Exception if the line is not exactly three space-separated fields,
    or if the http or html field contains no "=".
    '''
    fields = line.split(" ")
    if len(fields) != 3:
        raise Exception('invalid line split')

    ret = []
    for field, name, isHTTP in ((fields[1], "http", True), (fields[2], "html", False)):
        # Split on the first "=" only: the value is typically a URL and may
        # contain "=" itself, e.g., in a query string.
        _, sep, value = field.partition("=")
        if not sep:
            raise Exception(f'invalid {name} split')
        if value:
            ret.append((value, isHTTP))

    return ret

def trimScheme(url):
    '''
    Removes required http:// or https:// scheme from url.  The scheme is
    matched case-insensitively; the rest of url is returned unchanged.

    Raises Exception if url starts with neither scheme.
    '''
    for scheme in ("http://", "https://"):
        # Lowercase only the candidate prefix so the slice below always
        # refers to the same characters of the original url.
        if url[:len(scheme)].lower() == scheme:
            return url[len(scheme):]

    raise Exception('no http or https scheme')

def trimPath(url):
    '''
    Trims the path off from the url, along with any query or fragment.

    The url is expected to have its scheme removed already.  Everything from
    the first "/", "?", or "#" is dropped, so that a query or fragment that
    follows the host directly (no path) does not end up in the host name.
    '''
    end = len(url)
    for delimiter in "/?#":
        index = url.find(delimiter)
        if index != -1:
            end = min(end, index)
    return url[:end]

def trim_onion(host):
    '''
    Parses host as a v3 onion address, ports and subdomains are trimmed.

    The host name is lowercased before it is checked, so that the same onion
    address written with different letter case yields the same return value.

    Returns "<56-character label>.onion" in lowercase.

    Raises Exception (or a subclass, e.g., from int() or base64.b32decode())
    if host has more than one ":", the port is not an integer in [1, 65535],
    the final label is not "onion", the label before it is not 56 characters,
    or the decoded address is rejected by assert_v3().
    '''
    parts = host.split(":")
    if len(parts) > 2:
        raise Exception('invalid host name')
    if len(parts) == 2:
        port = int(parts[1])
        if port < 1 or port > 2**16 - 1:
            raise Exception(f'port number not in [1, {2**16 - 1}]')

    # Host names are case-insensitive: normalize once so that the checks
    # below and the returned address do not depend on the input's case.
    labels = parts[0].lower().split(".")
    if len(labels) < 2:
        raise Exception('too few labels to be an onion address')
    if labels[-1] != "onion":
        raise Exception('the final DNS label must be "onion"')
    if len(labels[-2]) != 56:
        raise Exception('the DNS label before ".onion" must be 56 bytes')

    # b32decode expects the uppercase base32 alphabet by default.
    assert_v3(base64.b32decode(labels[-2].upper().encode('UTF-8')))
    return ".".join(labels[-2:])

def assert_v3(blob):
    '''
    Checks that blob is laid out as a v3 onion address: bytes 0-31 are the
    public key, bytes 32-33 the checksum, and byte 34 the version.  Raises
    Exception if the version is not 3 or if the checksum does not match the
    first two bytes of SHA3-256(".onion checksum" | pubkey | version).

    https://gitweb.torproject.org/torspec.git/tree/rend-spec-v3.txt#n2240
    '''
    pubkey, checksum, version = blob[:32], blob[32:34], blob[34:35]
    if version[0] != 3:
        raise Exception(f'invalid version: {version[0]}')

    hasher = hashlib.sha3_256()
    for part in (b'.onion checksum', pubkey, version):
        hasher.update(part)
    expected = hasher.digest()

    matches = checksum[0] == expected[0] and checksum[1] == expected[1]
    if not matches:
        raise Exception(f'invalid checksum')

# Command line entry point: parse arguments, configure logging, run main().
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__program_description)
    parser.add_argument("-v", "--verbosity", type=str, default="info",
            help="logging verbosity, select from debug, info, warning, error, and critical")
    parser.add_argument('-i','--input-file', nargs='+', required=True,
            help='input file with collected data')

    args = parser.parse_args()

    # Look up the level constant by name in the logging module.  Anything that
    # does not resolve to a numeric level is reported as a usage error rather
    # than surfacing as a KeyError traceback.
    level = getattr(logging, args.verbosity.upper(), None)
    if not isinstance(level, int):
        parser.error(f'invalid verbosity: {args.verbosity}')

    logging.basicConfig(level=level,
            format='%(filename)s:%(lineno)d %(levelname)s: %(message)s')

    sys.exit(main(args))