#!/usr/bin/python3

'''rough textual overview of resource usage on everything'''

import argparse
import json
import logging
import os
import re
import subprocess

import requests

__epilog__ = '''
This script will connect to LDAP, PuppetDB and Prometheus to extract
some metrics aimed at communicating general information about all our
machines, in a monthly report. It's part of the "record monthly
metrics" suggestion from Thomas Limoncelli in his Ops report card
questionnaire, section 3. Make sure to provide HTTP_USER and HTTP_PASS
through the environment to connect to the Prometheus server. The
Puppet and LDAP servers are reached by first jumping to them over SSH
and running queries locally. The output of this script should be valid
markdown. It should also append to a CSV file to profile variations in
over time, but that has not been implemented yet.'''

# Missing metrics from the questionnaire, how many...
#
# sysadmins?
# users?
# CPU cores?
# open tickets?
# closed ticket in the last month?
# tickets processed per sysadmin?
# SLA metrics fulfilled?
# backup disk usage?
# overall disk usage?
# nagios green?
# metrics counts?
#
# This would obviously be better as a Grafana Dashboard, but this was
# whipped up quickly for a monthly report.

# Unit prefixes shared by the size formatters below.
_IEC_UNITS = ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi']
_SI_UNITS = ['', 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y']


class LoggingAction(argparse.Action):
    """change log level on the fly

    The logging system should be initialized before this, using
    `basicConfig`.

    Example usage:

        parser.add_argument(
            "-v",
            "--verbose",
            action=LoggingAction,
            const="INFO",
            help="enable verbose messages",
        )
        parser.add_argument(
            "-d",
            "--debug",
            action=LoggingAction,
            const="DEBUG",
            help="enable debugging messages",
        )
    """

    def __init__(self, *args, **kwargs):
        """setup the action parameters

        This enforces a selection of logging levels. It also checks if
        const is provided, in which case we assume it's an argument
        like `--verbose` or `--debug` without an argument.
        """
        kwargs["choices"] = logging._nameToLevel.keys()
        if "const" in kwargs:
            # a flag carrying its own level consumes no value
            kwargs["nargs"] = 0
        super().__init__(*args, **kwargs)

    def __call__(self, parser, ns, values, option):
        """if const was specified it means argument-less parameters"""
        if self.const:
            logging.getLogger("").setLevel(self.const)
        else:
            logging.getLogger("").setLevel(values)


def parse_args():
    """parse the command line and return the resulting namespace

    The `{HTTP_USER}` and `{HTTP_PASS}` placeholders of the
    `--prometheus` URL are filled in from the environment variables of
    the same name (empty string when unset).
    """
    parser = argparse.ArgumentParser(description=__doc__, epilog=__epilog__)
    parser.add_argument('--puppetdb', default="puppetdb-01.torproject.org",
                        help='PuppetDB server hostname, default %(default)s')
    parser.add_argument('--ldap', default="alberti.torproject.org",
                        help='LDAP server hostname, default %(default)s')
    parser.add_argument('--prometheus',
                        default="https://{HTTP_USER}:{HTTP_PASS}@prometheus.torproject.org/api/v1",  # noqa: E501
                        help='Prometheus API endpoint, default %(default)s')
    parser.add_argument(
        "-v",
        "--verbose",
        action=LoggingAction,
        const="INFO",
        help="enable verbose messages",
    )
    parser.add_argument(
        "-d",
        "--debug",
        action=LoggingAction,
        const="DEBUG",
        help="enable debugging messages",
    )
    args = parser.parse_args()
    args.prometheus = args.prometheus.format(
        HTTP_USER=os.environ.get('HTTP_USER', ''),
        HTTP_PASS=os.environ.get('HTTP_PASS', ''),
    )
    return args


class PromRequestor(object):
    """run instant queries against the Prometheus HTTP API

    A single `requests.Session` is kept on the instance and used for
    every query.
    """

    def __init__(self):
        """create the HTTP session used by `query`"""
        self.session = requests.Session()

    def query(self, args, query):
        """evaluate a PromQL expression and return its first sample

        :param args: parsed arguments; `args.prometheus` is the API
            base URL
        :param query: the PromQL expression to evaluate
        :returns: the value of the first result as a float, or None
            when the result set is empty
        :raises requests.HTTPError: on a non-2xx response
        """
        # let requests build the query string so characters that are
        # meaningful in URLs ("+", "&", "#", ...) are escaped properly
        resp = self.session.get(args.prometheus + "/query",
                                params={"query": query})
        resp.raise_for_status()
        payload = resp.json()
        logging.debug("JSON response: %r", payload)
        results = payload['data']['result']
        if not results:
            return None
        return float(results[0]['value'][1])


def host_count_puppet(args):
    """count the nodes known to PuppetDB

    The query is run with curl on the PuppetDB host itself, over SSH.
    """
    puppetdb_data = subprocess.check_output(
        ['ssh', args.puppetdb,
         'curl -s -G "http://localhost:8080/pdb/query/v4/nodes"'])
    return len(json.loads(puppetdb_data))


def host_count_ldap(args):
    """count the host entries in LDAP

    ldapsearch is run on the LDAP host over SSH; every output line
    starting with "dn: host" counts as one host.
    """
    ldap_data = subprocess.check_output(
        ['ssh', args.ldap,
         'ldapsearch -ZZ -vLx -H ldap://db.torproject.org -b "ou=hosts,dc=torproject,dc=org" 2>/dev/null'])  # noqa: E501
    return len(re.findall(r'^dn: host', ldap_data.decode('ascii'), re.M))


def sizeof_fmt(num, suffix='B', units=None, power=None, sep=' ',
               precision=2, sign=False):
    """format the given size as a human-readable size

    :param num: the quantity to format
    :param suffix: text appended after the unit prefix
    :param units: unit prefixes, smallest first; defaults to the IEC
        prefixes ('', 'Ki', 'Mi', ...) when None
    :param power: ratio between two consecutive units; defaults to
        1024 when None
    :param sep: separator between the number and the unit
    :param precision: number of decimals shown for non-integer values
    :param sign: prepend "+" to strictly positive values
    """
    if units is None:
        units = _IEC_UNITS
    if power is None:
        power = 1024
    prefix = '+' if sign and num > 0 else ''
    for unit in units[:-1]:
        # compare the *rounded* value so that something which would
        # display as e.g. "1000.00 k" moves on to the next unit
        if abs(round(num, precision)) < power:
            if isinstance(num, int):
                # still an untouched integer: show it without decimals
                return "{}{}{}{}{}".format(prefix, num, sep, unit, suffix)
            return "{}{:3.{}f}{}{}{}".format(prefix, num, precision,
                                             sep, unit, suffix)
        num /= float(power)
    # larger than the largest unit: express it in that unit anyway
    return "{}{:.{}f}{}{}{}".format(prefix, num, precision,
                                    sep, units[-1], suffix)


def sizeof_fmt_iec(num, suffix='B', sep=' ', precision=2, sign=False):
    """format the given size with binary (power of 1024) prefixes"""
    return sizeof_fmt(num, suffix=suffix, power=1024,
                      units=_IEC_UNITS,
                      sep=sep, precision=precision, sign=sign)


def sizeof_fmt_decimal(num, suffix='B', sep=' ', precision=2, sign=False):
    """format the given size with decimal (power of 1000) prefixes

    # no rounding necessary for those
    >>> sizeof_fmt_decimal(0)
    '0 B'
    >>> sizeof_fmt_decimal(1)
    '1 B'
    >>> sizeof_fmt_decimal(142)
    '142 B'
    >>> sizeof_fmt_decimal(999)
    '999 B'
    >>> # rounding starts here
    >>> sizeof_fmt_decimal(1000)
    '1.00 kB'
    >>> # should be rounded away
    >>> sizeof_fmt_decimal(1001)
    '1.00 kB'
    >>> # should be rounded down
    >>> sizeof_fmt_decimal(1234)
    '1.23 kB'
    >>> # should be rounded up
    >>> sizeof_fmt_decimal(1235)
    '1.24 kB'
    >>> # rounded down as well
    >>> sizeof_fmt_decimal(1010)
    '1.01 kB'
    >>> # rounded down
    >>> sizeof_fmt_decimal(999990000)
    '999.99 MB'
    >>> # rounded down
    >>> sizeof_fmt_decimal(999990001)
    '999.99 MB'
    >>> # rounded up to next unit
    >>> sizeof_fmt_decimal(999995000)
    '1.00 GB'
    >>> # and all the remaining units, megabytes
    >>> sizeof_fmt_decimal(10**6)
    '1.00 MB'
    >>> # gigabytes
    >>> sizeof_fmt_decimal(10**9)
    '1.00 GB'
    >>> # terabytes
    >>> sizeof_fmt_decimal(10**12)
    '1.00 TB'
    >>> # petabytes
    >>> sizeof_fmt_decimal(10**15)
    '1.00 PB'
    >>> # exabytes
    >>> sizeof_fmt_decimal(10**18)
    '1.00 EB'
    >>> # zettabytes
    >>> sizeof_fmt_decimal(10**21)
    '1.00 ZB'
    >>> # yottabytes
    >>> sizeof_fmt_decimal(10**24)
    '1.00 YB'
    >>> # negative value
    >>> sizeof_fmt_decimal(-1)
    '-1 B'
    >>> # negative value with rounding
    >>> sizeof_fmt_decimal(-1010)
    '-1.01 kB'
    """
    return sizeof_fmt(num, suffix=suffix, power=1000,
                      units=_SI_UNITS,
                      sep=sep, precision=precision, sign=sign)


def _fmt_int(value):
    """render a metric as a whole number, "?" when there is no data"""
    return "?" if value is None else "%d" % value


def _fmt_float(value):
    """render a metric with two decimals, "?" when there is no data"""
    return "?" if value is None else "%0.2f" % value


def _fmt_size(value, formatter):
    """render a metric through a size formatter, "?" when there is no data"""
    return "?" if value is None else formatter(value)


def main():
    """print the report, as a markdown list, on standard output

    Metrics for which Prometheus returns no sample are shown as "?"
    instead of aborting the whole report.
    """
    logging.basicConfig()
    args = parse_args()
    prom = PromRequestor()

    def metric(expr):
        return prom.query(args, expr)

    print(" * hosts in Puppet: %d, LDAP: %d, Prometheus exporters: %s"
          % (host_count_puppet(args),
             host_count_ldap(args),
             _fmt_int(metric('sum(up)'))))
    print(" * number of Apache servers monitored: %s, hits per second: %s"
          % (_fmt_int(metric('count(apache_up)')),
             # XXX: wtf vs http_requests_total
             _fmt_int(metric('sum(rate(apache_accesses_total[30d]))'))))
    print(" * number of self-hosted nameservers: %s, mail servers: %s"
          % (_fmt_int(metric('sum(bind_up)')),
             _fmt_int(metric('count(count(postfix_queue_length) by (instance))'))))  # noqa: E501
    print(" * pending upgrades: %s, reboots: %s"
          % (_fmt_int(metric('sum(apt_upgrades_pending)')),
             _fmt_int(metric('sum(node_reboot_required)'))))
    print(" * average load: %s, memory available: %s/%s, running processes: %s"  # noqa: E501
          % (_fmt_float(metric('avg(node_load15)')),
             _fmt_size(metric('sum(node_memory_MemFree_bytes)'),
                       sizeof_fmt_iec),
             _fmt_size(metric('sum(node_memory_MemTotal_bytes)'),
                       sizeof_fmt_iec),
             _fmt_int(metric('sum(node_procs_running)'))))
    print(" * disk free/total: %s/%s"
          % (_fmt_size(metric("sum(node_filesystem_avail_bytes{fstype!='tmpfs',fstype!='shm'})"),  # noqa: E501
                       sizeof_fmt_iec),
             _fmt_size(metric("sum(node_filesystem_size_bytes{fstype!='tmpfs',fstype!='shm'})"),  # noqa: E501
                       sizeof_fmt_iec)))
    print(" * bytes sent: %s/s, received: %s/s"
          % (_fmt_size(metric('sum(rate(node_network_transmit_bytes_total[30d]))'),  # noqa: E501
                       sizeof_fmt_decimal),
             _fmt_size(metric('sum(rate(node_network_receive_bytes_total[30d]))'),  # noqa: E501
                       sizeof_fmt_decimal)))
    print(" * planned bookworm upgrades completion date: ???")
    print(""" * [GitLab tickets][]: ? tickets including...
   * open: ?
   * icebox: ?
   * backlog: ?
   * next: ?
   * doing: ?
   * (closed: ?)
""")
    print(" [Gitlab tickets]: https://gitlab.torproject.org/tpo/tpa/team/-/boards")  # noqa: E501
    print()
    print("Upgrade prediction graph lives at https://gitlab.torproject.org/tpo/tpa/team/-/wikis/howto/upgrades/bookworm/")  # noqa: E501
    print()
    print("Now also available as the main Grafana dashboard. Head to , change the time period to 30 days, and wait a while for results to render.")  # noqa: E501
    # TODO: talk with the gitlab API to extract those numbers
    # automatically


if __name__ == '__main__':
    main()