Unverified Commit 444b5d32 authored by anarcat

move predict-os.py back into its own project

It doesn't make sense to maintain a free software project on the side
in a wiki. It deserves proper documentation like a README and so on.

I have cleaned up and unarchived the repository at:

https://gitlab.com/anarcat/predict-os

And further development should take place there.
parent e841eb9e
@@ -16,11 +16,9 @@ major release. Here are the currently documented ones:
 <figcaption>
 The above graphic shows the progress of the migration between major
-releases. It can be regenerated with the [[predict-os.py]] script in
-the subdirectory. It pulls information from [[puppet]] to update a
-[[CSV file|data.csv]] to keep track of progress over time. See [[the
-script|predict-os.py]] for details.
+releases. It can be regenerated with the [predict-os](https://gitlab.com/anarcat/predict-os) script. It
+pulls information from [[puppet]] to update a [[CSV file|data.csv]] to
+keep track of progress over time.
 </figure>
 Minor upgrades

predict-os.py deleted
#!/usr/bin/python3
# coding: utf-8
'''predict when major upgrades will complete'''
# Copyright (C) 2016 Antoine Beaupré <anarcat@debian.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import division, absolute_import
from __future__ import print_function, unicode_literals
import argparse
import collections
from datetime import datetime
import io
import logging
import logging.handlers
import os
import os.path
import sys
import tempfile
try:
import pytest
except ImportError: # pragma: no cover
pytest = None
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import requests
import seaborn as sns

__epilog__ = '''This script predicts when major OS upgrades will complete,
based on regular samples stored in a CSV file, to which new records are
added from PuppetDB. It also draws a graph, on the GUI or in a file,
representing the contents of the CSV file and the progress made. This
project is a Python rewrite of the R toolset at
https://gitlab.com/anarcat/predict-os and expects the following Python
packages to be installed: python3-requests python3-seaborn'''
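
# Example invocation (a sketch: the flags are defined in parse_args()
# below, the file names are hypothetical):
#
#   ./predict-os.py --refresh --path data.csv --output graph.png
#
# This fetches the current release counts from PuppetDB, appends them to
# data.csv, prints the predicted completion date and writes the graph to
# graph.png.
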
# the reason this was rewritten in Python was that:
#
# 1. libreoffice is a catastrophe, see the original predict-os for details
# 2. i don't want to learn how to read/write/parse CSV files in R
# 3. i don't want to learn how to make R talk with PuppetDB
# 4. i got tired of chasing the PuppetDB SQL database changes
# 5. i had to use python to massage data anyways
# 6. "code without tests is legacy code" and i don't want legacy code
PUPPETDB_URL = 'http://localhost:8080/pdb/query/v4'
PUPPETDB_QUERY = 'facts[value] { name = "lsbdistcodename" }'
DEFAULT_HEADER = ['Date', 'release', 'count']


def parse_args(args=sys.argv[1:]):
parser = argparse.ArgumentParser(description=__doc__,
epilog=__epilog__)
parser.add_argument('--verbose', '-v', dest='log_level',
action='store_const', const='info', default='warning')
parser.add_argument('--debug', '-d', dest='log_level',
action='store_const', const='debug', default='warning')
parser.add_argument('--test', action='store_true',
help='run self test suite and nothing else')
parser.add_argument('--puppetdb', '-p', default=PUPPETDB_URL,
help='PuppetDB server URL')
parser.add_argument('--query', default=PUPPETDB_QUERY,
help='query returning the list of Debian releases')
parser.add_argument('--path', default='data.csv',
help='CSV datafile that keeps past records')
parser.add_argument('--refresh', '-r', action='store_true',
help='fetch from PuppetDB (default: %(default)s)')
    parser.add_argument('--dryrun', '-n', action='store_true',
                        help='do not write anything, show the graph instead')
parser.add_argument('--output', '-o', type=argparse.FileType('wb'),
default=sys.stdout, help='image to write, default to graphical display or stdout if unavailable') # noqa: E501
parser.add_argument('--source', '-s', default='stretch',
help='major version we are upgrading from')
    return parser.parse_args(args=args)


def main(args, now=None, session=requests):
logging.debug('loading previous records from %s', args.path)
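    # bootstrap an empty datafile containing only the CSV header on first run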
if not os.path.exists(args.path):
with open(args.path, 'w') as fp:
fp.write(','.join(DEFAULT_HEADER))
with open(args.path) as fp:
records = load_csv(fp)
if args.refresh:
logging.info('querying PuppetDB on %s', args.puppetdb)
logging.debug('query: %s', args.query)
new_data = puppetdb_query(args.puppetdb, args.query, session)
logging.info('found %d hosts', len(new_data))
new_record = count_releases(new_data)
        records = add_releases(records, new_record, now)
if not args.dryrun:
with open(args.path, 'w') as fp:
store_csv(fp, records)
records = prepare_records(records)
date = guess_completion_time(records, args.source, now)
print("completion time of %s major upgrades: %s" % (args.source, date))
plot_records(records, date, args)


def test_main(capsys):
test_input = b'''Date,release,count
2019-03-01,stretch,74
2019-08-15,stretch,49
2019-10-07,stretch,43
2019-10-08,stretch,38
'''
expected = '''completion time of stretch major upgrades: 2020-06-25'''
with tempfile.NamedTemporaryFile() as csv:
csv.write(test_input)
csv.flush()
with tempfile.NamedTemporaryFile(suffix='.png') as graph:
args = parse_args(['--path', csv.name, '--output', graph.name])
main(args, '2019-10-08')
assert os.path.getsize(graph.name) > 0
captured = capsys.readouterr()
assert expected in (captured.out + captured.err)


def test_main_refresh():
import betamax
session = requests.Session()
recorder = betamax.Betamax(session, cassette_library_dir='cassettes')
handler = _setup_memory_handler()
with tempfile.NamedTemporaryFile() as csv:
os.unlink(csv.name)
with tempfile.NamedTemporaryFile(suffix='.png') as graph:
args = parse_args(['--path', csv.name, '--output', graph.name,
'--refresh'])
with recorder.use_cassette('puppetdb'):
main(args, '2019-10-08', session)
assert os.path.exists(csv.name)
with open(csv.name) as fp:
assert fp.read() == '''Date,release,count
2019-10-08,buster,40
2019-10-08,stretch,38
2019-10-08,jessie,1
'''
messages = "\n".join([r.message for r in handler.buffer])
assert '''cannot guess completion time''' in messages


def load_csv(fp):
'''load the data from the CSV'''
return pd.read_csv(fp)


SAMPLE_CSV = '''Date,release,count
2019-01-01,buster,32
2019-01-01,stretch,10
2019-02-02,buster,37
2019-02-02,stretch,5
2019-03-03,buster,50
2019-03-03,stretch,1
'''
SAMPLE_DF_REPR = '''         Date  release  count
0  2019-01-01   buster     32
1  2019-01-01  stretch     10
2  2019-02-02   buster     37
3  2019-02-02  stretch      5
4  2019-03-03   buster     50
5  2019-03-03  stretch      1'''

def test_load_csv():
'''just a sanity check that pandas works as expected'''
fp = io.StringIO(SAMPLE_CSV)
res = load_csv(fp)
assert repr(res) == SAMPLE_DF_REPR
return res


def store_csv(fp, records):
'''write the CSV file back to the given stream'''
return fp.write(records.to_csv(index=False))


def test_store_csv():
    '''just a sanity check that we do the CSV roundtrip cleanly'''
fp = io.StringIO()
data = test_load_csv()
store_csv(fp, data)
fp.seek(0)
assert fp.read() == SAMPLE_CSV


def puppetdb_query(url, query, session=requests):
'''get the data from PuppetDB'''
resp = session.get(url, data={'query': query})
resp.raise_for_status()
return resp.json()


def test_puppetdb_query():
'''simulate a PuppetDB query'''
import betamax
session = requests.Session()
recorder = betamax.Betamax(session, cassette_library_dir='cassettes')
with recorder.use_cassette('puppetdb'):
json = puppetdb_query(PUPPETDB_URL, PUPPETDB_QUERY, session=session)
assert len(json) > 0
return json


def count_releases(data):
    '''parse the data returned by PuppetDB

    This counts the number of entries for each release.

    >>> d = [{'value': 'buster'}, {'value': 'stretch'}, {'value': 'buster'}]
    >>> count_releases(d)
    {'buster': 2, 'stretch': 1}
    '''
total = collections.defaultdict(int)
for item in data:
logging.debug('checking item %s', item)
total[item['value']] += 1
return dict(total)


def add_releases(data, new_data, date=None):
    '''take the existing data and append the new record'''
if date is None:
date = datetime.today().strftime('%Y-%m-%d')
series = [{'Date': date, 'release': release, 'count': count}
for release, count in new_data.items()]
return data.append(series, ignore_index=True)


def test_add_releases():
'''check that we can add to the pandas dataframe as expected'''
data = test_load_csv()
new_data = {'buster': 33, 'stretch': 9}
d = add_releases(data, new_data, '2019-04-05')
    assert SAMPLE_DF_REPR + '''
6  2019-04-05   buster     33
7  2019-04-05  stretch      9''' == repr(d)


# cargo-culted from https://stackoverflow.com/questions/48860428/passing-datetime-like-object-to-seaborn-lmplot # noqa: E501
@plt.FuncFormatter
def fake_dates(x, pos):
""" Custom formater to turn floats into e.g., 2016-05-08"""
return matplotlib.dates.num2date(x).strftime('%Y-%m-%d')
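
# fake_dates doubles as the x-axis formatter in plot_records() below and
# is also called directly in guess_completion_time() to render the
# predicted date as YYYY-MM-DD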


def plot_records(records, guessed_date, args):
'''draw the actual graph, on the GUI or in a file as args dictates'''
sns.set(color_codes=True)
# ci=False because it looks kind of wrong
graph = sns.lmplot(x='datenum', y='count', hue='release',
data=records, ci=False)
    # turn the numeric dates back into human-readable labels
graph.ax.xaxis.set_major_formatter(fake_dates)
graph.ax.set_title('Debian major upgrades to %s planned completion by %s' %
(args.source, guessed_date))
graph.ax.set_xlabel('date')
# labels overlap otherwise
graph.ax.tick_params(labelrotation=45)
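    # in a dry run, or when no output file was given and a display (or an
    # interactive terminal) is available, show the graph on screen;
    # otherwise save it in the format implied by the output file extension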
if (args.dryrun or
(args.output == sys.stdout and
(sys.stdout.isatty() or 'DISPLAY' in os.environ))):
plt.show() # pragma: no cover
else:
_, ext = os.path.splitext(args.output.name)
plt.savefig(args.output, format=ext[1:], bbox_inches='tight')


def prepare_records(records):
    '''various massaging required by other tools

    This currently only stores the numeric date for seaborn and
    regression processing.
    '''
records['datenum'] = matplotlib.dates.datestr2num(records['Date'])
return records


def guess_completion_time(records, source, now=None):
    '''take the given records and guess the estimated completion time

    :param DataFrame records: the records, as loaded from the CSV file
        by load_csv()
    :param str source: the release we are upgrading from; if no matching
        records are found, a warning is logged and 'N/A' is returned
    :param str now: reference date for sanity checks, defaults to today
    :returns: completion date, formatted as a string (YYYY-MM-DD)

    >>> records = prepare_records(test_load_csv())
    >>> guess_completion_time(records, 'stretch')
    '2019-03-09'
    '''
if now is None:
now = datetime.today().strftime('%Y-%m-%d')
subdf = records[records['release'] == source]
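    # the estimation itself: fit the sample dates as a linear function of
    # the host count, then evaluate that polynomial at count=0, i.e.
    # extrapolate the date at which no hosts run the source release anymore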
try:
fit = np.polyfit(subdf['count'], subdf['datenum'], 1)
prediction = np.poly1d(fit)(0)
except (TypeError, ValueError) as e:
logging.warning("cannot guess completion time: %s", e)
date = 'N/A'
else:
date = fake_dates(prediction, None)
if date < now:
logging.warning('suspicious completion time in the past, data may be incomplete: %s', date) # noqa: E501
return date


def test_guess_completion_time():
test_input = '''Date,release,count
2019-03-01,stretch,74
2019-08-15,stretch,49
2019-10-07,stretch,43
2019-10-08,stretch,38
'''
expected = '''2020-06-25'''
fp = io.StringIO(test_input)
records = prepare_records(pd.read_csv(fp))
date = guess_completion_time(records, 'stretch', '2019-10-08')
assert date == expected


def _setup_memory_handler():
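    '''hook a MemoryHandler into the root logger so that tests can
    inspect the records logged by the code under test'''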
handler = logging.handlers.MemoryHandler(1000)
handler.setLevel('DEBUG')
logger = logging.getLogger('')
logger.setLevel('DEBUG')
logger.addHandler(handler)
return handler


def test_weird_completion_time(capsys):
micah_data = '''Date,release,count
2019-10-08,stretch,83
2019-10-08,buster,3
2019-10-08,sid,1
2019-10-08,jessie,2'''
fp = io.StringIO(micah_data)
records = prepare_records(pd.read_csv(fp))
with pytest.warns(np.RankWarning):
handler = _setup_memory_handler()
guess_completion_time(records, 'stretch', '2019-10-08')
messages = "\n".join([r.message for r in handler.buffer])
assert 'suspicious completion time in the past' in messages


if __name__ == '__main__': # pragma: no cover
args = parse_args()
logging.basicConfig(format='%(message)s', level=args.log_level.upper())
if args.test:
logging.info('running test suite')
if pytest is None:
logging.error('test suite requires pytest to run properly')
sys.exit(1)
shortname, _ = os.path.splitext(os.path.basename(__file__))
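        # run pytest on this very file, also collecting doctests and
        # measuring coverage of the module itself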
sys.exit(pytest.main(['--doctest-modules', '--cov=%s' % shortname,
__file__]))
try:
main(args)
except Exception as e:
logging.error('unexpected error: %s', e)
if args.log_level == 'debug':
logging.warning('starting debugger, type "c" and enter to continue') # noqa: E501
            import traceback
            import pdb
traceback.print_exc()
pdb.post_mortem()
sys.exit(1)
raise e