Verified Commit b6503339 authored by anarcat's avatar anarcat
Browse files

handle plain instances more gracefully (team#41263)

I have considered trying to parse the output of gnt-node migrate but
it's a big unknown at the moment. I remember (and found in the Ganeti
source, in lib/cmdlib/instance_migrate.py) a message like:

    Instance's disk layout '%s' does not allow migrations

... but that shows the disk layout, not the instance name, so we can't
just reuse that.

Instead, we handle failures from `gnt-node migrate` by assuming that a
plain instance made the migration fail, and fall back to stopping the
plain instances on that node. We still abort if there are no plain
instances, and there *is* now a possibility that we reboot a host even
if a migrate failed on a DRBD host, but that's a risk I'm willing to
take to simplify things.

The alternative would be to migrate hosts by hand, but I feel more
iffy about this, because it requires multiple round-trips over SSH,
something which could fail if we need user interaction (e.g. Yubikey
press).

So this is getting uglier, unfortunately, but I am hoping it also
happens to work.
parent 832c0284
Loading
Loading
Loading
Loading
+3 −19
Original line number Diff line number Diff line
@@ -103,8 +103,8 @@ def empty_node(node_con, master_host=None):

@task
def stop_instances(
        node_con,
        master_host="fsn-node-01.torproject.org",
        master_con,
        instances=[],
        delay_shutdown=None,
        delay_down=None,
):
@@ -123,19 +123,6 @@ def stop_instances(
    if delay_down is None:
        delay_down = DEFAULT_DELAY_DOWN

    master_con = host.find_context(master_host, config=node_con.config)
    logging.info(
        "finding instances running on %s from %s ", node_con.host, master_con.host
    )
    instances = list(
        _list_instances(
            master_con,
            {
                "pnode": node_con.host,
                "admin_state": "up",
            },
        )
    )
    if not len(instances):
        return []

@@ -184,9 +171,8 @@ def stop_instances(
    # end of FIXME

    logging.info(
        "forcibly stopping all instances (%d) on %s from master %s",
        "forcibly stopping all instances (%d) from master %s",
        len(instances),
        node_con.host,
        master_con.host,
    )

@@ -195,8 +181,6 @@ def stop_instances(
    # ThreadGroup
    master_con.run("gnt-instance shutdown --force-multiple %s" % " ".join(instances))

    return instances


@task
def stop(instance_con, master_host="fsn-node-01.torproject.org"):
+37 −15
Original line number Diff line number Diff line
@@ -282,10 +282,7 @@ def shutdown_and_wait(
        else:
            master_con = host.find_context(master, config=con.config)

            if ganeti_empty:
                # shorter delay, as the node will be empty
                delay_shutdown = 0
                migrated_instances = list(
            affected_instances = list(
                ganeti._list_instances(
                    master_con,
                    {
@@ -294,23 +291,48 @@ def shutdown_and_wait(
                    },
                )
            )

            if ganeti_empty:
                # shorter delay, as the node will be empty
                delay_shutdown = 0
                logging.info(
                    "ganeti node detected, migrating %d instances from %s: %s",
                    len(migrated_instances),
                    len(affected_instances),
                    con.host,
                    " ".join(migrated_instances),
                    " ".join(affected_instances),
                )
                if not ganeti.empty_node(con, master_con):
                    raise Exit("failed to empty node %s, aborting" % con.host)
                    logging.warning("failed to empty node %s trying to find plain instances...", con.host)
                    instances_to_shutdown = list(
                        ganeti._list_instances(
                            master_con,
                            {
                                "pnode": con.host,
                                "admin_state": "up",
                                "disk_template": "plain",
                            },
                        )
                    )

                    if not instances_to_shutdown:
                        raise Exit("no plain instance found, failed to empty node %s, aborting" % con.host)
                    logging.warning(
                        "failed to empty node %s, trying to shutdown plain instances: %s",
                        con.host,
                        instances_to_shutdown,
                    )
                    ganeti.stop_instances(master_con, instances_to_shutdown, delay_shutdown=delay_shutdown)
                    # exclude shutdown hosts from instances to migrate back
                    migrated_instances = [i for i in migrated_instances if i not in instances_to_shutdown]
                    shutdown_instances = instances_to_shutdown
            else:
                logging.info(
                    "ganeti node detected, shutting down instances on %s with %s min delay",
                    con.host,
                    delay_shutdown,
                )
                instances = list(ganeti.stop_instances(con, master_con, delay_shutdown=delay_shutdown))
                for instance in instances:
                    shutdown_instances.append(instance)
                shutdown_instances = affected_instances
                ganeti.stop_instances(master_con, affected_instances, delay_shutdown=delay_shutdown)
                # shorter delay, as the node will be empty
                delay_shutdown = 0
                # raise Exit("failed to shutdown all instances on node %s, aborting" % con.host)