
Ganeti is software designed to facilitate the management of virtual machines (KVM or Xen). It helps you move virtual machine instances from one node to another, create an instance with DRBD replication on another node, do live migrations from one node to another, and so on.

Tutorial

Listing virtual machines (instances)

This will show the running guests, known as "instances":

gnt-instance list

Accessing serial console

Our instances provide a serial console, starting in GRUB. To access it, run

gnt-instance console test01.torproject.org

To exit, use ^] -- that is, Control-<Closing Bracket>.

How-to

Glossary

In Ganeti, a physical machine is called a node and a virtual machine is an instance. One node is elected to be the master, and all commands should be run from it. Nodes are interconnected through a private network that is used to communicate commands and synchronise disks (with howto/drbd). Instances are normally assigned two nodes, a primary and a secondary: the primary is where the virtual machine actually runs and the secondary acts as a hot failover.

See also the more extensive glossary in the Ganeti documentation.
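
For example, to confirm which node is the master and to list all the nodes, the following can be run (a quick sketch, normally run on the master node):

gnt-cluster getmaster
gnt-node list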

Adding a new instance

This command creates a new guest, or "instance" in Ganeti's vocabulary, with a 10G root disk, 2G swap, 20G spare on SSD, 800G on HDD, 8GB of RAM and 2 CPU cores:

gnt-instance add \
  -o debootstrap+buster \
  -t drbd --no-wait-for-sync \
  --net 0:ip=pool,network=gnt-fsn \
  --no-ip-check \
  --no-name-check \
  --disk 0:size=10G \
  --disk 1:size=2G,name=swap \
  --disk 2:size=20G \
  --disk 3:size=800G,vg=vg_ganeti_hdd \
  --backend-parameters memory=8g,vcpus=2 \
  test-01.torproject.org

This is the same without the HDD partition, in the gnt-chi cluster:

gnt-instance add \
  -o debootstrap+buster \
  -t drbd --no-wait-for-sync \
  --net 0:ip=pool,network=gnt-chi-01 \
  --no-ip-check \
  --no-name-check \
  --disk 0:size=10G \
  --disk 1:size=2G,name=swap \
  --disk 2:size=20G \
  --backend-parameters memory=8g,vcpus=2 \
  test-01.torproject.org

What that does

This configures the following:

  • redundant disks in a DRBD mirror; use -t plain instead of -t drbd for tests, as that avoids syncing the disks and will speed things up considerably (even with --no-wait-for-sync some operations block on synced mirrors). In that case, only one node should be provided as the argument to --node.
  • four disks: a root and a spare partition on the default VG (SSD), a swap disk, and a large disk on the HDD VG. If you don't specify a swap device, a 512MB swapfile is created in /swapfile instead. TODO: configure disk 2 and 3 automatically in installer. (/var and /srv?)
  • 8GB of RAM with 2 virtual CPUs
  • an IP allocated from the public gnt-fsn pool: gnt-instance add will print the IPv4 address it picked to stdout. The IPv6 address can be found in /var/log/ganeti/os/ on the primary node of the instance, see below.
  • with the test-01.torproject.org hostname

Next steps

To find the root password, ssh host key fingerprints, and the IPv6 address, run this on the node where the instance was created, for example:

egrep 'root password|configured eth0 with|SHA256' $(ls -tr /var/log/ganeti/os/* | tail -1) | grep -v $(hostname)

We copy root's authorized keys into the new instance, so you should be able to log in with your token. You will be required to change the root password immediately. Pick something nice and document it in tor-passwords.

Also set reverse DNS for both IPv4 and IPv6 in Hetzner's Robot (check under Servers -> vSwitch -> IPs) or in our own reverse zone files (if delegated).

Then follow howto/new-machine.

Known issues

  • usrmerge: that procedure creates a machine with usrmerge! See bug 34115 before proceeding.

  • allocator failures: Note that you may need to use the --node parameter to pick which nodes the instance should end up on, otherwise Ganeti will choose for you (and may fail). Use, for example, --node fsn-node-01:fsn-node-02 to use node-01 as primary and node-02 as secondary. The allocator can sometimes fail if it is upset about something in the cluster, for example:

     Can's find primary node using iallocator hail: Request failed: No valid allocation solutions, failure reasons: FailMem: 2, FailN1: 2

    This situation is covered by ticket 33785. If this problem occurs, it might be worth rebalancing the cluster.

  • ping failure: there is a bug in ganeti-instance-debootstrap which misconfigures ping (among other things), see bug 31781. It's currently patched in our version of the Debian package, but that patch might disappear if Debian upgrades the package without shipping our patch.

Modifying an instance

CPU, memory changes

It's possible to change the IP, CPU, or memory allocation of an instance using the gnt-instance modify command:

gnt-instance modify -B vcpus=2 test1.torproject.org
gnt-instance modify -B memory=4g test1.torproject.org
gnt-instance reboot test1.torproject.org

IP address change

IP address changes require a full stop and will require manual changes to the /etc/network/interfaces* files:

gnt-instance modify --net 0:modify,ip=116.202.120.175 test1.torproject.org
gnt-instance stop test1.torproject.org
gnt-instance start test1.torproject.org
gnt-instance console test1.torproject.org
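
Inside the instance, the /etc/network/interfaces* files then need to be updated by hand to match the new address. A minimal sketch, assuming a single eth0 interface; the prefix length and gateway must be taken from the actual gnt-fsn network configuration:

iface eth0 inet static
    address 116.202.120.175/<prefix>
    gateway <gateway of the gnt-fsn network>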

Resizing disks

The gnt-instance grow-disk command can be used to change the size of the underlying device:

gnt-instance grow-disk --absolute test1.torproject.org 0 16g
gnt-instance reboot test1.torproject.org

The number 0 in this context indicates the first disk of the instance. The amount specified is the final disk size (because of the --absolute flag). In the above example, the final disk size will be 16GB. To add space to the existing disk, remove the --absolute flag:

gnt-instance grow-disk test1.torproject.org 0 16g
gnt-instance reboot test1.torproject.org

In the above example, 16GB will be ADDED to the disk. Be careful with resizes, because it's not possible to revert such a change: grow-disk does not support shrinking disks. The only way to revert the change is by exporting / importing the instance.

Then the filesystem needs to be resized inside the VM:

ssh root@test1.torproject.org 

Use pvs to display information about the physical volumes:

root@cupani:~# pvs
PV         VG        Fmt  Attr PSize   PFree   
/dev/sdc   vg_test   lvm2 a--  <8.00g  1020.00m

Resize the physical volume to take up the new space:

pvresize /dev/sdc

Use lvs to display information about logical volumes:

# lvs
LV            VG               Attr       LSize    Pool Origin Data%  Meta%  Move Log Cpy%Sync Convert
var-opt       vg_test-01     -wi-ao---- <10.00g                                                    
test-backup vg_test-01_hdd   -wi-ao---- <20.00g            

Use lvextend to add space to the volume:

lvextend -l '+100%FREE' vg_test-01/var-opt

Finally resize the filesystem:

resize2fs /dev/vg_test-01/var-opt

See also the LVM howto.

Adding disks

A disk can be added to an instance with the modify command as well. This, for example, will add a 100GB disk to the test1 instance on the vg_ganeti_hdd volume group, which is "slow" rotating disks:

gnt-instance modify --disk add:size=100g,vg=vg_ganeti_hdd test1.torproject.org
gnt-instance reboot test1.torproject.org
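
The new disk then still needs to be initialized inside the instance. A sketch, assuming the new disk shows up as /dev/sdd and that we want it managed by LVM like the other disks (the device, VG and LV names here are illustrative):

pvcreate /dev/sdd
vgcreate vg_test_hdd /dev/sdd
lvcreate -l '100%FREE' -n srv-backup vg_test_hdd
mkfs.ext4 /dev/vg_test_hdd/srv-backup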

Adding a network interface on the rfc1918 vlan

We have a vlan on which VMs without public addresses sit. Its vlan ID is 4002 and it's backed by the Hetzner vswitch vSwitch #11973 "fsn-gnt-rfc1918-traffic". Note that traffic on this vlan travels in the clear between nodes.

To add an instance to this vlan, give it a second network interface using

gnt-instance modify --net add:link=br0,vlan=4002,mode=openvswitch test1.torproject.org
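
The new interface then needs to be configured inside the instance, typically by hand in the /etc/network/interfaces* files. A sketch, assuming the interface shows up as eth1 and that an unused address was picked from the RFC1918 range used on that vlan:

auto eth1
iface eth1 inet static
    address <address from the vlan's RFC1918 range>/<prefix>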

Destroying an instance

This totally deletes the instance, including all mirrors and everything; be very careful with it:

gnt-instance remove test01.torproject.org

Getting information

Information about an instance can be found in the rather verbose gnt-instance info:

root@fsn-node-01:~# gnt-instance info tb-build-02.torproject.org
- Instance name: tb-build-02.torproject.org
  UUID: 8e9f3ca6-204f-4b6c-8e3e-6a8fda137c9b
  Serial number: 5
  Creation time: 2020-12-15 14:06:41
  Modification time: 2020-12-15 14:07:31
  State: configured to be up, actual state is up
  Nodes: 
    - primary: fsn-node-03.torproject.org
      group: default (UUID 8c32fd09-dc4c-4237-9dd2-3da3dfd3189e)
    - secondaries: fsn-node-04.torproject.org (group default, group UUID 8c32fd09-dc4c-4237-9dd2-3da3dfd3189e)
  Operating system: debootstrap+buster

A quicker command shows just the primary and secondary nodes for a given instance:

gnt-instance info tb-build-02.torproject.org | grep -A 3 Nodes

An equivalent command will show the primary and secondary for all instances, along with extra information (like the CPU count, memory and disk usage):

gnt-instance list -o pnode,snodes,name,be/vcpus,be/memory,disk_usage,disk_template,status | sort

It can be useful to run this in a loop to see changes:

watch -n5 -d 'gnt-instance list -o pnode,snodes,name,be/vcpus,be/memory,disk_usage,disk_template,status | sort'

Disk operations (DRBD)

Instances should be set up using the DRBD backend, in which case you should probably take a look at howto/drbd if you have problems with that. Ganeti handles most of the logic there, so that should generally not be necessary.

Evaluating cluster capacity

This will list instances repeatedly, but also show their assigned memory, and compare it with the node's capacity:

gnt-instance list -o pnode,name,be/vcpus,be/memory,disk_usage,disk_template,status | sort &&
echo &&
gnt-node list

The latter does not show disk usage for secondary volume groups (see upstream issue 1379); for a complete picture of disk usage, use:

gnt-node list-storage

The gnt-cluster verify command will also check to see if there's enough space on secondaries to account for the failure of a node. Healthy output looks like this:

root@fsn-node-01:~# gnt-cluster verify
Submitted jobs 48030, 48031
Waiting for job 48030 ...
Fri Jan 17 20:05:42 2020 * Verifying cluster config
Fri Jan 17 20:05:42 2020 * Verifying cluster certificate files
Fri Jan 17 20:05:42 2020 * Verifying hypervisor parameters
Fri Jan 17 20:05:42 2020 * Verifying all nodes belong to an existing group
Waiting for job 48031 ...
Fri Jan 17 20:05:42 2020 * Verifying group 'default'
Fri Jan 17 20:05:42 2020 * Gathering data (2 nodes)
Fri Jan 17 20:05:42 2020 * Gathering information about nodes (2 nodes)
Fri Jan 17 20:05:45 2020 * Gathering disk information (2 nodes)
Fri Jan 17 20:05:45 2020 * Verifying configuration file consistency
Fri Jan 17 20:05:45 2020 * Verifying node status
Fri Jan 17 20:05:45 2020 * Verifying instance status
Fri Jan 17 20:05:45 2020 * Verifying orphan volumes
Fri Jan 17 20:05:45 2020 * Verifying N+1 Memory redundancy
Fri Jan 17 20:05:45 2020 * Other Notes
Fri Jan 17 20:05:45 2020 * Hooks Results

A sick node would have said something like this instead:

Mon Oct 26 18:59:37 2009 * Verifying N+1 Memory redundancy
Mon Oct 26 18:59:37 2009   - ERROR: node node2: not enough memory to accommodate instance failovers should node node1 fail

See the Ganeti manual for a more extensive example.

Also note the hspace -L command, which can tell you how many instances can be created in a given cluster. It uses the "standard" instance template defined in the cluster (which we haven't configured yet).

Moving instances and failover

Ganeti is smart about assigning instances to nodes. There's also a command (hbal) to automatically rebalance the cluster (see below). If for some reason hbal doesn’t do what you want or you need to move things around for other reasons, here are a few commands that might be handy.

Make an instance switch to using its secondary:

gnt-instance migrate test1.torproject.org

Make all instances on a node switch to their secondaries:

gnt-node migrate test1.torproject.org

The migrate command does a "live" migration, which should avoid any downtime. It might be preferable to actually shut down the machine for some reason (for example if we actually want to reboot because of a security upgrade), or we might not be able to live-migrate because the node is down. In this case, we do a failover:

gnt-instance failover test1.torproject.org

The gnt-node evacuate command can also be used to "empty" a given node altogether, in case of an emergency:

gnt-node evacuate -I . fsn-node-02.torproject.org

Similarly, the gnt-node failover command can be used to hard-recover from a completely crashed node:

gnt-node failover fsn-node-02.torproject.org

Note that you might need the --ignore-consistency flag if the node is unresponsive.
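
For example, to hard-recover everything off an unresponsive fsn-node-02 (a sketch; double-check the node really is down first, as skipping consistency checks on a node that later comes back can leave instances running twice):

gnt-node failover --ignore-consistency fsn-node-02.torproject.org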

Importing external libvirt instances

Assumptions:

  • INSTANCE: name of the instance being migrated, the "old" one being outside the cluster and the "new" one being the one created inside the cluster (e.g. chiwui.torproject.org)

  • SPARE_NODE: a ganeti node with free space (e.g. fsn-node-03.torproject.org) where the INSTANCE will be migrated

  • MASTER_NODE: the master ganeti node (e.g. fsn-node-01.torproject.org)

  • KVM_HOST: the machine which we migrate the INSTANCE from

  • the INSTANCE has only root and swap partitions

  • the SPARE_NODE has space in /srv/ to host all the virtual machines to import; to check, use:

     fab -H crm-ext-01.torproject.org,crm-int-01.torproject.org,forrestii.torproject.org,nevii.torproject.org,rude.torproject.org,troodi.torproject.org,vineale.torproject.org libvirt.du -p kvm3.torproject.org | sed '/-swap$/d;s/ .*$//' <f | awk '{s+=$1} END {print s}'

    You will very likely need to create a /srv big enough for this, for example:

     lvcreate -L 300G vg_ganeti -n srv-tmp &&
     mkfs /dev/vg_ganeti/srv-tmp &&
     mount /dev/vg_ganeti/srv-tmp /srv

Import procedure:

  1. pick a viable SPARE NODE to import the INSTANCE (see "evaluating cluster capacity" above, when in doubt) and find on which KVM HOST the INSTANCE lives

  2. copy the disks, without downtime:

    ./ganeti -v -H $INSTANCE libvirt-import  --ganeti-node $SPARE_NODE --libvirt-host $KVM_HOST
  3. copy the disks again, this time suspending the machine:

    ./ganeti -v -H $INSTANCE libvirt-import  --ganeti-node $SPARE_NODE --libvirt-host $KVM_HOST --suspend --adopt
  4. renumber the host:

    ./ganeti -v -H $INSTANCE renumber-instance --ganeti-node $SPARE_NODE
  5. test services by changing your /etc/hosts, possibly warning service admins:

    Subject: $INSTANCE IP address change planned for Ganeti migration

    I will soon migrate this virtual machine to the new Ganeti cluster. This will involve an IP address change which might affect the service.

    Please let me know if there are any problems you can think of. In particular, do let me know if any internal (inside the server) or external (outside the server) services hardcode the IP address of the virtual machine.

    A test instance has been setup. You can test the service by adding the following to your /etc/hosts:

    116.202.120.182 $INSTANCE
    2a01:4f8:fff0:4f:266:37ff:fe32:cfb2 $INSTANCE
  6. destroy test instance:

    gnt-instance remove $INSTANCE
  7. lower TTLs to 5 minutes. This procedure varies a lot according to the service, but generally, if all DNS entries are CNAMEs pointing to the main machine domain name, the TTL can be lowered by adding a dnsTTL attribute to the LDAP entry for this host. For example, this sets the TTL to 5 minutes:

    dnsTTL: 300

    Then to make the changes immediate, you need the following commands:

    ssh root@alberti.torproject.org sudo -u sshdist ud-generate &&
    ssh root@nevii.torproject.org ud-replicate

    Warning: if you migrate one of the hosts ud-ldap depends on, this can fail and not only the TTL will not update, but it might also fail to update the IP address in the below procedure. See ticket 33766 for details.

  8. shutdown original instance and redo migration as in step 3 and 4:

    fab -H $INSTANCE reboot.halt-and-wait --delay-shutdown 60 --reason='migrating to new server' &&
    ./ganeti -v -H $INSTANCE libvirt-import  --ganeti-node $SPARE_NODE --libvirt-host $KVM_HOST --adopt &&
    ./ganeti -v -H $INSTANCE renumber-instance --ganeti-node $SPARE_NODE
  9. final test procedure

    TODO: establish host-level test procedure and run it here.

  10. switch to DRBD, still on the Ganeti MASTER NODE:

    gnt-instance stop $INSTANCE &&
    gnt-instance modify -t drbd $INSTANCE &&
    gnt-instance failover -f $INSTANCE &&
    gnt-instance start $INSTANCE
    The above can sometimes fail if the allocator is upset about something in the cluster, for example:

    Can's find secondary node using iallocator hail: Request failed: No valid allocation solutions, failure reasons: FailMem: 2, FailN1: 2

    This situation is covered by [ticket 33785](https://bugs.torproject.org/33785). To work around the allocator, you can specify a secondary node directly:

    gnt-instance modify -t drbd -n fsn-node-04.torproject.org $INSTANCE &&
    gnt-instance failover -f $INSTANCE &&
    gnt-instance start $INSTANCE

TODO: move into fabric, maybe in a `libvirt-import-live` or `post-libvirt-import` job that would also do the renumbering below
  1. change IP address in the following locations:

    • LDAP (ipHostNumber field, but also change the physicalHost and l fields!). Also drop the dnsTTL attribute while you're at it.

    • Puppet (grep in tor-puppet source, run puppet agent -t; ud-replicate on pauli)

    • DNS (grep in tor-dns source, puppet agent -t; ud-replicate on nevii)

    • nagios (don't forget to change the parent)

    • reverse DNS (upstream web UI, e.g. Hetzner Robot)

    • grep for the host's IP address on itself:

       grep -r -e 78.47.38.227  -e 2a01:4f8:fff0:4f:266:37ff:fe77:1ad8 /etc
       grep -r -e 78.47.38.227  -e 2a01:4f8:fff0:4f:266:37ff:fe77:1ad8 /srv
    • grep for the host's IP on all hosts:

       cumin-all-puppet
       cumin-all 'grep -r -e 78.47.38.227  -e 2a01:4f8:fff0:4f:266:37ff:fe77:1ad8 /etc'
TODO: move those jobs into fabric
  1. retire old instance (only a tiny part of howto/retire-a-host):

    ./retire -H $INSTANCE retire-instance --parent-host $KVM_HOST

  2. update the Nextcloud spreadsheet to remove the machine from the KVM host

  3. warn users about the migration, for example:

To: tor-project@lists.torproject.org
Subject: cupani AKA git-rw IP address changed

The main git server, cupani, is the machine you connect to when you push or pull git repositories over ssh to git-rw.torproject.org. That machine has been migrated to the new Ganeti cluster.

This required an IP address change from:

78.47.38.228 2a01:4f8:211:6e8:0:823:4:1

to:

116.202.120.182 2a01:4f8:fff0:4f:266:37ff:fe32:cfb2

DNS has been updated and preliminary tests show that everything is mostly working. You will get a warning about the IP address change when connecting over SSH, which will go away after the first connection.

Warning: Permanently added the ED25519 host key for IP address '116.202.120.182' to the list of known hosts.

That is normal. The SSH fingerprints of the host did not change.

Please do report any other anomaly using the normal channels:

https://gitlab.torproject.org/anarcat/wikitest/-/wikis/doc/how-to-get-help/

The service was unavailable for about an hour during the migration.

Importing external libvirt instances, manual

This procedure is now easier to accomplish with the Fabric tools written especially for this purpose. Use the above procedure instead. This is kept for historical reference.

Assumptions:

  • INSTANCE: name of the instance being migrated, the "old" one being outside the cluster and the "new" one being the one created inside the cluster (e.g. chiwui.torproject.org)
  • SPARE_NODE: a ganeti node with free space (e.g. fsn-node-03.torproject.org) where the INSTANCE will be migrated
  • MASTER_NODE: the master ganeti node (e.g. fsn-node-01.torproject.org)
  • KVM_HOST: the machine which we migrate the INSTANCE from
  • the INSTANCE has only root and swap partitions

Import procedure:

  1. pick a viable SPARE NODE to import the instance (see "evaluating cluster capacity" above, when in doubt), log in to the three servers, and set the proper environment everywhere, for example:

    MASTER_NODE=fsn-node-01.torproject.org
    SPARE_NODE=fsn-node-03.torproject.org
    KVM_HOST=kvm1.torproject.org
    INSTANCE=test.torproject.org
  2. establish VM specs, on the KVM HOST:

    • disk space in GiB:

      for disk in /srv/vmstore/$INSTANCE/*; do
          printf "$disk: "
          echo "$(qemu-img info --output=json $disk | jq '."virtual-size"') / 1024 / 1024 / 1024" | bc -l
      done
    • number of CPU cores:

      sed -n '/<vcpu/{s/[^>]*>//;s/<.*//;p}' < /etc/libvirt/qemu/$INSTANCE.xml
    • memory, converting from KiB to GiB (the libvirt value is assumed to be in KiB):

      echo "$(sed -n '/<memory/{s/[^>]*>//;s/<.*//;p}' < /etc/libvirt/qemu/$INSTANCE.xml) /1024 /1024" | bc -l

      TODO: make sure the memory line is in KiB and that the number makes sense.

    • on the INSTANCE, find the swap device UUID so we can recreate it later:

      blkid -t TYPE=swap -s UUID -o value
  3. setup a copy channel, on the SPARE NODE:

    ssh-agent bash
    ssh-add /etc/ssh/ssh_host_ed25519_key
    cat /etc/ssh/ssh_host_ed25519_key.pub

    on the KVM HOST:

    echo "$KEY_FROM_SPARE_NODE" >> /etc/ssh/userkeys/root
  4. copy the .qcow file(s) over, from the KVM HOST to the SPARE NODE:

    rsync -P $KVM_HOST:/srv/vmstore/$INSTANCE/$INSTANCE-root /srv/
    rsync -P $KVM_HOST:/srv/vmstore/$INSTANCE/$INSTANCE-lvm /srv/ || true

    Note: it's possible there is not enough room in /srv: in the base Ganeti installs, everything is in the same root partition (/) which will fill up if the instance is (say) over ~30GiB. In that case, create a filesystem in /srv:

    (mkdir /root/srv && mv /srv/* /root/srv) || true &&
    lvcreate -L 200G vg_ganeti -n srv &&
    mkfs /dev/vg_ganeti/srv &&
    echo "/dev/vg_ganeti/srv /srv ext4 rw,noatime,errors=remount-ro 0 2" >> /etc/fstab &&
    mount /srv &&
    ( mv /root/srv/* ; rmdir /root/srv )

    This partition can be reclaimed once the VM migrations are completed, as it needlessly takes up space on the node.

  5. on the SPARE NODE, create and initialize a logical volume with the predetermined size:

    lvcreate -L 4GiB -n $INSTANCE-swap vg_ganeti
    mkswap --uuid $SWAP_UUID /dev/vg_ganeti/$INSTANCE-swap
    lvcreate -L 20GiB -n $INSTANCE-root vg_ganeti
    qemu-img convert /srv/$INSTANCE-root  -O raw /dev/vg_ganeti/$INSTANCE-root
    lvcreate -L 40GiB -n $INSTANCE-lvm vg_ganeti_hdd
    qemu-img convert /srv/$INSTANCE-lvm  -O raw /dev/vg_ganeti_hdd/$INSTANCE-lvm

    Note how we assume two disks above, but the instance might have a different configuration that would require changing the above. This common configuration has an LVM disk separate from the "root" disk, the former being on a HDD, but the HDD is sometimes completely omitted and sizes can differ.

    Sometimes it might be worth using pv to get progress on long transfers:

    qemu-img convert /srv/$INSTANCE-lvm -O raw /srv/$INSTANCE-lvm.raw
    pv /srv/$INSTANCE-lvm.raw | dd of=/dev/vg_ganeti_hdd/$INSTANCE-lvm bs=4k

    TODO: ideally, the above procedure (and many steps below as well) would be automatically deduced from the disk listing established in the first step.

  6. on the MASTER NODE, create the instance, adopting the LV:

    gnt-instance add -t plain \
        -n fsn-node-03 \
        --disk 0:adopt=$INSTANCE-root \
        --disk 1:adopt=$INSTANCE-swap \
        --disk 2:adopt=$INSTANCE-lvm,vg=vg_ganeti_hdd \
        --backend-parameters memory=2g,vcpus=2 \
        --net 0:ip=pool,network=gnt-fsn \
        --no-name-check \
        --no-ip-check \
        -o debootstrap+default \
        $INSTANCE
  7. cross your fingers and watch the party:

    gnt-instance console $INSTANCE
  8. IP address change on new instance:

    edit /etc/hosts and /etc/network/interfaces by hand and add the IPv4 and IPv6 addresses. The IPv4 configuration can be found in:

      gnt-instance info $INSTANCE

    The IPv6 address can be guessed by concatenating 2a01:4f8:fff0:4f:: and the IPv6 link-local address without the fe80:: prefix. For example: a link-local address of fe80::266:37ff:fe65:870f/64 should yield the following configuration:

      iface eth0 inet6 static
          accept_ra 0
          address 2a01:4f8:fff0:4f:266:37ff:fe65:870f/64
          gateway 2a01:4f8:fff0:4f::1

    TODO: reuse gnt-debian-interfaces from the ganeti puppet module script here?

  9. functional tests: change your /etc/hosts to point to the new server and see if everything still kind of works

  10. shutdown original instance

  11. resync and reconvert image, on the Ganeti MASTER NODE:

    gnt-instance stop $INSTANCE

    on the Ganeti node:

    rsync -P $KVM_HOST:/srv/vmstore/$INSTANCE/$INSTANCE-root /srv/ &&
    qemu-img convert /srv/$INSTANCE-root  -O raw /dev/vg_ganeti/$INSTANCE-root &&
    rsync -P $KVM_HOST:/srv/vmstore/$INSTANCE/$INSTANCE-lvm /srv/ &&
    qemu-img convert /srv/$INSTANCE-lvm  -O raw /dev/vg_ganeti_hdd/$INSTANCE-lvm
  12. switch to DRBD, still on the Ganeti MASTER NODE:

    gnt-instance modify -t drbd $INSTANCE
    gnt-instance failover $INSTANCE
    gnt-instance startup $INSTANCE
  13. redo IP address change in /etc/network/interfaces and /etc/hosts

  14. final functional test

  15. change IP address in the following locations:

    • nagios (don't forget to change the parent)
    • LDAP (ipHostNumber field, but also change the physicalHost and l fields!)
    • Puppet (grep in tor-puppet source, run puppet agent -t; ud-replicate on pauli)
    • DNS (grep in tor-dns source, puppet agent -t; ud-replicate on nevii)
    • reverse DNS (upstream web UI, e.g. Hetzner Robot)
  16. decommission old instance (howto/retire-a-host)

Troubleshooting

  • if boot takes a long time and you see a message like this on the console:

     [  *** ] A start job is running for dev-disk-by\x2duuid-484b5...26s / 1min 30s)

    ... which is generally followed by:

     [DEPEND] Dependency failed for /dev/disk/by-…6f4b5-f334-4173-8491-9353d4f94e04.
     [DEPEND] Dependency failed for Swap.

    it means the swap device UUID wasn't set up properly and does not match the one provided in /etc/fstab. That is probably because you missed the mkswap --uuid step documented above.
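
    To recover, the swap signature can be recreated with the UUID that /etc/fstab expects (a sketch; the device name here is illustrative and must be adapted to the actual swap disk):

     mkswap --uuid <UUID from /etc/fstab> /dev/sdb
     swapon -a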

References

  • Upstream docs have the canonical incantation:

     gnt-instance add -t plain -n HOME_NODE ... --disk 0:adopt=lv_name[,vg=vg_name] INSTANCE_NAME
  • DSA docs also use disk adoption and have a procedure to migrate to DRBD

  • Riseup docs suggest creating a VM without installing, shutting down and then syncing

Ganeti supports importing and exporting from the Open Virtualization Format (OVF), but unfortunately libvirt does not seem to support exporting to OVF. There's a virt-convert tool which can import OVF, but not the reverse. The libguestfs library also has a converter, but it doesn't support exporting to OVF or anything else Ganeti can load directly.

So people have written their own conversion tools or their own conversion procedure.

Ganeti also supports file-backed instances but "adoption" is specifically designed for logical volumes, so it doesn't work for our use case.

Rebooting

Ganeti nodes need special care when rebooting, as we can accomplish zero-downtime reboots on those machines. The reboot script in tsa-misc takes care of the special steps involved (which is basically to empty a node before rebooting it).

Such a reboot should be run interactively, inside a tmux or screen session. It currently takes over 15 minutes to complete, depending on the size of the cluster (in terms of core memory usage).

Once the reboot is completed, all instances might end up on a single machine, and the cluster might need to be rebalanced, see below. (Note: the update script should eventually do that, see ticket 33406).

Rebalancing a cluster

After a reboot or a downtime, all instances might end up on the same machine. This is normally handled by the reboot script, but it might be desirable to do this by hand if there was a crash or another special condition.

This can be easily corrected with this command, which will spread instances around the cluster to balance it:

hbal -L -C -v -X

This will automatically move the instances around and rebalance the cluster. Here's an example run on a small cluster:

root@fsn-node-01:~# gnt-instance list
Instance                          Hypervisor OS                 Primary_node               Status  Memory
loghost01.torproject.org          kvm        debootstrap+buster fsn-node-02.torproject.org running   2.0G
onionoo-backend-01.torproject.org kvm        debootstrap+buster fsn-node-02.torproject.org running  12.0G
static-master-fsn.torproject.org  kvm        debootstrap+buster fsn-node-02.torproject.org running   8.0G
web-fsn-01.torproject.org         kvm        debootstrap+buster fsn-node-02.torproject.org running   4.0G
web-fsn-02.torproject.org         kvm        debootstrap+buster fsn-node-02.torproject.org running   4.0G
root@fsn-node-01:~# hbal -L -X
Loaded 2 nodes, 5 instances
Group size 2 nodes, 5 instances
Selected node group: default
Initial check done: 0 bad nodes, 0 bad instances.
Initial score: 8.45007519
Trying to minimize the CV...
    1. onionoo-backend-01 fsn-node-02:fsn-node-01 => fsn-node-01:fsn-node-02   4.98124611 a=f
    2. loghost01          fsn-node-02:fsn-node-01 => fsn-node-01:fsn-node-02   1.78271883 a=f
Cluster score improved from 8.45007519 to 1.78271883
Solution length=2
Got job IDs 16345
Got job IDs 16346
root@fsn-node-01:~# gnt-instance list
Instance                          Hypervisor OS                 Primary_node               Status  Memory
loghost01.torproject.org          kvm        debootstrap+buster fsn-node-01.torproject.org running   2.0G
onionoo-backend-01.torproject.org kvm        debootstrap+buster fsn-node-01.torproject.org running  12.0G
static-master-fsn.torproject.org  kvm        debootstrap+buster fsn-node-02.torproject.org running   8.0G
web-fsn-01.torproject.org         kvm        debootstrap+buster fsn-node-02.torproject.org running   4.0G
web-fsn-02.torproject.org         kvm        debootstrap+buster fsn-node-02.torproject.org running   4.0G

In the above example, you should notice that the web-fsn instances both ended up on the same node. That's because the balancer did not know that they should be distributed. A special configuration was done, below, to avoid that problem in the future. But as a workaround, instances can also be moved by hand and the cluster re-balanced.

Also notice that -X does not show the job output; use ganeti-watch-jobs for that, in another terminal. See the job inspection section for more details on that.

Redundant instances distribution

Some instances are redundant across the cluster and should not end up on the same node. A good example are the web-fsn-01 and web-fsn-02 instances which, in theory, would serve similar traffic. If they end up on the same node, it might flood the network on that machine or at least defeats the purpose of having redundant machines.

The way to ensure they get distributed properly by the balancing algorithm is to "tag" them. For the web nodes, for example, this was performed on the master:

gnt-cluster add-tags htools:iextags:service
gnt-instance add-tags web-fsn-01.torproject.org service:web-fsn
gnt-instance add-tags web-fsn-02.torproject.org service:web-fsn

This tells Ganeti that web-fsn is an "exclusion tag" and the optimizer will not try to schedule instances with those tags on the same node.

To see which tags are present, use:

# gnt-cluster list-tags
htools:iextags:service

You can also find which objects a given tag is assigned to with:

# gnt-cluster search-tags service
/cluster htools:iextags:service
/instances/web-fsn-01.torproject.org service:web-fsn
/instances/web-fsn-02.torproject.org service:web-fsn

IMPORTANT: a previous version of this article mistakenly indicated that a new cluster-level tag had to be created for each service. That method did not work. The hbal manpage explicitly mentions that the cluster-level tag is a prefix that can be used to create multiple such tags. This configuration also happens to be simpler and easier to use...

HDD migration restrictions

Cluster balancing works well until there are inconsistencies between how nodes are configured. In our case, some nodes have HDDs (Hard Disk Drives, AKA spinning rust) and others do not. Therefore, it's not possible to move an instance from a node with a disk allocated on the HDD to a node that does not have such a disk.

Yet somehow the allocator is not smart enough to tell, and you will get the following error when doing an automatic rebalancing:

one of the migrate failed and stopped the cluster balance: Can't create block device: Can't create block device <LogicalVolume(/dev/vg_ganeti_hdd/98d30e7d-0a47-4a7d-aeed-6301645d8469.disk3_data, visible as /dev/, size=102400m)> on node fsn-node-07.torproject.org for instance gitlab-02.torproject.org: Can't create block device: Can't compute PV info for vg vg_ganeti_hdd

In this case, it is trying to migrate the gitlab-02 server from fsn-node-01 (which has an HDD) to fsn-node-07 (which hasn't), which naturally fails. This is a known limitation of the Ganeti code. There has been a draft design document for multiple storage unit support since 2015, but it has never been implemented. Multiple issues have been reported upstream on the subject.

Unfortunately, there are no known workarounds for this, at least not that fix the hbal command. It is possible to exclude the faulty migration from the pool of possible moves, however, for example in the above case:

hbal -L -v --exclude-instances gitlab-02.torproject.org

It's also possible to use the --no-disk-moves option to avoid disk move operations altogether.
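
For example, a rebalance that skips disk moves entirely could look like this (a sketch):

hbal -L -C -v -X --no-disk-moves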

Both workarounds obviously do not correctly balance the cluster... Note that we have also tried to use htools:migration tags to work around that issue, but those do not work for secondary instances. For this we would need to set up node groups instead.

Another option is to specifically look for instances that do not have a HDD and migrate only those. In my situation, gnt-cluster verify was complaining that fsn-node-02 was full, so I looked for all the instances on that node and found the ones which didn't have a HDD:

gnt-instance list -o  pnode,snodes,name,be/vcpus,be/memory,disk_usage,disk_template,status \
  | sort | grep 'fsn-node-02' | awk '{print $3}' | \
  while read instance ; do
    printf "checking $instance: "
    if gnt-instance info $instance | grep -q hdd ; then
      echo "HAS HDD"
    else
      echo "NO HDD"
    fi
  done

Then you can manually migrate -f (to fail over to the secondary) and replace-disks -n (to find another secondary) the instances that can be migrated out of the first four machines (which have HDDs) to the last three (which do not). Look at the memory usage in gnt-node list to pick the best node.
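
A sketch of that sequence, assuming test1.torproject.org was identified above and fsn-node-05 was picked as the new secondary:

gnt-instance migrate -f test1.torproject.org
gnt-instance replace-disks -n fsn-node-05.torproject.org test1.torproject.org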

In general, if a given node in the first four is overloaded, a good trick is to look for one that can be failed over, with, for example:

gnt-instance list -o pnode,snodes,name,be/vcpus,be/memory,disk_usage,disk_template,status | sort | grep '^fsn-node-0[1234]' | grep 'fsn-node-0[5678]'

... or, for a particular node (say fsn-node-04):

gnt-instance list -o pnode,snodes,name,be/vcpus,be/memory,disk_usage,disk_template,status | sort | grep ^fsn-node-04 | grep 'fsn-node-0[5678]'

The instances listed there would be ones that can be migrated to their secondary to give fsn-node-04 some breathing room.

Adding and removing addresses on instances

Say you created an instance but forgot to assign an extra IP. You can still do so with:

gnt-instance modify --net -1:add,ip=116.202.120.174,network=gnt-fsn test01.torproject.org

Job inspection

Sometimes it can be useful to look at the active jobs. It might be, for example, that another user has queued a bunch of jobs in another terminal which you do not have access to, or some automated process did (Nagios, for example, runs gnt-cluster verify once in a while). Ganeti has this concept of "jobs" which can provide information about those.

The command gnt-job list will show the entire job history, and gnt-job list --running will show running jobs. gnt-job watch can be used to watch a specific job.
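
For example (the job ID here is illustrative):

gnt-job list --running
gnt-job watch 48030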

We have a wrapper called ganeti-watch-jobs which automatically shows the output of whatever job is currently running and exits when all jobs complete. This is particularly useful while rebalancing the cluster as hbal -X does not show the job output...

Pager playbook

I/O overload

In case of excessive I/O, it might be worth looking into which machine is the cause. The howto/drbd page explains how to map a DRBD device to a VM. You can also find which logical volume is backing an instance (and vice versa) with this command:

lvs -o+tags

This will list all logical volumes and their associated tags. If you already know which logical volume you're looking for, you can address it directly:

root@fsn-node-01:~# lvs -o tags /dev/vg_ganeti_hdd/4091b668-1177-41ac-9310-1eac45b46620.disk2_data
  LV Tags
  originstname+bacula-director-01.torproject.org

Node failures

Ganeti clusters are designed to be self-healing. As long as only one machine disappears, the cluster should be able to recover by failing instances over to other nodes. This is currently done manually, see the migrate section above.

This could eventually be automated if such situations occur more often, by scheduling a harep cron job, which isn't enabled in Debian by default. See also the autorepair section of the admin manual.
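
If we ever want to experiment with it by hand, harep can be invoked like the other htools (a sketch; this is not something we run in production here):

harep -L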

Bridge configuration failures

If you get the following error while trying to bring up the bridge:

root@chi-node-02:~# ifup br0
add bridge failed: Package not installed
run-parts: /etc/network/if-pre-up.d/bridge exited with return code 1
ifup: failed to bring up br0

... it might be the bridge cannot find a way to load the kernel module, because kernel module loading has been disabled. Reboot with the /etc/no_modules_disabled file present:

touch /etc/no_modules_disabled
reboot

It might be that the machine took too long to boot because it's not in mandos and the operator took too long to enter the LUKS passphrase. Re-enable the machine with this command on mandos:

mandos-ctl --enable chi-node-02.torproject

Cleaning up orphan disks

Sometimes gnt-cluster verify will give this warning, particularly after a failed rebalance:

* Verifying orphan volumes
   - WARNING: node fsn-node-06.torproject.org: volume vg_ganeti/27dd3687-8953-447e-8632-adf4aa4e11b6.disk0_meta is unknown
   - WARNING: node fsn-node-06.torproject.org: volume vg_ganeti/27dd3687-8953-447e-8632-adf4aa4e11b6.disk0_data is unknown
   - WARNING: node fsn-node-06.torproject.org: volume vg_ganeti/abf0eeac-55a0-4ccc-b8a0-adb0d8d67cf7.disk1_meta is unknown
   - WARNING: node fsn-node-06.torproject.org: volume vg_ganeti/abf0eeac-55a0-4ccc-b8a0-adb0d8d67cf7.disk1_data is unknown