Bug #51866 » start_mds.sh

Script for delaying MDS start until clean pgs - David Piper, 08/25/2021 03:01 PM

 
#!/bin/bash

# This script is heavily based on the restart_osd_daemon.sh.j2 from the
# ceph-ansible project: https://github.com/ceph/ceph-ansible/blob/master/roles/ceph-handler/templates/restart_osd_daemon.sh.j2
# The logic mandates that there is an OSD present on the local VM.
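# All ceph CLI calls below authenticate with the bootstrap-osd keyring and are
# executed inside that local OSD container (via $container_exec); DELAY is the
# number of seconds to sleep between retries.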

DELAY=30
CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/ceph.keyring --cluster ceph"
PYTHON_INTERPRETER="/usr/bin/python3"

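# check_pgs: reads the PG counts from 'ceph -s -f json' (run inside the OSD
# container) and returns 0 once every PG is active+clean (or immediately if the
# cluster has no PGs yet), retrying up to $RETRIES times with $DELAY seconds
# between attempts. If the PGs never become clean it dumps diagnostic state and
# exits the whole script with status 1. The counts come from the "pgmap"
# section of the JSON, which looks roughly like (illustrative):
#   "pgmap": {"num_pgs": 128, "pgs_by_state": [{"state_name": "active+clean", "count": 128}], ...}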
check_pgs() {
  num_pgs=$($container_exec ceph $CEPH_CLI -s -f json | $PYTHON_INTERPRETER -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')
  if [[ "$num_pgs" == "0" ]]; then
    return 0
  fi
  while [ $RETRIES -ne 0 ]; do
    num_active=$($container_exec ceph $CEPH_CLI -s -f json | $PYTHON_INTERPRETER -c 'import sys, json; print(sum([i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if "active+clean" in i["state_name"]]))')
    if [[ $num_pgs -eq $num_active ]]; then
      return 0
    else
      sleep $DELAY
      let RETRIES=RETRIES-1
    fi
  done
  # PGs not clean, exiting with return code 1
  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
  echo "MDS cannot be started until PGs are clean"
  echo "Will refuse to continue"
  $container_exec ceph $CEPH_CLI -s
  $container_exec ceph $CEPH_CLI osd dump
  $container_exec ceph $CEPH_CLI osd tree
  $container_exec ceph $CEPH_CLI osd crush rule dump
  exit 1
}

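# start_mds: runs the MDS as a foreground ceph-daemon container named
# ceph-mds-<name>, where $1 is the MDS name, then exits this script with
# status 0 once the container terminates.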
start_mds() {
  /usr/bin/docker run --rm --net=host \
    --memory=11984m \
    --cpus=2 \
    -v /etc/pki/:/etc/pki:ro \
    -v /cephconfig/var/lib/ceph:/var/lib/ceph:z \
    -v /cephconfig/etc/ceph:/etc/ceph:z \
    -v /cephconfig/var/run/ceph:/var/run/ceph:z \
    -v /etc/localtime:/etc/localtime:ro \
    -v /var/log/ceph:/var/log/ceph:z \
    -e CLUSTER=ceph \
    -e CEPH_DAEMON=MDS \
    -e CONTAINER_IMAGE=ceph-daemon:v5.0.12-stable-5.0-octopus-centos-8 \
    -e MDS_NAME="$1" \
    --name=ceph-mds-"$1" \
    ceph-daemon:v5.0.12-stable-5.0-octopus-centos-8

  # Exit cleanly from this shell script once the docker container exits
  exit 0
}

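# get_container_id_from_dev_name: polls 'docker ps' (up to 10 attempts, $DELAY
# seconds apart) for a container whose name ends with $1 and echoes its
# container ID, or an empty string if none is found.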
get_container_id_from_dev_name() {
  local id
  local count
  count=10
  while [ $count -ne 0 ]; do
    id=$(docker ps | grep -E "${1}$" | cut -d' ' -f1)
    test "$id" != "" && break
    sleep $DELAY
    let count=count-1
  done
  echo "$id"
}

########
# MAIN #
########

# Give the OSDs a head-start
sleep 1

# Retrieve the OSD ID and OSD container ID
osd_unit=$(systemctl list-units | grep -E "loaded * active" | grep -oE "ceph-osd@([0-9]+|[a-z]+).service")
osd_id=$(echo ${osd_unit#ceph-osd@} | grep -oE '[0-9]+')
container_id=$(get_container_id_from_dev_name "ceph-osd-${osd_id}")
container_exec="docker exec $container_id"

# We need to wait because it may take some time for the socket to actually exist
COUNT=10

SOCKET=/var/run/ceph/ceph-osd.${osd_id}.asok
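# Poll for the OSD admin socket inside the container; once it exists, verify
# the PGs and, if they are clean, start the MDS (start_mds does not return).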
while [ $COUNT -ne 0 ]; do
  RETRIES=20
  $container_exec test -S "$SOCKET" && check_pgs && start_mds
  # If we get this far something went wrong.
  sleep $DELAY
  let COUNT=COUNT-1
done
# If we reach this point, it means the socket is not present.
echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running. Showing ceph-osd unit logs now:"
journalctl -u "${osd_unit}"
exit 1