#!/bin/bash
#
# Start the ceph MDS container once the local OSD daemon is up and all
# placement groups are reported active+clean.
#
# This script is heavily based on the restart_osd_daemon.sh.j2 from the
# ceph-ansible project: https://github.com/ceph/ceph-ansible/blob/master/roles/ceph-handler/templates/restart_osd_daemon.sh.j2
# The logic mandates that there is an OSD present on the local VM.

# Seconds to wait between polling attempts (cluster status / admin socket).
DELAY=30

# Authentication and cluster flags passed to every 'ceph' CLI invocation.
CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/ceph.keyring --cluster ceph"

# Interpreter used to parse the JSON output of 'ceph -s -f json'.
PYTHON_INTERPRETER="/usr/bin/python3"
#######################################
# Wait until every placement group is active+clean.
# Globals:
#   RETRIES            (consumed) remaining polling attempts
#   DELAY              seconds between attempts
#   CEPH_CLI           auth/cluster flags for the ceph CLI
#   PYTHON_INTERPRETER JSON parser for 'ceph -s -f json'
#   container_exec     'docker exec <id>' prefix for in-container commands
# Returns:
#   0 when all PGs are active+clean (or the cluster has no PGs yet);
#   exits the whole script with status 1 after dumping diagnostics otherwise.
#######################################
check_pgs() {
  local num_pgs num_active
  num_pgs=$($container_exec ceph $CEPH_CLI -s -f json | $PYTHON_INTERPRETER -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')
  # A cluster without any PGs has nothing to wait for.
  if [[ "$num_pgs" == "0" ]]; then
    return 0
  fi
  while [ "$RETRIES" -ne 0 ]; do
    num_active=$($container_exec ceph $CEPH_CLI -s -f json | $PYTHON_INTERPRETER -c 'import sys, json; print(sum ( [ i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if "active+clean" in i["state_name"]]))')
    # All PGs clean: done waiting.
    if [[ "$num_pgs" -eq "$num_active" ]]; then
      return 0
    fi
    sleep "$DELAY"
    RETRIES=$((RETRIES - 1))
  done
  # PGs not clean, exiting with return code 1
  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
  echo "MDS cannot be started until PGs are clean"
  echo "Will refuse to continue"
  # Dump cluster state to aid debugging before bailing out.
  $container_exec ceph $CEPH_CLI -s
  $container_exec ceph $CEPH_CLI osd dump
  $container_exec ceph $CEPH_CLI osd tree
  $container_exec ceph $CEPH_CLI osd crush rule dump
  exit 1
}
#######################################
# Run the MDS daemon container in the foreground.
# Arguments:
#   $1 - MDS name; used for both the MDS_NAME env var and the container name.
# Note:
#   This function never returns: when the container terminates, the whole
#   script exits with status 0.
#######################################
start_mds() {
  /usr/bin/docker run --rm --net=host \
    --memory=11984m \
    --cpus=2 \
    -v /etc/pki/:/etc/pki:ro \
    -v /cephconfig/var/lib/ceph:/var/lib/ceph:z \
    -v /cephconfig/etc/ceph:/etc/ceph:z \
    -v /cephconfig/var/run/ceph:/var/run/ceph:z \
    -v /etc/localtime:/etc/localtime:ro \
    -v /var/log/ceph:/var/log/ceph:z \
    -e CLUSTER=ceph \
    -e CEPH_DAEMON=MDS \
    -e CONTAINER_IMAGE=ceph-daemon:v5.0.12-stable-5.0-octopus-centos-8 \
    -e MDS_NAME="$1" \
    --name=ceph-mds-"$1" \
    ceph-daemon:v5.0.12-stable-5.0-octopus-centos-8

  # Exit cleanly from this shell script once the docker container exits
  exit 0
}
#######################################
# Resolve a running container's ID from its name.
# Arguments:
#   $1 - container name, matched anchored at the end of each 'docker ps' line
# Globals:
#   DELAY - seconds to sleep between retries
# Outputs:
#   The container ID on stdout; empty string if not found after 10 attempts.
#######################################
get_container_id_from_dev_name() {
  local id
  local count
  count=10
  # 'docker ps' may not list the container immediately after start, so retry.
  while [ "$count" -ne 0 ]; do
    id=$(docker ps | grep -E "${1}$" | cut -d' ' -f1)
    [[ -n "$id" ]] && break
    sleep "$DELAY"
    count=$((count - 1))
  done
  echo "$id"
}
########
# MAIN #
########

# Give the OSDs a head-start
sleep 1

# Retrieve the OSD ID and OSD container ID
osd_unit=$(systemctl list-units | grep -E "loaded * active" | grep -oE "ceph-osd@([0-9]+|[a-z]+).service")
osd_id=$(echo "${osd_unit#ceph-osd@}" | grep -oE '[0-9]+')
container_id=$(get_container_id_from_dev_name "ceph-osd-${osd_id}")
container_exec="docker exec $container_id"

# We need to wait because it may take some time for the socket to actually exist
COUNT=10

SOCKET=/var/run/ceph/ceph-osd.${osd_id}.asok
while [ "$COUNT" -ne 0 ]; do
  # check_pgs gets a fresh retry budget on every socket probe.
  RETRIES=20
  # start_mds exits 0 on success and check_pgs exits 1 on dirty PGs, so the
  # loop only iterates again while the admin socket is still missing.
  $container_exec test -S "$SOCKET" && check_pgs && start_mds
  # If we get this far something went wrong.
  sleep "$DELAY"
  COUNT=$((COUNT - 1))
done

# If we reach this point, it means the socket is not present.
echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running. Showing ceph-osd unit logs now:"
journalctl -u "${osd_unit}"
exit 1