Bug #51866 » start_mds.sh

Script for delaying MDS start until clean pgs - David Piper, 08/25/2021 03:01 PM

 
#!/bin/bash

# This script is heavily based on the restart_osd_daemon.sh.j2 from the
# ceph-ansible project: https://github.com/ceph/ceph-ansible/blob/master/roles/ceph-handler/templates/restart_osd_daemon.sh.j2
# The logic mandates that there is an OSD present on the local VM.
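# All ceph CLI calls below authenticate with the bootstrap-osd keyring and are
# executed inside that local OSD container (via $container_exec); DELAY is the
# number of seconds to sleep between retries.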

DELAY=30
CEPH_CLI="--name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/ceph.keyring --cluster ceph"
PYTHON_INTERPRETER="/usr/bin/python3"

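# check_pgs: reads the PG counts from 'ceph -s -f json' (run inside the OSD
# container) and returns 0 once every PG is active+clean (or immediately if the
# cluster has no PGs yet), retrying up to $RETRIES times with $DELAY seconds
# between attempts. If the PGs never become clean it dumps diagnostic state and
# exits the whole script with status 1. The counts come from the "pgmap"
# section of the JSON, which looks roughly like (illustrative):
#   "pgmap": {"num_pgs": 128, "pgs_by_state": [{"state_name": "active+clean", "count": 128}], ...}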
check_pgs() {
  num_pgs=$($container_exec ceph $CEPH_CLI -s -f json | $PYTHON_INTERPRETER -c 'import sys, json; print(json.load(sys.stdin)["pgmap"]["num_pgs"])')
  if [[ "$num_pgs" == "0" ]]; then
    return 0
  fi
  while [ $RETRIES -ne 0 ]; do
    num_active=$($container_exec ceph $CEPH_CLI -s -f json | $PYTHON_INTERPRETER -c 'import sys, json; print(sum([i["count"] for i in json.load(sys.stdin)["pgmap"]["pgs_by_state"] if "active+clean" in i["state_name"]]))')
    if [[ $num_pgs -eq $num_active ]]; then
      return 0
    else
      sleep $DELAY
      let RETRIES=RETRIES-1
    fi
  done
  # PGs not clean, exiting with return code 1
  echo "Error while running 'ceph $CEPH_CLI -s', PGs were not reported as active+clean"
  echo "MDS cannot be started until PGs are clean"
  echo "Will refuse to continue"
  $container_exec ceph $CEPH_CLI -s
  $container_exec ceph $CEPH_CLI osd dump
  $container_exec ceph $CEPH_CLI osd tree
  $container_exec ceph $CEPH_CLI osd crush rule dump
  exit 1
}

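# start_mds: runs the MDS as a foreground ceph-daemon container named
# ceph-mds-<name>, where $1 is the MDS name, then exits this script with
# status 0 once the container terminates.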
start_mds() {
  /usr/bin/docker run --rm --net=host \
    --memory=11984m \
    --cpus=2 \
    -v /etc/pki/:/etc/pki:ro \
    -v /cephconfig/var/lib/ceph:/var/lib/ceph:z \
    -v /cephconfig/etc/ceph:/etc/ceph:z \
    -v /cephconfig/var/run/ceph:/var/run/ceph:z \
    -v /etc/localtime:/etc/localtime:ro \
    -v /var/log/ceph:/var/log/ceph:z \
    -e CLUSTER=ceph \
    -e CEPH_DAEMON=MDS \
    -e CONTAINER_IMAGE=ceph-daemon:v5.0.12-stable-5.0-octopus-centos-8 \
    -e MDS_NAME="$1" \
    --name=ceph-mds-"$1" \
    ceph-daemon:v5.0.12-stable-5.0-octopus-centos-8

  # Exit cleanly from this shell script once the docker container exits
  exit 0
}

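# get_container_id_from_dev_name: polls 'docker ps' (up to 10 attempts, $DELAY
# seconds apart) for a container whose name ends with $1 and echoes its
# container ID, or an empty string if none is found.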
get_container_id_from_dev_name() {
  local id
  local count
  count=10
  while [ $count -ne 0 ]; do
    id=$(docker ps | grep -E "${1}$" | cut -d' ' -f1)
    test "$id" != "" && break
    sleep $DELAY
    let count=count-1
  done
  echo "$id"
}

########
# MAIN #
########

# Give the OSDs a head-start
sleep 1

# Retrieve the OSD ID and OSD container ID
osd_unit=$(systemctl list-units | grep -E "loaded * active" | grep -oE "ceph-osd@([0-9]+|[a-z]+).service")
osd_id=$(echo ${osd_unit#ceph-osd@} | grep -oE '[0-9]+')
container_id=$(get_container_id_from_dev_name "ceph-osd-${osd_id}")
container_exec="docker exec $container_id"

# We need to wait because it may take some time for the socket to actually exist
COUNT=10

SOCKET=/var/run/ceph/ceph-osd.${osd_id}.asok
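# Poll for the OSD admin socket inside the container; once it exists, verify
# the PGs and, if they are clean, start the MDS (start_mds does not return).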
while [ $COUNT -ne 0 ]; do
  RETRIES=20
  $container_exec test -S "$SOCKET" && check_pgs && start_mds
  # If we get this far something went wrong.
  sleep $DELAY
  let COUNT=COUNT-1
done
# If we reach this point, it means the socket is not present.
echo "Socket file ${SOCKET} could not be found, which means the osd daemon is not running. Showing ceph-osd unit logs now:"
journalctl -u "${osd_unit}"
exit 1