Project

General

Profile

Feature #63544 » ceph-clone-cleanup.sh

Raimund Sacherer, 11/20/2023 03:16 PM

 
#!/bin/bash
#
# Author: Raimund Sacherer (rsachere@redhat.com)
#
# Changes: 2023-10-10 Initial Version V1
#
#
# Please note that this is not to be treated as a Red Hat official binary / software,
# and feel free to go through the source code to look for what's happening behind the scenes.
#
# SPDX-License-Identifier: MIT-0
# MAINTAINER: rsachere@redhat.com
#
# TODO: 2023-10-10 Check if it would make sense to loop over all volumes
# and subvolume-groups automatically
# 2023-10-11 Add Creation Time output and if it exists, last accessed timestamps to log files
#


# Enable checkwinsize and run a throw-away subshell so that bash updates
# COLUMNS/LINES; progressBar uses COLUMNS to blank the current line.
shopt -s checkwinsize; (:);

#
# Remove the scratch files on every exit path.
# Globals: TEMPFILE_0, TEMPFILE_1 (read). Either may still be empty when the
# script exits early (e.g. --help), so empty names are skipped. The names are
# quoted so that a TMPDIR containing whitespace is handled correctly.
cleanup_tempfiles () {
  local scratch_file

  for scratch_file in "${TEMPFILE_0}" "${TEMPFILE_1}" ; do
    if [ -n "${scratch_file}" ] ; then
      rm -f -- "${scratch_file}"
    fi
  done
}

trap cleanup_tempfiles EXIT

OPTIND=1

#
# Defaults
SUDO=sudo                         # prefix for ceph commands, cleared by --no-sudo
LIST_CLONES=1                     # default operation
LIST_SNAPSHOTS=0
CASE_NUMBER=""
CASE_UPLOAD=0                     # set to 1 when --case-number is given
USER_NAME=""
CEPHFS_GROUP=csi
CEPHFS_VOLUME=cephfs
DESTRUCTIVE_ACTION=0
CANCEL_IN_PROGRESS=0
CANCEL_PENDING_CLONES=0
REMOVE_CANCELED_SUBVOLUME=0
REMOVE_ALL_CANCELED_SUBVOLUMES=0

#
# Verify that every external program the script relies on is available.
# Exits with status 65 when one is missing.
for program in jq ceph sudo mktemp ; do
  if ! command -v "${program}" > /dev/null 2>&1 ; then
    # The backticks are escaped: unescaped they would be a command
    # substitution that tries to run the missing program.
    echo "Please install program \`${program}\`." >&2
    exit 65
  fi
done

#
# Print the help screen to stdout.
usage () {
  echo "$0 [--list-clones] [--list-snapshots]"
  echo " [--cancel-pending-clones] [--cancel-inprogress-clones]"
  echo " [--remove-canceled-clones] [--remove-all-canceled-clones]"
  echo " [--group-name=NAME] [--volume-name=NAME]"
  echo " [--case-number=NUMBER] [--portal-user-name=USER]"
  echo " [--no-sudo]"
  echo " [-h|--help]"
  echo
  echo " ********************************************************************************************"
  echo " Please note that this is not to be treated as a Red Hat official binary / software, "
  echo " and feel free to go through the source code to look for what's happening behind the scenes. "
  echo " ********************************************************************************************"
  echo
  echo " --list-clones List all clones (In-Progress, Pending, Complete). Default Operation"
  echo " --list-snapshots List all snapshots and check for pending clone operations"
  echo " --cancel-pending-clones Cancel all pending clones."
  echo " --cancel-inprogress-clones Cancel all in-progress cloning operations."
  echo " --remove-canceled-clones When a clone is canceled, also remove the subvolume."
  echo " --remove-all-canceled-clones Remove the subvolumes for clones which have been canceled already."
  echo " --group-name=NAME Use <NAME> as cephfs subvolume group name (default 'csi')."
  echo " --volume-name=NAME Use <NAME> as cephfs volume name (default 'cephfs')."
  echo " --case-number=NUMBER Case number to use to upload the logfiles to."
  echo " --portal-user-name=USER access.redhat.com username to upload the logfiles."
  echo " --no-sudo Do not use sudo when executing ceph commands."
  echo " --help|-h This help screen."
  echo
  echo "Statistics: 'P' = Pending clones, 'In-P' = In progress clones, 'C' = Canceled clones."
  echo
}

#
# Translate the command line options into the global configuration variables.
# Arguments: the script's command line ("$@").
# Exits 0 after printing the help screen for -h/--help.
# Exits 66 (after printing the help screen) for an unknown option.
# Later options win over earlier ones (e.g. --list-clones after --list-snapshots).
parse_args () {
  local option

  for option in "$@" ; do
    case "${option}" in
      --list-snapshots)
        LIST_SNAPSHOTS=1
        LIST_CLONES=0
        ;;
      --list-clones)
        LIST_SNAPSHOTS=0
        LIST_CLONES=1
        ;;
      --cancel-pending-clones)
        CANCEL_PENDING_CLONES=1
        ;;
      --cancel-inprogress-clones)
        CANCEL_IN_PROGRESS=1
        ;;
      --remove-canceled-clones)
        REMOVE_CANCELED_SUBVOLUME=1
        ;;
      --remove-all-canceled-clones)
        REMOVE_ALL_CANCELED_SUBVOLUMES=1
        ;;
      --group-name=*)
        # Everything after the first '=' is the value.
        CEPHFS_GROUP="${option#*=}"
        ;;
      --volume-name=*)
        CEPHFS_VOLUME="${option#*=}"
        ;;
      --case-number=*)
        CASE_NUMBER="${option#*=}"
        CASE_UPLOAD=1
        ;;
      --portal-user-name=*)
        USER_NAME="${option#*=}"
        ;;
      --no-sudo)
        SUDO=""
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Command option ${option} unknown." >&2
        echo
        # Print the help directly instead of re-executing the script, which
        # fails when the script is not executable or its path has spaces.
        usage
        exit 66
        ;;
    esac
  done
}

parse_args "$@"



#
# Append one tab-separated record to the log file.
# Globals:   LOGFILE (read) - path of the log file, appended to.
# Arguments: $1 - action tag, $2 - subvolume/clone name,
#            $3 - exit code of the related ceph command, $4 - free-form detail.
# The record is prefixed with an ISO-8601 timestamp (date -Iseconds).
log () {
  local TAB=$'\t'

  # LOGFILE is quoted so that a path containing whitespace does not end in
  # an "ambiguous redirect" error.
  printf '%s\n' "$(date -Iseconds)${TAB}${1}${TAB}${2}${TAB}${3}${TAB}${4}" >> "${LOGFILE}"
}

#
# Upload the log and command files to a support case.
# Globals:  USER_NAME, CASE_UPLOAD, CASE_NUMBER, LOGFILE, CMDFILE (read),
#           UPLOAD_FILE (written).
# Does nothing unless a portal user name and a case number were both given.
# Returns:  0 when nothing had to be done, 1 when curl is missing or the
#           archive could not be created, otherwise the status of the final echo.
case_upload () {

  if [ -n "${USER_NAME}" ] && [ "${CASE_UPLOAD}" -eq 1 ] && [ -n "${CASE_NUMBER}" ]; then
    if ! command -v curl > /dev/null 2>&1 ; then
      echo "Cannot upload log files: program curl not found." >&2
      return 1
    fi

    echo "Trying to upload log files to case ${CASE_NUMBER}."
    UPLOAD_FILE="${0}-$(date -Iminutes).tgz"

    # The archive is written through stdout: the timestamp contains colons
    # and GNU tar treats an archive name of the form "name:rest" (no slash
    # before the colon) as a remote archive.
    if ! tar -czf - "${LOGFILE}" "${CMDFILE}" > "${UPLOAD_FILE}" ; then
      echo "Could not create archive ${UPLOAD_FILE}." >&2
      rm -f -- "${UPLOAD_FILE}"
      return 1
    fi

    # curl prompts for the password of USER_NAME.
    curl -u "${USER_NAME}" -X POST -F "description=${LOGFILE}" -F "file=@${UPLOAD_FILE}" "https://attachments.access.redhat.com/hydra/rest/cases/${CASE_NUMBER}/attachments/"
    rm -f -- "${UPLOAD_FILE}"
    echo
  fi
}

#
# Run "ceph fs <$1> <$2> <$3> --vol_name V --group_name G <$4>" and record the
# command line in CMDFILE.
# Globals:   SUDO, CEPHFS_VOLUME, CEPHFS_GROUP, CMDFILE, TEMPFILE_1 (read).
# Arguments: $1..$3 - sub-command words; each may hold several words or be
#                     empty (empty arguments are dropped).
#            $4     - additional options appended after the group name.
#            $5     - output destination:
#                       STDOUT_WITHERROR  stdout and stderr go to stdout,
#                       STDOUT            stdout only, stderr is discarded,
#                       <path>            stdout and stderr go to that file,
#                       (empty)           same, using TEMPFILE_1.
# Returns:   the exit status of the ceph command.
ceph_command () {
  local output_file="${5:-${TEMPFILE_1}}"
  local cmd_text="${SUDO} ceph fs ${1} ${2} ${3} --vol_name ${CEPHFS_VOLUME} --group_name ${CEPHFS_GROUP} ${4}"
  local -a cmd

  # Word splitting is intentional here: callers pass option/value pairs such
  # as "--sub_name foo" as a single argument.
  # shellcheck disable=SC2206
  cmd=(${SUDO} ceph fs ${1} ${2} ${3} --vol_name ${CEPHFS_VOLUME} --group_name ${CEPHFS_GROUP} ${4})

  case "${output_file}" in
    STDOUT_WITHERROR)
      printf '%s\n' "${cmd_text} 2>&1" >> "${CMDFILE}"
      "${cmd[@]}" 2>&1
      ;;
    STDOUT)
      printf '%s\n' "${cmd_text} 2>/dev/null" >> "${CMDFILE}"
      "${cmd[@]}" 2>/dev/null
      ;;
    *)
      printf '%s\n' "${cmd_text} > ${output_file} 2>&1" >> "${CMDFILE}"
      # Quoted so that an output path containing whitespace works.
      "${cmd[@]}" > "${output_file}" 2>&1
      ;;
  esac
}

#
# Remove the subvolume of a clone, but only if the clone status stored in
# TEMPFILE_1 (output of a preceding "clone status" call) reports "canceled".
# Globals:   TEMPFILE_1 (read, then overwritten by the rm command output),
#            CEPH_EXIT_CODE (written), stat_clone_removed (incremented).
# Arguments: $1 - subvolume name. Callers also pass a second argument
#                 ("canceled"); it is not used.
check_and_remove_subvolume () {
  local subvol="${1}"
  local state

  # Read the state once; an empty or unparsable status simply counts as
  # "not canceled" instead of breaking the comparison.
  state=$(jq -r '.status.state' "${TEMPFILE_1}")

  if [ "${state}" != "canceled" ] ; then
    log "not-removing-canceled-clone" "${subvol}" "${CEPH_EXIT_CODE}" "Status is not canceled (status: ${state})"
    return
  fi

  ceph_command subvolume rm "--sub_name ${subvol}" "--force"
  CEPH_EXIT_CODE=$?

  if [ "${CEPH_EXIT_CODE}" -eq 0 ] ; then
    log "removing-canceled-clone" "${subvol}" "${CEPH_EXIT_CODE}" "$(cat "${TEMPFILE_1}")"
    stat_clone_removed=$((stat_clone_removed + 1))
  else
    log "not-removing-canceled-clone" "${subvol}" "${CEPH_EXIT_CODE}" "$(cat "${TEMPFILE_1}")"
  fi
}

#
# Scratch files: TEMPFILE_0 holds the subvolume list, TEMPFILE_1 the output
# of the most recent ceph command.
if ! TEMPFILE_0=$(mktemp) || ! TEMPFILE_1=$(mktemp) ; then
  echo "Could not create temporary files." >&2
  exit 68
fi

# Log and command files are created next to the script. A single timestamp
# is used so that both files of one run always carry the same name stem.
RUN_TIMESTAMP=$(date -Iseconds)
LOGFILE="${0}_${RUN_TIMESTAMP}.log"
CMDFILE="${0}_${RUN_TIMESTAMP}.cmd"

touch "${LOGFILE}"
touch "${CMDFILE}"


# Function to draw progress bar
#
# Draw a one-line progress indicator "[ NN%] <current> of <total> <message>".
# Globals:   global_progressbar_percent (read/written) - last percentage
#            drawn; COLUMNS (read) - width used to blank the line.
# Arguments: $1 - total number of items, $2 - number of items processed,
#            $3 - message appended after the counters.
# A call with $2 == 1 only resets the remembered percentage and draws nothing.
progressBar () {
  local total="${1}"
  local current="${2}"
  local percent
  local stats

  if [ "${current}" == "1" ] ; then
    global_progressbar_percent=-1
    return
  fi

  # Nothing sensible can be drawn without a positive total (avoids a
  # division by zero).
  if ! [ "${total:-0}" -gt 0 ] 2> /dev/null ; then
    return 0
  fi

  # Whole-number percentage, truncated. Pure integer arithmetic always yields
  # a number, also for fractions below one percent.
  percent=$(( (current * 100) / total ))

  if [ "${percent}" -eq "${global_progressbar_percent:--1}" ] ; then
    # Only update on a integer percentage change
    # This limits screen rewrites to max 100 and helps performance
    # No need to update the screen if we don't modify the % value.
    return
  fi

  printf -v stats "%i of %i %s" "${current}" "${total}" "${3}"
  # Blank the line first so a shorter message leaves no stale characters.
  printf "\r%${COLUMNS}s" ""
  printf "\r[%3i%%] %s" "${percent}" "${stats}"
  global_progressbar_percent=${percent}

  return
}

#
# For Future version: Loop through all ceph fs groups, maybe even ceph fs volumes
# Fetch the names of all subvolumes of the configured volume/group, one per
# line, into TEMPFILE_0.
{ ceph_command subvolume ls "" "" "STDOUT_WITHERROR" ; } | jq -r '.[].name' > "${TEMPFILE_0}"
# $? would only report jq's status; PIPESTATUS gives both pipeline stages.
LIST_PIPE_STATUS=("${PIPESTATUS[@]}")
CEPH_EXIT_CODE=${LIST_PIPE_STATUS[0]}

if [ "${CEPH_EXIT_CODE}" -ne 0 ] || [ "${LIST_PIPE_STATUS[1]}" -ne 0 ] ; then
  echo "Error executing ceph command." >&2
  exit 67
fi

# Progress and statistics counters used by the main loop.
subvol_completed=0
subvol_number=$(wc -l < "${TEMPFILE_0}")
stat_clone_removed=0
stat_clone_canceled=0
stat_found_pending=0
stat_found_canceled=0
stat_found_inprogress=0
stat_canceled_inprogress=0

#
# Cancel one clone and, when --remove-canceled-clones was given, remove its
# subvolume once the clone reports the state "canceled".
# Globals:   CEPH_EXIT_CODE (written), TEMPFILE_1 (read),
#            REMOVE_CANCELED_SUBVOLUME (read).
# Arguments: $1 - clone name, $2 - log tag used on success,
#            $3 - log tag used when the cancel command fails.
# Returns:   0 when the cancel command succeeded, 1 otherwise.
_cancel_clone () {
  local clone="${1}"
  local ok_tag="${2}"
  local error_tag="${3}"

  ceph_command clone cancel "--clone_name ${clone}"
  CEPH_EXIT_CODE=$?

  if [ "${CEPH_EXIT_CODE}" -ne 0 ] ; then
    log "${error_tag}" "${clone}" "${CEPH_EXIT_CODE}" "$(cat "${TEMPFILE_1}")"
    return 1
  fi

  log "${ok_tag}" "${clone}" "${CEPH_EXIT_CODE}" "$(cat "${TEMPFILE_1}")"

  if [ "${REMOVE_CANCELED_SUBVOLUME}" -eq 1 ] ; then
    # Re-read the status: check_and_remove_subvolume expects it in TEMPFILE_1.
    ceph_command clone status "--clone_name ${clone}"
    CEPH_EXIT_CODE=$?
    if [ "${CEPH_EXIT_CODE}" -eq 0 ] ; then
      check_and_remove_subvolume "${clone}" "canceled"
    else
      log "not-removing-canceled-clone" "${clone}" "${CEPH_EXIT_CODE}" "$(cat "${TEMPFILE_1}")"
    fi
  fi

  return 0
}

# Walk over every subvolume name collected in TEMPFILE_0. The list is read
# through file descriptor 3 so that commands started inside the loop cannot
# consume it from stdin.
while read -r subvolume <&3 ; do

  if [ "${LIST_SNAPSHOTS}" -eq 1 ] ; then
    { ceph_command subvolume snapshot ls "--sub_name ${subvolume}" "STDOUT" ; } | jq -r '.[].name' > "${TEMPFILE_1}"

    while read -r snapshot <&4 ; do
      if [ -z "${snapshot}" ] ; then
        continue
      fi
      #
      # In newer versions we could extract pending clones from the snapshot info output, but this is very new and the output is not
      # present in older versions of ceph. To be backwards compatible we do not try to use it.
      { ceph_command "subvolume snapshot" info "--sub_name ${subvolume}" "--snap_name ${snapshot}" "STDOUT" ; } | \
        jq -r --arg sv "${subvolume}" --arg ss "${snapshot}" '. += {"volume_name": $sv, "snap_name": $ss} |
        [.volume_name, .snap_name, ("pending-clones:" + .has_pending_clones), .created_at] |
        @tsv' >> "${LOGFILE}"
    done 4< "${TEMPFILE_1}"

    subvol_completed=$((subvol_completed + 1))
    progressBar "${subvol_number}" "${subvol_completed}" "subvolumes checked for snapshots (latest: ${subvolume})."
  else
    ceph_command clone status "--clone_name ${subvolume}"
    CEPH_EXIT_CODE=$?

    if [ "${CEPH_EXIT_CODE}" -ne 0 ] ; then
      # The status call fails for subvolumes that are not clones; its output
      # is logged as-is.
      log "not-canceling" "${subvolume}" "${CEPH_EXIT_CODE}" "$(cat "${TEMPFILE_1}")"
    else
      STATUS=$(jq -r '.status.state' "${TEMPFILE_1}")

      case "${STATUS}" in
        pending)
          stat_found_pending=$((stat_found_pending + 1))
          if [ "${CANCEL_PENDING_CLONES}" -eq 1 ] ; then
            if _cancel_clone "${subvolume}" "cancel-pending-clone" "error-cancel-pending-clone" ; then
              stat_clone_canceled=$((stat_clone_canceled + 1))
            fi
          elif [ "${LIST_CLONES}" -eq 1 ] ; then
            # Only reported when no cancel was attempted; after a cancel
            # TEMPFILE_1 no longer holds the status document.
            log "not-canceling" "${subvolume}" "${CEPH_EXIT_CODE}" "${STATUS}"
          fi
          ;;

        canceled)
          stat_found_canceled=$((stat_found_canceled + 1))
          if [ "${REMOVE_ALL_CANCELED_SUBVOLUMES}" -eq 1 ] ; then
            check_and_remove_subvolume "${subvolume}" "canceled"
          else
            log "not-canceling" "${subvolume}" "${CEPH_EXIT_CODE}" "clone already canceled."
          fi
          ;;

        in-progress)
          stat_found_inprogress=$((stat_found_inprogress + 1))
          if [ "${CANCEL_IN_PROGRESS}" -eq 1 ] ; then
            if _cancel_clone "${subvolume}" "cancel-in-progress-clone" "error-cancel-in-progress-clone" ; then
              stat_canceled_inprogress=$((stat_canceled_inprogress + 1))
            fi
          else
            log "not-canceling" "${subvolume}" "${CEPH_EXIT_CODE}" "${STATUS}"
          fi
          ;;

        *)
          log "not-canceling" "${subvolume}" "${CEPH_EXIT_CODE}" "${STATUS}"
          ;;
      esac
    fi

    subvol_completed=$((subvol_completed + 1))
    progressBar "${subvol_number}" "${subvol_completed}" "subvolumes. Found: 'P': ${stat_found_pending} 'In-P': ${stat_found_inprogress} 'C': ${stat_found_canceled}. Action taken: ${stat_clone_canceled} 'P' canceled. ${stat_canceled_inprogress} 'In-P' canceled. ${stat_clone_removed} 'C' Removed."
  fi
done 3< "${TEMPFILE_0}"

echo
echo "Log file: ${LOGFILE}"
echo "Cmd file: ${CMDFILE}"

case_upload

#
# Print the log lines worth showing: lines containing ENOTSUP are always
# dropped; outside of the clone listing mode, lines ending in "complete"
# are dropped as well.
# Globals: LOGFILE, LIST_CLONES (read).
filter_log () {
  if [ "${LIST_CLONES}" -eq 1 ] ; then
    grep -v ENOTSUP "${LOGFILE}"
  else
    grep -v ENOTSUP "${LOGFILE}" | grep -Ev "complete$"
  fi
}

# Only start the pager when there is something to show.
if [ "$(filter_log | wc -l)" -ne "0" ] ; then
  filter_log | column -t | less
fi

echo "Finished."

exit 0
    (1-1/1)