Bug #8344
closed — Upstart scripts silently fail when asok missing
0%
Description
In situations like Issue 7188, the admin socket at /var/run/ceph/ceph-<daemon>.<name>.asok can go missing. When this happens, the Upstart (and possibly sysvinit) scripts do not properly stop or restart Ceph daemons, as the following session shows:
root@node1:~# ls /var/run/ceph
total 0
drwxr-xr-x 2 root root 100 May 13 18:55 .
drwxr-xr-x 22 root root 860 May 13 18:33 ..
srwxr-xr-x 1 root root 0 May 13 18:55 ceph-mon.node1.asok
srwxr-xr-x 1 root root 0 May 13 18:55 ceph-osd.0.asok
srwxr-xr-x 1 root root 0 May 13 18:55 ceph-osd.1.asok
root@node1:~# ps -ef | grep ceph-mon
root 4679 1 0 18:55 ? 00:00:00 /bin/sh -e -c /usr/bin/ceph-mon --cluster="${cluster:-ceph}" -i "$id" -f /bin/sh
root 4680 4679 0 18:55 ? 00:00:05 /usr/bin/ceph-mon --cluster=ceph -i node1 -f
root 17116 29198 0 19:38 pts/0 00:00:00 grep --color=auto ceph-mon
root@node1:~# logrotate --force /etc/logrotate.d/ceph
root@node1:~# ls /var/run/ceph
total 0
drwxr-xr-x 2 root root 80 May 13 19:39 .
drwxr-xr-x 22 root root 860 May 13 18:33 ..
srwxr-xr-x 1 root root 0 May 13 18:55 ceph-osd.0.asok
srwxr-xr-x 1 root root 0 May 13 18:55 ceph-osd.1.asok
root@node1:~# ps -ef | grep ceph-mon
root 4680 1 0 18:55 ? 00:00:05 /usr/bin/ceph-mon --cluster=ceph -i node1 -f
root 17329 29198 0 19:39 pts/0 00:00:00 grep --color=auto ceph-mon
root@node1:~# stop ceph-all
ceph-all stop/waiting
root@node1:~# ps -ef | grep ceph-mon
root 4680 1 0 18:55 ? 00:00:05 /usr/bin/ceph-mon --cluster=ceph -i node1 -f
root 17416 29198 0 19:39 pts/0 00:00:00 grep --color=auto ceph-mon
root@node1:~# restart ceph-all
restart: Unknown instance:
root@node1:~# ps -ef | grep ceph-mon
root 4680 1 0 18:55 ? 00:00:05 /usr/bin/ceph-mon --cluster=ceph -i node1 -f
root 17491 29198 0 19:39 pts/0 00:00:00 grep --color=auto ceph-mon
root@node1:~# start ceph-all
ceph-all start/running
root@node1:~# ps -ef | grep ceph-mon
root 4680 1 0 18:55 ? 00:00:05 /usr/bin/ceph-mon --cluster=ceph -i node1 -f
root 17833 29198 0 19:40 pts/0 00:00:00 grep --color=auto ceph-mon
root@node1:~# ls /var/run/ceph
total 0
drwxr-xr-x 2 root root 80 May 13 19:39 .
drwxr-xr-x 22 root root 860 May 13 18:33 ..
srwxr-xr-x 1 root root 0 May 13 19:39 ceph-osd.0.asok
srwxr-xr-x 1 root root 0 May 13 19:39 ceph-osd.1.asok
Ideally, the Upstart scripts would be able to stop/restart the daemon even when no asok is present, or would at least print some output to notify the operator of an issue in need of manual intervention.