Bug #23793
ceph-osd consumed 10+GB rss memory
Status:
New
Priority:
Normal
Assignee:
-
Category:
Performance/Resource Usage
Target version:
-
% Done:
0%
Source:
Community (dev)
Tags:
Backport:
Regression:
No
Severity:
1 - critical
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
OSD
Pull request ID:
Crash signature (v1):
Crash signature (v2):
Description
After 26 GB of data was written, ceph-osd's memory (RSS) reached more than 10 GB.
The objectstore backend is KStore, on the master branch (commit 201064a9aad19f8897968a35c46c378875721c5).
https://github.com/ceph/ceph/pull/21487 must be applied to support the rocksdb statistics shown below.
How to reproduce this problem
# mkfs.xfs /dev/xvdb1
# mount /dev/xvdb1 /var/lib/ceph/osd/ceph-0/
# df -hl | grep xvdb1
/dev/xvdb1 50G 33M 50G 1% /home/ubuntu/work/osd
Enable rocksdb perf
rocksdb_perf = true
rocksdb_collect_memory_stats = true
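For reference, a minimal ceph.conf fragment; placing the options under the [osd] section is my assumption here (the global section works as well):
[osd]
    rocksdb_perf = true
    rocksdb_collect_memory_stats = true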
Set up the test cluster
# CEPH_NUM_MON=1 CEPH_NUM_OSD=1 CEPH_NUM_MDS=0 CEPH_NUM_MGR=1 CEPH_NUM_RGW=0 ../src/vstart.sh -n -X -l -K --kstore_path /var/lib/ceph/osd/ceph-0
Create datapool
# ./bin/ceph -c ceph.conf osd pool create datapool 512 512 replicated replicated_rule
Check the cluster's space
# ceph -c ceph.conf df
GLOBAL:
    SIZE      AVAIL     RAW USED     %RAW USED
    50.0G     49.9G         107M          0.21
POOLS:
    NAME         ID     USED     %USED     MAX AVAIL     OBJECTS
    datapool     1         0         0         49.4G           0
Record ceph-osd's memory history
# ps aux | grep ceph-osd
# top -b -p 21406 > trace/k.top 2>&1 &
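To pull the RSS series out of the batch log afterwards, a sketch assuming top's default field layout, where RES is the 6th column of the process line:
# awk '$1 == 21406 {print $6}' trace/k.top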
Kick off the stress test
# rados bench -p datapool -b 1024 -t 256 5000 write -c ceph.conf --no-cleanup > trace/1k.stress 2>&1 &
...
Total time run: 5000.16
Total writes made: 27272081
Write size: 1024
Object size: 1024
Bandwidth (MB/sec): 5.32641
Stddev Bandwidth: 1.95517
Max bandwidth (MB/sec): 13.5996
Min bandwidth (MB/sec): 0.714844
Average IOPS: 5454
Stddev IOPS: 2002
Max IOPS: 13926
Min IOPS: 732
Average Latency(s): 0.0469346
Stddev Latency(s): 0.0624825
Max latency(s): 0.972668
Min latency(s): 0.00261732
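Sanity check: 27,272,081 writes × 1024 bytes ≈ 26.0 GiB, which matches the ~26 GB of written data and the pool usage reported below.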
Check the cluster's space after the bench finished
# ceph -c ceph.conf df
GLOBAL:
    SIZE      AVAIL     RAW USED     %RAW USED
    50.0G     9.53G        40.4G         80.93
POOLS:
    NAME         ID     USED      %USED     MAX AVAIL     OBJECTS
    datapool     1      26.0G     74.23         9.03G     27272082
Memory consumed by rocksdb
// 178M
# ./bin/ceph daemon /tmp/ceph-asok.tdZJMs/osd.0.asok dump_objectstore_kv_stats
{
    "block_cache_usage": "129260760",
    "block_cache_pinned_blocks_usage": "3297442",
    "rocksdb_memtable_usage": "45703896",
    "rocksdb_index_filter_blocks_usage": "0"
}
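These four counters sum to 178,262,098 bytes, i.e. the ~178 MB noted above, so rocksdb itself accounts for well under 1 GB of the RSS.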
Show mempool statistics
// 1G
# ./bin/ceph daemon /tmp/ceph-asok.tdZJMs/osd.0.asok dump_mempools
{
    "mempool": {
        "by_pool": {
            "bloom_filter": {
                "items": 0,
                "bytes": 0
            },
            "bluestore_alloc": {
                "items": 0,
                "bytes": 0
            },
            "bluestore_cache_data": {
                "items": 0,
                "bytes": 0
            },
            "bluestore_cache_onode": {
                "items": 0,
                "bytes": 0
            },
            "bluestore_cache_other": {
                "items": 0,
                "bytes": 0
            },
            "bluestore_fsck": {
                "items": 0,
                "bytes": 0
            },
            "bluestore_txc": {
                "items": 0,
                "bytes": 0
            },
            "bluestore_writing_deferred": {
                "items": 0,
                "bytes": 0
            },
            "bluestore_writing": {
                "items": 0,
                "bytes": 0
            },
            "bluefs": {
                "items": 0,
                "bytes": 0
            },
            "buffer_anon": {
                "items": 1047573,
                "bytes": 179203863
            },
            "buffer_meta": {
                "items": 1,
                "bytes": 64
            },
            "osd": {
                "items": 512,
                "bytes": 6438912
            },
            "osd_mapbl": {
                "items": 4,
                "bytes": 16096
            },
            "osd_pglog": {
                "items": 3098694,
                "bytes": 872768320
            },
            "osdmap": {
                "items": 73,
                "bytes": 15216
            },
            "osdmap_mapping": {
                "items": 0,
                "bytes": 0
            },
            "pgmap": {
                "items": 0,
                "bytes": 0
            },
            "mds_co": {
                "items": 0,
                "bytes": 0
            },
            "unittest_1": {
                "items": 0,
                "bytes": 0
            },
            "unittest_2": {
                "items": 0,
                "bytes": 0
            }
        },
        "total": {
            "items": 4146857,
            "bytes": 1058442471
        }
    }
}
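Of the ~1.06 GB tracked by the mempools, osd_pglog dominates with ~872 MB (3,098,694 items) and buffer_anon adds ~179 MB; everything else is negligible. Together with rocksdb's ~178 MB, the accounted-for memory is roughly 1.2 GB, far short of the observed RSS.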
Check ceph-osd's memory
// 10.5GB
# ./bin/ceph tell osd.0 heap stats -c ceph.conf
osd.0 tcmalloc heap stats:------------------------------------------------
MALLOC: *11040628704 (10529.2 MiB)* Bytes in use by application
MALLOC: + 0 ( 0.0 MiB) Bytes in page heap freelist
MALLOC: + 89499832 ( 85.4 MiB) Bytes in central cache freelist
MALLOC: + 5579312 ( 5.3 MiB) Bytes in transfer cache freelist
MALLOC: + 23680312 ( 22.6 MiB) Bytes in thread cache freelists
MALLOC: + 46919840 ( 44.7 MiB) Bytes in malloc metadata
MALLOC: ------------
MALLOC: = 11206308000 (10687.2 MiB) Actual memory used (physical + swap)
MALLOC: + 368549888 ( 351.5 MiB) Bytes released to OS (aka unmapped)
MALLOC: ------------
MALLOC: = 11574857888 (11038.6 MiB) Virtual address space used
MALLOC:
MALLOC: 703421 Spans in use
MALLOC: 47 Thread heaps in use
MALLOC: 8192 Tcmalloc page size
------------------------------------------------
Call ReleaseFreeMemory() to release freelist memory to the OS (via madvise()).
Bytes released to the OS take up virtual address space but no physical memory.
10.5 GB - 0.17 GB (rocksdb) ≈ 10+ GB: I don't know what is taking this 10+ GB of memory. Even after also subtracting the ~1 GB tracked by the mempools, more than 9 GB remains unaccounted for.
History
#1 Updated by Honggang Yang almost 6 years ago
the "mon max pg per osd" is 1024 in my test.
#2 Updated by Honggang Yang almost 6 years ago
Setting osd_debug_op_order to false fixes this problem.
My ceph cluster was created through vstart.sh, which sets osd_debug_op_order to true.
osd_debug_op_order enables the following code snippet, which eats a huge amount of memory.
// osd/PrimaryLogPG.cc
// verify that we are doing this in order?
if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
    !pool.info.is_tier() && !pool.info.has_tiers()) {
  map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
  ceph_tid_t t = m->get_tid();
  client_t n = m->get_source().num();
  map<client_t,ceph_tid_t>::iterator p = cm.find(n);
  if (p == cm.end()) {
    dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
    cm[n] = t;
  } else {
    dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
    if (p->second > t) {
      derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
      assert(0 == "out of order op");
    }
    p->second = t;
  }
}
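Why this snippet can account for the missing memory: debug_op_order[obc->obs.oi.soid] default-constructs one outer map entry per distinct object written (the member is declared in osd/PrimaryLogPG.h as, roughly, map<hobject_t, map<client_t,ceph_tid_t>>), and nothing in the write path erases entries, so after ~27.3 million objects the map holds ~27.3 million entries. A minimal back-of-envelope sketch follows; FakeHObject and the overhead constants are my stand-in assumptions, not Ceph code:

// Back-of-envelope model of debug_op_order growth: one outer map entry per
// distinct object written, never erased while the PG runs steady-state I/O.
// FakeHObject is a stand-in for hobject_t (three strings plus a few scalar
// fields); the 40 B node and 64 B inner-map figures are rough assumptions
// for std::map on a 64-bit glibc system.
#include <cstdint>
#include <cstdio>
#include <string>

struct FakeHObject {
  std::string oid, key, nspace;  // hobject_t holds object name, locator key, namespace
  uint64_t pool;
  uint64_t snap;
  uint32_t hash;
};

int main() {
  const uint64_t objects = 27272082;  // object count reported by `ceph df`
  // per entry: red-black tree node overhead + key + one-element inner map
  const uint64_t per_entry = 40 + sizeof(FakeHObject) + 64;
  std::printf("sizeof(FakeHObject) = %zu bytes\n", sizeof(FakeHObject));
  std::printf("modeled footprint   = %.1f GB for %llu objects\n",
              objects * per_entry / 1e9,
              (unsigned long long)objects);
  // rados bench object names are ~40 characters, which exceeds the small
  // string optimization, so each key adds another heap allocation or two;
  // that pushes the real figure from ~6 GB toward the observed ~9-10 GB.
  return 0;
}

This would also explain why the memory shows up in neither dump_objectstore_kv_stats nor dump_mempools: as far as I can tell, debug_op_order is a plain std::map that is not allocated from any mempool.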