Project

General

Profile

Bug #23793

ceph-osd consumed 10+GB rss memory

Added by Honggang Yang almost 6 years ago. Updated almost 6 years ago.

Status:
New
Priority:
Normal
Assignee:
-
Category:
Performance/Resource Usage
Target version:
-
% Done:

0%

Source:
Community (dev)
Tags:
Backport:
Regression:
No
Severity:
1 - critical
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
OSD
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

After 26 GB of data was written, ceph-osd's memory (RSS) reached 10+ GB.

The objectstore backend is KStore, built from the master branch (201064a9aad19f8897968a35c46c378875721c5).

https://github.com/ceph/ceph/pull/21487 must be applied to support rocksdb statistics.

how to reproduce this problem

  # mkfs.xfs /dev/xvdb1                                                                                
  # mount /dev/xvdb1 /var/lib/ceph/osd/ceph-0/                                                         
  # df -hl | grep xvdb1                                                                                
  /dev/xvdb1       50G   33M   50G   1% /home/ubuntu/work/osd  

enable rocksdb perf

  rocksdb_perf = true                                                                               
  rocksdb_collect_memory_stats = true 

set up the test cluster

# CEPH_NUM_MON=1 CEPH_NUM_OSD=1 CEPH_NUM_MDS=0 CEPH_NUM_MGR=1  CEPH_NUM_RGW=0 ../src/vstart.sh  -n -X -l -K --kstore_path /var/lib/ceph/osd/ceph-0

create datapool

# ./bin/ceph -c ceph.conf osd pool create datapool 512 512 replicated  replicated_rule

check the cluster's space

  # ceph -c ceph.conf df                                                                            
  GLOBAL:                                                                                           
      SIZE      AVAIL     RAW USED     %RAW USED                                                    
      50.0G     49.9G         107M          0.21                                                    
  POOLS:                                                                                            
      NAME         ID     USED     %USED     MAX AVAIL     OBJECTS                                  
      datapool     1         0         0         49.4G           0  

record ceph-osd's memory history

  # ps aux | grep ceph-osd                                                                          
  # top -b -p 21406 > trace/k.top 2>&1 & 

kick off the stress test
  # rados bench -p datapool -b 1024 -t 256 5000 write -c ceph.conf --no-cleanup > trace/1k.stress 2>&1 &
  ...                                                                                               
  Total time run:         5000.16                                                                   
  Total writes made:      27272081                                                                  
  Write size:             1024                                                                      
  Object size:            1024                                                                      
  Bandwidth (MB/sec):     5.32641                                                                   
  Stddev Bandwidth:       1.95517                                                                   
  Max bandwidth (MB/sec): 13.5996                                                                   
  Min bandwidth (MB/sec): 0.714844                                                                  
  Average IOPS:           5454                                                                      
  Stddev IOPS:            2002                                                                      
  Max IOPS:               13926                                                                     
  Min IOPS:               732                                                                       
  Average Latency(s):     0.0469346                                                                 
  Stddev Latency(s):      0.0624825                                                                 
  Max latency(s):         0.972668                                                                  
  Min latency(s):         0.00261732 

check cluster's space after bench finished

  #ceph -c ceph.conf df                                                                             
  GLOBAL:                                                                                           
      SIZE      AVAIL     RAW USED     %RAW USED                                                    
      50.0G     9.53G        40.4G         80.93                                                    
  POOLS:                                                                                            
      NAME         ID     USED      %USED     MAX AVAIL     OBJECTS                                 
      datapool     1      26.0G     74.23         9.03G     27272082  

memory consumed by rocksdb

  // 178M                                                                                           
  AWS#./bin/ceph daemon /tmp/ceph-asok.tdZJMs/osd.0.asok dump_objectstore_kv_stats                  
  {                                                                                                 
      "block_cache_usage": "129260760",                                                             
      "block_cache_pinned_blocks_usage": "3297442",                                                 
      "rocksdb_memtable_usage": "45703896",                                                         
      "rocksdb_index_filter_blocks_usage": "0"                                                      
  } 

show mempool statistics

  // 1G                                                                                             
  #./bin/ceph daemon /tmp/ceph-asok.tdZJMs/osd.0.asok dump_mempools                                 
  {                                                                                                 
      "mempool": {                                                                                  
          "by_pool": {                                                                              
              "bloom_filter": {                                                                     
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "bluestore_alloc": {                                                                  
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "bluestore_cache_data": {                                                             
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "bluestore_cache_onode": {                                                            
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "bluestore_cache_other": {                                                            
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "bluestore_fsck": {                                                                   
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "bluestore_txc": {                                                                    
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "bluestore_writing_deferred": {       
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "bluestore_writing": {                                                                
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "bluefs": {                                                                           
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "buffer_anon": {                                                                      
                  "items": 1047573,                                                                 
                  "bytes": 179203863                                                                
              },                                                                                    
              "buffer_meta": {                                                                      
                  "items": 1,                                                                       
                  "bytes": 64                                                                       
              },                                                                                    
              "osd": {                                                                              
                  "items": 512,                                                                     
                  "bytes": 6438912                                                                  
              },                                                                                    
              "osd_mapbl": {                                                                        
                  "items": 4,                                                                       
                  "bytes": 16096                                                                    
              },                                                                                    
              "osd_pglog": {                                                                        
                  "items": 3098694,                                                                 
                  "bytes": 872768320                                                                
              },                                                                                    
              "osdmap": {                                                                           
                  "items": 73,                                                                      
                  "bytes": 15216                                                                    
              },                                
              "osdmap_mapping": {                                                                   
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "pgmap": {                                                                            
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "mds_co": {                                                                           
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "unittest_1": {                                                                       
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              },                                                                                    
              "unittest_2": {                                                                       
                  "items": 0,                                                                       
                  "bytes": 0                                                                        
              }                                                                                     
          },                                                                                        
          "total": {                                                                                
              "items": 4146857,                                                                     
              "bytes": 1058442471                                                                   
          }                                                                                         
      }                                                                                             
  }                     

check ceph-osd's memory:

  // 10.5GB                                                                                         
  #./bin/ceph tell osd.0 heap stats -c ceph.conf                                                    
  osd.0 tcmalloc heap stats:------------------------------------------------                        
  MALLOC:    *11040628704 (10529.2 MiB)* Bytes in use by application                                  
  MALLOC: +            0 (    0.0 MiB) Bytes in page heap freelist                                  
  MALLOC: +     89499832 (   85.4 MiB) Bytes in central cache freelist                              
  MALLOC: +      5579312 (    5.3 MiB) Bytes in transfer cache freelist                             
  MALLOC: +     23680312 (   22.6 MiB) Bytes in thread cache freelists                              
  MALLOC: +     46919840 (   44.7 MiB) Bytes in malloc metadata                                     
  MALLOC:   ------------                                                                            
  MALLOC: =  11206308000 (10687.2 MiB) Actual memory used (physical + swap)                         
  MALLOC: +    368549888 (  351.5 MiB) Bytes released to OS (aka unmapped)                          
  MALLOC:   ------------                                                                            
  MALLOC: =  11574857888 (11038.6 MiB) Virtual address space used                                   
  MALLOC:                                                                                           
  MALLOC:         703421              Spans in use                                                  
  MALLOC:             47              Thread heaps in use                                           
  MALLOC:           8192              Tcmalloc page size                                            
  ------------------------------------------------                                                  
  Call ReleaseFreeMemory() to release freelist memory to the OS (via madvise()).                    
  Bytes released to the OS take up virtual address space but no physical memory.   

10.5G - 0.17G = 10+G; I don't know what is consuming this 10+ GB of memory.

History

#1 Updated by Honggang Yang almost 6 years ago

the "mon max pg per osd" is 1024 in my test.

#2 Updated by Honggang Yang almost 6 years ago

Setting osd_debug_op_order to false fixes this problem.
My ceph cluster was created through vstart.sh, which sets osd_debug_op_order to true.

osd_debug_op_order enables the following code snippet, which consumes a huge
amount of memory.

  // osd/PrimaryLogPG.cc                                                                            
  // verify that we are doing this in order?                                                        
  if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&                              
      !pool.info.is_tier() && !pool.info.has_tiers()) {                                             
    map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];                                
    ceph_tid_t t = m->get_tid();                                                                    
    client_t n = m->get_source().num();                                                             
    map<client_t,ceph_tid_t>::iterator p = cm.find(n);                                              
    if (p == cm.end()) {                                                                            
      dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;                  
      cm[n] = t;                                                                                    
    } else {                                                                                        
      dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;   
      if (p->second > t) {                                                                          
        derr << "bad op order, already applied " << p->second << " > this " << t << dendl;          
        assert(0 == "out of order op");                                                             
      }                                                                                             
      p->second = t;                                                                                
    }                                                                                               
  }            

Also available in: Atom PDF