Bug #6495
CPU load on cluster nodes is very high whenever there are write requests to the cluster
0%
Description
ceph version 0.61.8
Kernel Ubuntu 12.10 (GNU/Linux 3.5.0-41-generic )
CPU: 24 processors, Ram 64GB
Env: 3 cluster nodes (10 OSDs/node, 2 SSDs in RAID0 as journal for the 10 OSDs/node), using a dedicated network (private network: infiniband over IP)
+ If I used the test tool "fio" to generate write requests to the cluster, high CPU load occurred on my storage cluster (because of a huge number of kernel threads). But when I generated read requests, the cluster worked fine.
-> I used top, htop, atop, and iostat to check all OSDs, and saw a heavy I/O load. So I think that the ceph-osd processes were causing the high CPU load.
ceph --admin-daemon /var/run/ceph/ceph-osd.29.asok perf dump {"filestore":{"journal_queue_max_ops":4096,"journal_queue_ops":0,"journal_ops":2310279,"journal_queue_max_bytes":41943040,"journal_queue_bytes":0,"journal_bytes":783155502587,"journal_latency":{"avgcount":2310279,"sum":29779.007221000},"journal_wr":2195043,"journal_wr_bytes":{"avgcount":2195043,"sum":790773211136},"op_queue_max_ops":10240,"op_queue_ops":0,"ops":2310279,"op_queue_max_bytes":209715200,"op_queue_bytes":0,"bytes":783128135267,"apply_latency":{"avgcount":2310279,"sum":52114.282829000},"committing":0,"commitcycle":13965,"commitcycle_interval":{"avgcount":13965,"sum":421252.623903000},"commitcycle_latency":{"avgcount":13965,"sum":1815.430763000},"journal_full":0},"leveldb":{"leveldb_get":12262171,"leveldb_transaction":7902360,"leveldb_compact":0,"leveldb_compact_range":0,"leveldb_compact_queue_merge":0,"leveldb_compact_queue_len":0},"mutex-FileJournal::completions_lock":{"wait":{"avgcount":0,"sum":0.000000000}},"mutex-FileJournal::finisher_lock":{"wait":{"avgcount":0,"sum":0.000000000}},"mutex-FileJournal::write_lock":{"wait":{"avgcount":0,"sum":0.000000000}},"mutex-FileJournal::writeq_lock":{"wait":{"avgcount":0,"sum":0.000000000}},"mutex-JOS::ApplyManager::apply_lock":{"wait":{"avgcount":0,"sum":0.000000000}},"mutex-JOS::ApplyManager::com_lock":{"wait":{"avgcount":0,"sum":0.000000000}},"mutex-JOS::SubmitManager::lock":{"wait":{"avgcount":0,"sum":0.000000000}},"osd":{"opq":0,"op_wip":2,"op":11475380,"op_in_bytes":451681929752,"op_out_bytes":953634957004,"op_latency":{"avgcount":11475380,"sum":471643.159999000},"op_r":9912039,"op_r_out_bytes":953634957004,"op_r_latency":{"avgcount":9912039,"sum":314610.837718000},"op_w":1563341,"op_w_in_bytes":451681929752,"op_w_rlat":{"avgcount":1563341,"sum":28750.267983000},"op_w_latency":{"avgcount":1563341,"sum":157032.322281000},"op_rw":0,"op_rw_in_bytes":0,"op_rw_out_bytes":0,"op_rw_rlat":{"avgcount":0,"sum":0.000000000},"op_rw_latency":{"avgcount":0,"sum"
:0.000000000},"subop":969850,"subop_in_bytes":349674255306,"subop_latency":{"avgcount":969850,"sum":42226.727569000},"subop_w":0,"subop_w_in_bytes":349674255306,"subop_w_latency":{"avgcount":969780,"sum":42194.859888000},"subop_pull":0,"subop_pull_latency":{"avgcount":4,"sum":0.058636000},"subop_push":0,"subop_push_in_bytes":0,"subop_push_latency":{"avgcount":66,"sum":31.809045000},"pull":33,"push":12,"push_out_bytes":40026224,"push_in":33,"push_in_bytes":137592832,"recovery_ops":28,"loadavg":5847,"buffer_bytes":0,"numpg":1174,"numpg_primary":673,"numpg_replica":501,"numpg_stray":0,"heartbeat_to_peers":20,"heartbeat_from_peers":0,"map_messages":3563,"map_message_epochs":6189,"map_message_epoch_dups":26238},"throttle-filestore_bytes":{"val":132790,"max":41943040,"get":0,"get_sum":0,"get_or_fail_fail":0,"get_or_fail_success":0,"take":2310281,"take_sum":783155899239,"put":2195044,"put_sum":783155766449,"wait":{"avgcount":7,"sum":0.956338000}},"throttle-filestore_ops":{"val":1,"max":4096,"get":0,"get_sum":0,"get_or_fail_fail":0,"get_or_fail_success":0,"take":2310281,"take_sum":2310281,"put":2195044,"put_sum":2310280,"wait":{"avgcount":0,"sum":0.000000000}},"throttle-msgr_dispatch_throttler-client":{"val":131249,"max":1073741824,"get":11285265,"get_sum":431907691785,"get_or_fail_fail":0,"get_or_fail_success":0,"take":0,"take_sum":0,"put":11285264,"put_sum":431907560536,"wait":{"avgcount":0,"sum":0.000000000}},"throttle-msgr_dispatch_throttler-cluster":{"val":0,"max":1073741824,"get":2396528,"get_sum":351465026511,"get_or_fail_fail":0,"get_or_fail_success":0,"take":0,"take_sum":0,"put":2396528,"put_sum":351465026511,"wait":{"avgcount":0,"sum":0.000000000}},"throttle-msgr_dispatch_throttler-hbclient":{"val":0,"max":1073741824,"get":2593222,"get_sum":121881434,"get_or_fail_fail":0,"get_or_fail_success":0,"take":0,"take_sum":0,"put":2593222,"put_sum":121881434,"wait":{"avgcount":0,"sum":0.000000000}},"throttle-msgr_dispatch_throttler-hbserver":{"val":0,"max":1073741824,"get"
:2617576,"get_sum":123026072,"get_or_fail_fail":0,"get_or_fail_success":0,"take":0,"take_sum":0,"put":2617576,"put_sum":123026072,"wait":{"avgcount":0,"sum":0.000000000}},"throttle-osd_client_bytes":{"val":655537,"max":524288000,"get":11195453,"get_sum":431879485602,"get_or_fail_fail":0,"get_or_fail_success":0,"take":0,"take_sum":0,"put":12476385,"put_sum":431878830065,"wait":{"avgcount":0,"sum":0.000000000}},"throttle-osd_client_messages":{"val":24,"max":100,"get":11366090,"get_sum":11366090,"get_or_fail_fail":0,"get_or_fail_success":0,"take":0,"take_sum":0,"put":11366066,"put_sum":11366066,"wait":{"avgcount":39115,"sum":920.334202000}}}
History
#1 Updated by Sage Weil over 10 years ago
- Status changed from New to Need More Info
- Assignee deleted (
Sage Weil) - Source changed from other to Community (user)
It is normal for writes to generate more load than reads (they are replicated, after all). Is there anything that indicated there was a problem?
#2 Updated by Sage Weil over 10 years ago
- Priority changed from High to Normal
#3 Updated by Khanh Nguyen Dang Quoc over 10 years ago
Yes, there was a problem with the high CPU load. It made the whole system slower, or unable to receive any requests from clients (the RBD volumes attached to KVM). Is there a configuration setting that helps reduce this problem?
#4 Updated by Khanh Nguyen Dang Quoc over 10 years ago
Is there a QoS configuration for each client to help limit the I/O requests sent to the Ceph cluster from RBD?
#5 Updated by Sage Weil over 10 years ago
- Status changed from Need More Info to Closed
Khanh Nguyen Dang Quoc wrote:
Is there a QoS configuration for each client to help limit the I/O requests sent to the Ceph cluster from RBD?
You can use QEMU I/O throttling to limit individual clients...
#6 Updated by Khanh Nguyen Dang Quoc over 10 years ago
Thanks for your reply. I think Ceph needs a configuration option to provide QoS for a specified pool; it would make managing cluster storage easier.