Project

General

Profile

Actions

Bug #5269

closed

osd: EEXIST on mkcoll

Added by Sage Weil almost 11 years ago. Updated almost 11 years ago.

Status:
Resolved
Priority:
Urgent
Assignee:
-
Category:
OSD
Target version:
-
% Done:

0%

Source:
Q/A
Tags:
Backport:
Regression:
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

2013-06-06 18:16:33.344842 7f669738b700  0 filestore(/var/lib/ceph/osd/ceph-1)  error (17) File exists not handled on operation 20 (5109.0.0, or op 0, counting from 0)
2013-06-06 18:16:33.344865 7f669738b700  0 filestore(/var/lib/ceph/osd/ceph-1) unexpected error code
2013-06-06 18:16:33.344868 7f669738b700  0 filestore(/var/lib/ceph/osd/ceph-1)  transaction dump:
{ "ops": [
        { "op_num": 0,
          "op_name": "mkcoll",
          "collection": "1.0_head"},
        { "op_num": 1,
          "op_name": "collection_setattr",
          "collection": "1.0_head",
          "name": "info",
          "length": 1},
        { "op_num": 2,
          "op_name": "omap_setkeys",
          "collection": "meta",
          "oid": "16ef7597\/infos\/head\/\/-1",
          "attr_lens": { "1.0_biginfo": 141,
              "1.0_epoch": 4,
              "1.0_info": 568}},
        { "op_num": 3,
          "op_name": "remove",
          "collection": "meta",
          "oid": "103aebbe\/pglog_1.0\/0\/\/-1"},
        { "op_num": 4,
          "op_name": "touch",
          "collection": "meta",
          "oid": "103aebbe\/pglog_1.0\/0\/\/-1"},
        { "op_num": 5,
          "op_name": "omap_setkeys",
          "collection": "meta",
          "oid": "103aebbe\/pglog_1.0\/0\/\/-1",
          "attr_lens": { "divergent_priors": 4}},
        { "op_num": 6,
          "op_name": "nop"}]}

job was
ubuntu@teuthology:/a/sage-2013-06-06_17:56:44-rados-wip-mon-testing-basic/32544$ cat orig.config.yaml 
kernel:
  kdb: true
  sha1: 19bb6a83cb93383b363cc5956e304213f0f1b79f
machine_type: plana
nuke-on-error: true
overrides:
  ceph:
    conf:
      mon:
        debug mon: 20
        debug ms: 20
        debug paxos: 20
    fs: xfs
    log-whitelist:
    - slow request
    sha1: 08923eb842a9768ff556939221e63b983724e9bf
  install:
    ceph:
      sha1: 08923eb842a9768ff556939221e63b983724e9bf
  s3tests:
    branch: master
  workunit:
    sha1: 08923eb842a9768ff556939221e63b983724e9bf
roles:
- - mon.0
  - mon.1
  - mon.2
  - mds.0
  - client.0
- - osd.0
- - osd.1
- - osd.2
tasks:
- chef: null
- clock.check: null
- install: null
- ceph: null
- thrashosds:
    chance_down: 1.0
    powercycle: true
    timeout: 600
- ceph-fuse: null
- workunit:
    clients:
      client.0:
      - rados/test.sh


Related issues 8 (0 open — 8 closed)

Related to Ceph - Bug #5473: osd/ReplicatedPG.cc: 1379: FAILED assert(0) in trim_object() on master, cuttlefish — Resolved — 06/27/2013

Actions
Related to Ceph - Bug #5625: osd: scrub finds 20 missing objects on cuttlefish — Resolved — 07/14/2013

Actions
Related to Ceph - Bug #5392: osd: unfound objects from thrashing — Resolved — 06/18/2013

Actions
Related to Ceph - Bug #5507: osd: ENOENT on clone — Resolved — 07/05/2013

Actions
Related to Ceph - Bug #5631: osd/ReplicatedPG.cc: 3036: FAILED assert(iter) — Resolved — 07/15/2013

Actions
Is duplicate of Ceph - Bug #5270: osd: crash in PG::peek_map_epoch() — Resolved — Samuel Just — 06/06/2013

Actions
Has duplicate Ceph - Bug #5372: osd/SnapMapper.cc: 270: FAILED assert(check(oid)) — Duplicate — Samuel Just — 06/16/2013

Actions
Has duplicate Ceph - Bug #5637: OSD crash upon pool creation — Rejected — 07/15/2013

Actions
Actions #1

Updated by Ian Colle almost 11 years ago

  • Assignee set to Samuel Just
Actions #2

Updated by Samuel Just almost 11 years ago

  • Status changed from 12 to Duplicate

This is probably the same thing as 5270.

Actions #3

Updated by Sage Weil almost 11 years ago

  • Status changed from Duplicate to 12

I don't think this was #5270; I just hit it again on:

ubuntu@teuthology:/a/teuthology-2013-06-16_17:50:57-rados-master-testing-basic/37422$ cat orig.config.yaml 
kernel:
  kdb: true
  sha1: f28d6bf9ef4d3f8dd2e669fd21be827663f215e2
machine_type: plana
nuke-on-error: true
overrides:
  ceph:
    conf:
      global:
        ms inject socket failures: 5000
      mon:
        debug mon: 20
        debug ms: 20
        debug paxos: 20
    fs: xfs
    log-whitelist:
    - slow request
    sha1: e3fb095d8aa88556e4356c76b848fa61b09acbc0
  install:
    ceph:
      sha1: e3fb095d8aa88556e4356c76b848fa61b09acbc0
  s3tests:
    branch: master
  workunit:
    sha1: e3fb095d8aa88556e4356c76b848fa61b09acbc0
roles:
- - mon.a
  - mon.c
  - osd.0
  - osd.1
  - osd.2
- - mon.b
  - mds.a
  - osd.3
  - osd.4
  - osd.5
  - client.0
tasks:
- chef: null
- clock.check: null
- install: null
- ceph:
    log-whitelist:
    - wrongly marked me down
    - objects unfound and apparently lost
- thrashosds:
    chance_pgnum_grow: 1
    chance_pgpnum_fix: 1
    timeout: 1200
- rados:
    clients:
    - client.0
    objects: 500
    op_weights:
      delete: 50
      read: 100
      rollback: 50
      snap_create: 50
      snap_remove: 50
      write: 100
    ops: 4000
Actions #4

Updated by Sage Weil almost 11 years ago

and

ubuntu@teuthology:/a/teuthology-2013-06-16_17:50:57-rados-master-testing-basic/37420$ cat orig.config.yaml 
kernel:
  kdb: true
  sha1: f28d6bf9ef4d3f8dd2e669fd21be827663f215e2
machine_type: plana
nuke-on-error: true
overrides:
  ceph:
    conf:
      global:
        ms inject socket failures: 5000
      mon:
        debug mon: 20
        debug ms: 20
        debug paxos: 20
    fs: xfs
    log-whitelist:
    - slow request
    sha1: e3fb095d8aa88556e4356c76b848fa61b09acbc0
  install:
    ceph:
      sha1: e3fb095d8aa88556e4356c76b848fa61b09acbc0
  s3tests:
    branch: master
  workunit:
    sha1: e3fb095d8aa88556e4356c76b848fa61b09acbc0
roles:
- - mon.a
  - mon.c
  - osd.0
  - osd.1
  - osd.2
- - mon.b
  - mds.a
  - osd.3
  - osd.4
  - osd.5
  - client.0
tasks:
- chef: null
- clock.check: null
- install: null
- ceph:
    log-whitelist:
    - wrongly marked me down
    - objects unfound and apparently lost
- thrashosds:
    chance_pgnum_grow: 1
    chance_pgpnum_fix: 1
    timeout: 1200
- rados:
    clients:
    - client.0
    objects: 500
    op_weights:
      delete: 10
      read: 45
      write: 45
    ops: 4000

Actions #5

Updated by Samuel Just almost 11 years ago

Running with logging overnight to reproduce.

Actions #6

Updated by Sage Weil almost 11 years ago

ubuntu@teuthology:/a/teuthology-2013-06-17_01:00:05-rados-master-testing-basic/37637

Actions #7

Updated by Samuel Just almost 11 years ago

  • Status changed from 12 to Resolved
Actions #8

Updated by Sage Weil almost 11 years ago

  • Status changed from Resolved to 12
  -209> 2013-07-16 02:23:38.456961 fb4d700  0 filestore(/var/lib/ceph/osd/ceph-1)  error (17) File exists not handled on operation 20 (5474.0.0, or op 0, counting from 0)
  -208> 2013-07-16 02:23:38.479432 fb4d700  0 filestore(/var/lib/ceph/osd/ceph-1) unexpected error code
  -207> 2013-07-16 02:23:38.482962 fb4d700  0 filestore(/var/lib/ceph/osd/ceph-1)  transaction dump:
{ "ops": [
        { "op_num": 0,
          "op_name": "mkcoll",
          "collection": "2.19_head"},
        { "op_num": 1,
          "op_name": "op_split_collection",
          "collection": "2.9_head",
          "bits": "5",
          "rem": "25",
          "dest": "2.19_head"},
        { "op_num": 2,
          "op_name": "collection_setattr",
          "collection": "2.19_head",
          "name": "info",
          "length": 1},
        { "op_num": 3,
          "op_name": "omap_setkeys",
          "collection": "meta",
          "oid": "16ef7597\/infos\/head\/\/-1",
          "attr_lens": { "2.19_biginfo": 98,
              "2.19_epoch": 4,
              "2.19_info": 568}},
        { "op_num": 4,
          "op_name": "touch",
          "collection": "meta",
          "oid": "516be47b\/pglog_2.19\/0\/\/-1"},
        { "op_num": 5,
          "op_name": "unknown",
          "op_code": 37}]}

on run
ubuntu@teuthology:/a/teuthology-2013-07-16_01:00:13-rados-next-testing-basic/68429$ cat orig.config.yaml 
kernel:
  kdb: true
  sha1: 365b57b1317524bb0cdd15859a224ba1ab58d1d7
machine_type: plana
nuke-on-error: true
overrides:
  admin_socket:
    branch: next
  ceph:
    conf:
      global:
        ms inject socket failures: 5000
      mon:
        debug mon: 20
        debug ms: 20
        debug paxos: 20
      osd:
        osd op thread timeout: 60
    fs: btrfs
    log-whitelist:
    - slow request
    sha1: 39e5a2a406b77fa82e9a78c267b679d49927e3c3
    valgrind:
      mds:
      - --tool=memcheck
      mon:
      - --tool=memcheck
      - --leak-check=full
      - --show-reachable=yes
      osd:
      - --tool=memcheck
  install:
    ceph:
      flavor: notcmalloc
      sha1: 39e5a2a406b77fa82e9a78c267b679d49927e3c3
  s3tests:
    branch: next
  workunit:
    sha1: 39e5a2a406b77fa82e9a78c267b679d49927e3c3
roles:
- - mon.a
  - mon.c
  - osd.0
  - osd.1
  - osd.2
- - mon.b
  - mds.a
  - osd.3
  - osd.4
  - osd.5
  - client.0
tasks:
- chef: null
- clock.check: null
- install: null
- ceph:
    log-whitelist:
    - wrongly marked me down
    - objects unfound and apparently lost
- thrashosds:
    chance_pgnum_grow: 1
    chance_pgpnum_fix: 1
    timeout: 1200
- workunit:
    clients:
      client.0:
      - rados/test.sh

Actions #9

Updated by Sage Weil almost 11 years ago

Andrey saw this on latest cuttlefish, see #5637

Actions #10

Updated by Sage Weil almost 11 years ago

  • Status changed from 12 to 7
Actions #11

Updated by Samuel Just almost 11 years ago

  • Assignee deleted (Samuel Just)
Actions #12

Updated by Samuel Just almost 11 years ago

  • Status changed from 7 to Resolved
Actions

Also available in: Atom, PDF