diff -Nru ceph-12.1.1/alpine/APKBUILD ceph-12.1.2/alpine/APKBUILD --- ceph-12.1.1/alpine/APKBUILD 2017-07-17 17:00:46.000000000 +0000 +++ ceph-12.1.2/alpine/APKBUILD 2017-08-01 18:00:09.000000000 +0000 @@ -1,7 +1,7 @@ # Contributor: John Coyle # Maintainer: John Coyle pkgname=ceph -pkgver=12.1.1 +pkgver=12.1.2 pkgrel=0 pkgdesc="Ceph is a distributed object store and file system" pkgusers="ceph" @@ -63,7 +63,7 @@ xmlstarlet yasm " -source="ceph-12.1.1.tar.bz2" +source="ceph-12.1.2.tar.bz2" subpackages=" $pkgname-base $pkgname-common @@ -116,7 +116,7 @@ _udevrulesdir=/etc/udev/rules.d _python_sitelib=/usr/lib/python2.7/site-packages -builddir=$srcdir/ceph-12.1.1 +builddir=$srcdir/ceph-12.1.2 build() { export CEPH_BUILD_VIRTUALENV=$builddir diff -Nru ceph-12.1.1/AUTHORS ceph-12.1.2/AUTHORS --- ceph-12.1.1/AUTHORS 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/AUTHORS 2017-08-01 17:55:40.000000000 +0000 @@ -9,7 +9,7 @@ RBD (kernel) - Ilya Dryomov RGW - Yehuda Sadeh Matt Benjamin -CephFS - John Spray +CephFS - Patrick Donnelly CephFS (kernel) - Yan, Zheng Deployment - Alfredo Deza Teuthology - Zack Cerza diff -Nru ceph-12.1.1/ceph.spec ceph-12.1.2/ceph.spec --- ceph-12.1.1/ceph.spec 2017-07-17 17:00:46.000000000 +0000 +++ ceph-12.1.2/ceph.spec 2017-08-01 18:00:09.000000000 +0000 @@ -22,7 +22,6 @@ %bcond_without ceph_test_package %endif %bcond_with make_check -%bcond_with xio %ifarch s390 s390x %bcond_with tcmalloc %else @@ -62,7 +61,7 @@ # main package definition ################################################################################# Name: ceph -Version: 12.1.1 +Version: 12.1.2 Release: 0%{?dist} %if 0%{?fedora} || 0%{?rhel} Epoch: 2 @@ -77,7 +76,7 @@ Group: System/Filesystems %endif URL: http://ceph.com/ -Source0: http://ceph.com/download/ceph-12.1.1.tar.bz2 +Source0: http://ceph.com/download/ceph-12.1.2.tar.bz2 %if 0%{?suse_version} %if 0%{?is_opensuse} ExclusiveArch: x86_64 aarch64 ppc64 ppc64le @@ -102,6 +101,18 @@ BuildRequires: selinux-policy-devel BuildRequires: /usr/share/selinux/devel/policyhelp %endif +%if 0%{with make_check} +%if 0%{?fedora} || 0%{?rhel} +BuildRequires: python-cherrypy +BuildRequires: python-werkzeug +%endif +%if 0%{?suse_version} +BuildRequires: python-CherryPy +BuildRequires: python-Werkzeug +%endif +BuildRequires: python-pecan +BuildRequires: socat +%endif BuildRequires: bc BuildRequires: gperf BuildRequires: cmake @@ -127,11 +138,8 @@ BuildRequires: python BuildRequires: python-devel BuildRequires: python-nose -BuildRequires: python-pecan BuildRequires: python-requests BuildRequires: python-virtualenv -BuildRequires: python-werkzeug -BuildRequires: socat BuildRequires: snappy-devel BuildRequires: udev BuildRequires: util-linux @@ -159,7 +167,6 @@ BuildRequires: libopenssl-devel BuildRequires: lsb-release BuildRequires: openldap2-devel -BuildRequires: python-CherryPy BuildRequires: python-Cython BuildRequires: python-PrettyTable BuildRequires: python-Sphinx @@ -176,7 +183,6 @@ BuildRequires: openssl-devel BuildRequires: redhat-lsb-core BuildRequires: Cython -BuildRequires: python-cherrypy BuildRequires: python-prettytable BuildRequires: python-sphinx %endif @@ -211,10 +217,6 @@ %if 0%{?fedora} || 0%{?rhel} BuildRequires: redhat-rpm-config %endif -# Accelio IB/RDMA -%if 0%{with xio} -BuildRequires: libxio-devel -%endif %description Ceph is a massively scalable, open-source, distributed storage system that runs @@ -251,9 +253,6 @@ %if 0%{?suse_version} Recommends: ntp-daemon %endif -%if 0%{with xio} -Requires: libxio -%endif %description base Base is 
the package that includes all the files shared amongst ceph servers @@ -280,9 +279,6 @@ %if 0%{?suse_version} Requires(pre): pwdutils %endif -%if 0%{with xio} -Requires: libxio -%endif %description -n ceph-common Common utilities to mount and interact with a ceph storage cluster. Comprised of files that are common to Ceph clients and servers. @@ -682,6 +678,7 @@ Requires: ceph-common Requires: xmlstarlet Requires: jq +Requires: socat %description -n ceph-test This package contains Ceph benchmarks and test tools. %endif @@ -775,7 +772,7 @@ # common ################################################################################# %prep -%autosetup -p1 -n ceph-12.1.1 +%autosetup -p1 -n ceph-12.1.2 %build %if 0%{with cephfs_java} @@ -817,6 +814,7 @@ -DCMAKE_INSTALL_SYSCONFDIR=%{_sysconfdir} \ -DCMAKE_INSTALL_MANDIR=%{_mandir} \ -DCMAKE_INSTALL_DOCDIR=%{_docdir}/ceph \ + -DCMAKE_INSTALL_INCLUDEDIR=%{_includedir} \ -DWITH_EMBEDDED=OFF \ -DWITH_MANPAGE=ON \ -DWITH_PYTHON3=ON \ @@ -824,9 +822,6 @@ %if 0%{?rhel} && ! 0%{?centos} -DWITH_SUBMAN=ON \ %endif -%if 0%{with xio} - -DWITH_XIO=ON \ -%endif %if 0%{without ceph_test_package} -DWITH_TESTS=OFF \ %endif @@ -838,10 +833,10 @@ %endif %if %{with lttng} -DWITH_LTTNG=ON \ - -DWTIH_BABELTRACE=ON \ + -DWITH_BABELTRACE=ON \ %else -DWITH_LTTNG=OFF \ - -DWTIH_BABELTRACE=OFF \ + -DWITH_BABELTRACE=OFF \ %endif $CEPH_EXTRA_CMAKE_ARGS \ %if 0%{with ocf} diff -Nru ceph-12.1.1/ceph.spec.in ceph-12.1.2/ceph.spec.in --- ceph-12.1.1/ceph.spec.in 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/ceph.spec.in 2017-08-01 17:55:40.000000000 +0000 @@ -22,7 +22,6 @@ %bcond_without ceph_test_package %endif %bcond_with make_check -%bcond_with xio %ifarch s390 s390x %bcond_with tcmalloc %else @@ -102,6 +101,18 @@ BuildRequires: selinux-policy-devel BuildRequires: /usr/share/selinux/devel/policyhelp %endif +%if 0%{with make_check} +%if 0%{?fedora} || 0%{?rhel} +BuildRequires: python-cherrypy +BuildRequires: python-werkzeug +%endif +%if 0%{?suse_version} +BuildRequires: python-CherryPy +BuildRequires: python-Werkzeug +%endif +BuildRequires: python-pecan +BuildRequires: socat +%endif BuildRequires: bc BuildRequires: gperf BuildRequires: cmake @@ -127,11 +138,8 @@ BuildRequires: python BuildRequires: python-devel BuildRequires: python-nose -BuildRequires: python-pecan BuildRequires: python-requests BuildRequires: python-virtualenv -BuildRequires: python-werkzeug -BuildRequires: socat BuildRequires: snappy-devel BuildRequires: udev BuildRequires: util-linux @@ -159,7 +167,6 @@ BuildRequires: libopenssl-devel BuildRequires: lsb-release BuildRequires: openldap2-devel -BuildRequires: python-CherryPy BuildRequires: python-Cython BuildRequires: python-PrettyTable BuildRequires: python-Sphinx @@ -176,7 +183,6 @@ BuildRequires: openssl-devel BuildRequires: redhat-lsb-core BuildRequires: Cython -BuildRequires: python-cherrypy BuildRequires: python-prettytable BuildRequires: python-sphinx %endif @@ -211,10 +217,6 @@ %if 0%{?fedora} || 0%{?rhel} BuildRequires: redhat-rpm-config %endif -# Accelio IB/RDMA -%if 0%{with xio} -BuildRequires: libxio-devel -%endif %description Ceph is a massively scalable, open-source, distributed storage system that runs @@ -251,9 +253,6 @@ %if 0%{?suse_version} Recommends: ntp-daemon %endif -%if 0%{with xio} -Requires: libxio -%endif %description base Base is the package that includes all the files shared amongst ceph servers @@ -280,9 +279,6 @@ %if 0%{?suse_version} Requires(pre): pwdutils %endif -%if 0%{with xio} -Requires: libxio -%endif %description -n 
ceph-common Common utilities to mount and interact with a ceph storage cluster. Comprised of files that are common to Ceph clients and servers. @@ -682,6 +678,7 @@ Requires: ceph-common Requires: xmlstarlet Requires: jq +Requires: socat %description -n ceph-test This package contains Ceph benchmarks and test tools. %endif @@ -817,6 +814,7 @@ -DCMAKE_INSTALL_SYSCONFDIR=%{_sysconfdir} \ -DCMAKE_INSTALL_MANDIR=%{_mandir} \ -DCMAKE_INSTALL_DOCDIR=%{_docdir}/ceph \ + -DCMAKE_INSTALL_INCLUDEDIR=%{_includedir} \ -DWITH_EMBEDDED=OFF \ -DWITH_MANPAGE=ON \ -DWITH_PYTHON3=ON \ @@ -824,9 +822,6 @@ %if 0%{?rhel} && ! 0%{?centos} -DWITH_SUBMAN=ON \ %endif -%if 0%{with xio} - -DWITH_XIO=ON \ -%endif %if 0%{without ceph_test_package} -DWITH_TESTS=OFF \ %endif @@ -838,10 +833,10 @@ %endif %if %{with lttng} -DWITH_LTTNG=ON \ - -DWTIH_BABELTRACE=ON \ + -DWITH_BABELTRACE=ON \ %else -DWITH_LTTNG=OFF \ - -DWTIH_BABELTRACE=OFF \ + -DWITH_BABELTRACE=OFF \ %endif $CEPH_EXTRA_CMAKE_ARGS \ %if 0%{with ocf} diff -Nru ceph-12.1.1/CMakeLists.txt ceph-12.1.2/CMakeLists.txt --- ceph-12.1.1/CMakeLists.txt 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/CMakeLists.txt 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 2.8.11) project(ceph) -set(VERSION 12.1.1) +set(VERSION 12.1.2) if(POLICY CMP0046) # Tweak policies (this one disables "missing" dependency warning) diff -Nru ceph-12.1.1/debian/ceph-base.install ceph-12.1.2/debian/ceph-base.install --- ceph-12.1.1/debian/ceph-base.install 2017-07-05 10:44:23.000000000 +0000 +++ ceph-12.1.2/debian/ceph-base.install 2017-08-07 11:21:07.000000000 +0000 @@ -5,8 +5,8 @@ usr/bin/crushtool usr/bin/monmaptool usr/bin/osdmaptool -usr/lib/*/ceph/erasure-code/libec_*.so -usr/lib/*/rados-classes/*.so +usr/lib/*/ceph/erasure-code/* +usr/lib/*/rados-classes/* usr/lib/ceph/ceph_common.sh usr/lib/python*/dist-packages/ceph_detect_init* usr/sbin/ceph-create-keys diff -Nru ceph-12.1.1/debian/changelog ceph-12.1.2/debian/changelog --- ceph-12.1.1/debian/changelog 2017-07-26 09:43:57.000000000 +0000 +++ ceph-12.1.2/debian/changelog 2017-08-09 15:49:34.000000000 +0000 @@ -1,3 +1,20 @@ +ceph (12.1.2-0ubuntu2) artful; urgency=medium + + * d/p/rocksdb-fallthrough-i386.patch: Mark intentional fallthroughs + for compatibility with gcc-7. + * d/p/32bit-compat-service-daemon.patch: Fix implicit type conversion + for Boost variant types on 32 bit architectures (LP: #1709396). + + -- James Page Wed, 09 Aug 2017 16:43:46 +0100 + +ceph (12.1.2-0ubuntu1) artful; urgency=medium + + * New release candidate for next stable release. + * d/ceph-base.install: Increase scope of install wildcards to ensure + that all ceph modules are included in the binary package. + + -- James Page Mon, 07 Aug 2017 14:23:59 +0100 + ceph (12.1.1-0ubuntu1) artful; urgency=medium * New release candidate for next stable release. diff -Nru ceph-12.1.1/debian/patches/32bit-compat-service-daemon.patch ceph-12.1.2/debian/patches/32bit-compat-service-daemon.patch --- ceph-12.1.1/debian/patches/32bit-compat-service-daemon.patch 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/debian/patches/32bit-compat-service-daemon.patch 2017-08-09 08:59:11.000000000 +0000 @@ -0,0 +1,33 @@ +Description: Resolve build failures on 32bit archs + size_t is an unsigned 32bit integer on 32bit archs which + does not unambiguously convert to a uint64_t or bool as + detailed in the service_daemon Attribute type. 
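The ambiguity described in this patch header is easy to reproduce in isolation; the following standalone sketch assumes an attribute type shaped like ``boost::variant<bool, uint64_t, std::string>`` (an approximation of the service_daemon attribute type, not code taken from the Ceph tree)::

    // Sketch only: on 32-bit targets size_t is a 32-bit unsigned int, so
    // converting it to bool or to uint64_t are equally ranked integral
    // conversions and the variant's converting constructor is ambiguous.
    // Widening explicitly (or using uint64_t counters, as the patch does)
    // removes the ambiguity.
    #include <boost/variant.hpp>
    #include <cstddef>
    #include <cstdint>
    #include <string>

    using AttributeValue = boost::variant<bool, uint64_t, std::string>;  // assumed shape

    int main() {
      std::size_t image_count = 3;
      // AttributeValue broken(image_count);                     // fails to build on i386/armhf
      AttributeValue fixed(static_cast<uint64_t>(image_count));  // builds everywhere
      (void)fixed;
      return 0;
    }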
+Author: James Page +Forwareded: no + +--- a/src/tools/rbd_mirror/InstanceReplayer.cc ++++ b/src/tools/rbd_mirror/InstanceReplayer.cc +@@ -372,9 +372,9 @@ void InstanceReplayer::start_image_re + return; + } + +- size_t image_count = 0; +- size_t warning_count = 0; +- size_t error_count = 0; ++ uint64_t image_count = 0; ++ uint64_t warning_count = 0; ++ uint64_t error_count = 0; + for (auto &it : m_image_replayers) { + ++image_count; + auto health_state = it.second->get_health_state(); +--- a/src/tools/rbd_mirror/PoolWatcher.h ++++ b/src/tools/rbd_mirror/PoolWatcher.h +@@ -52,7 +52,7 @@ public: + void init(Context *on_finish = nullptr); + void shut_down(Context *on_finish); + +- inline size_t get_image_count() const { ++ inline uint64_t get_image_count() const { + Mutex::Locker locker(m_lock); + return m_image_ids.size(); + } diff -Nru ceph-12.1.1/debian/patches/rocksdb-fallthrough-i386.patch ceph-12.1.2/debian/patches/rocksdb-fallthrough-i386.patch --- ceph-12.1.1/debian/patches/rocksdb-fallthrough-i386.patch 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/debian/patches/rocksdb-fallthrough-i386.patch 2017-08-08 11:33:58.000000000 +0000 @@ -0,0 +1,17 @@ +Description: Mark intention fallthroughs for i386 codepaths +Author: James Page +Forwarded: https://github.com/facebook/rocksdb/pull/2700 + +--- a/src/rocksdb/util/murmurhash.cc ++++ b/src/rocksdb/util/murmurhash.cc +@@ -113,8 +113,8 @@ unsigned int MurmurHash2 ( const void * + + switch(len) + { +- case 3: h ^= data[2] << 16; +- case 2: h ^= data[1] << 8; ++ case 3: h ^= data[2] << 16; // fallthrough ++ case 2: h ^= data[1] << 8; // fallthrough + case 1: h ^= data[0]; + h *= m; + }; diff -Nru ceph-12.1.1/debian/patches/series ceph-12.1.2/debian/patches/series --- ceph-12.1.1/debian/patches/series 2017-07-25 16:30:07.000000000 +0000 +++ ceph-12.1.2/debian/patches/series 2017-08-09 08:45:36.000000000 +0000 @@ -2,3 +2,6 @@ 0001-CoreLocalArray-class.patch 0002-core-local-array-type-conversions.patch 0003-Core-local-statistics.patch +# Ubuntu: rocksdb/i386/gcc-7 compat +rocksdb-fallthrough-i386.patch +32bit-compat-service-daemon.patch diff -Nru ceph-12.1.1/doc/cephfs/administration.rst ceph-12.1.2/doc/cephfs/administration.rst --- ceph-12.1.1/doc/cephfs/administration.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/cephfs/administration.rst 2017-08-01 17:55:40.000000000 +0000 @@ -99,7 +99,15 @@ Deactivate an MDS, causing it to flush its entire journal to backing RADOS objects and close all open client sessions. Deactivating an MDS is primarily intended for bringing down a rank after reducing the number of -active MDS (max_mds). +active MDS (max_mds). Once the rank is deactivated, the MDS daemon will rejoin the +cluster as a standby. +```` can take one of three forms: + +:: + + : + : + Use ``mds deactivate`` in conjunction with adjustments to ``max_mds`` to shrink an MDS cluster. See :doc:`/cephfs/multimds` diff -Nru ceph-12.1.1/doc/cephfs/disaster-recovery.rst ceph-12.1.2/doc/cephfs/disaster-recovery.rst --- ceph-12.1.1/doc/cephfs/disaster-recovery.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/cephfs/disaster-recovery.rst 2017-08-01 17:55:40.000000000 +0000 @@ -130,18 +130,20 @@ Finally, you can regenerate metadata objects for missing files and directories based on the contents of a data pool. This is -a two-phase process. First, scanning *all* objects to calculate +a three-phase process. First, scanning *all* objects to calculate size and mtime metadata for inodes. 
Second, scanning the first -object from every file to collect this metadata and inject -it into the metadata pool. +object from every file to collect this metadata and inject it into +the metadata pool. Third, checking inode linkages and fixing found +errors. :: cephfs-data-scan scan_extents cephfs-data-scan scan_inodes + cephfs-data-scan scan_links -This command may take a *very long* time if there are many -files or very large files in the data pool. +'scan_extents' and 'scan_inodes' commands may take a *very long* time +if there are many files or very large files in the data pool. To accelerate the process, run multiple instances of the tool. @@ -246,7 +248,7 @@ ceph osd pool create recovery replicated ceph fs new recovery-fs recovery --allow-dangerous-metadata-overlay cephfs-data-scan init --force-init --filesystem recovery-fs --alternate-pool recovery - ceph fs reset recovery-fs --yes-i-realy-mean-it + ceph fs reset recovery-fs --yes-i-really-mean-it cephfs-table-tool recovery-fs:all reset session cephfs-table-tool recovery-fs:all reset snap cephfs-table-tool recovery-fs:all reset inode @@ -256,8 +258,9 @@ :: - cephfs-data-scan scan_extents --alternate-pool recovery --filesystem + cephfs-data-scan scan_extents --alternate-pool recovery --filesystem cephfs-data-scan scan_inodes --alternate-pool recovery --filesystem --force-corrupt --force-init + cephfs-data-scan scan_links --filesystem recovery-fs If the damaged filesystem contains dirty journal data, it may be recovered next with: @@ -267,10 +270,10 @@ cephfs-journal-tool --rank=:0 event recover_dentries list --alternate-pool recovery cephfs-journal-tool --rank recovery-fs:0 journal reset --force -After recovery, some recovered directories will have incorrect link counts. -Ensure the parameter mds_debug_scatterstat is set to false (the default) to -prevent the MDS from checking the link counts, then run a forward scrub to -repair them. Ensure you have an MDS running and issue: +After recovery, some recovered directories will have incorrect statistics. +Ensure the parameters mds_verify_scatter and mds_debug_scatterstat are set +to false (the default) to prevent the MDS from checking the statistics, then +run a forward scrub to repair them. Ensure you have an MDS running and issue: :: diff -Nru ceph-12.1.1/doc/cephfs/experimental-features.rst ceph-12.1.2/doc/cephfs/experimental-features.rst --- ceph-12.1.1/doc/cephfs/experimental-features.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/cephfs/experimental-features.rst 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ CephFS includes a number of experimental features which are not fully stabilized or qualified for users to turn on in real deployments. We generally do our best -to clearly demarcate these and fence them off so they can't be used by mistake. +to clearly demarcate these and fence them off so they cannot be used by mistake. Some of these features are closer to being done than others, though. We describe each of them with an approximation of how risky they are and briefly describe diff -Nru ceph-12.1.1/doc/cephfs/journaler.rst ceph-12.1.2/doc/cephfs/journaler.rst --- ceph-12.1.1/doc/cephfs/journaler.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/cephfs/journaler.rst 2017-08-01 17:55:40.000000000 +0000 @@ -35,7 +35,7 @@ ``journaler batch max`` -:Description: Maximum bytes we'll delay flushing. +:Description: Maximum bytes we will delay flushing. 
:Type: 64-bit Unsigned Integer :Required: No :Default: ``0`` diff -Nru ceph-12.1.1/doc/cephfs/mantle.rst ceph-12.1.2/doc/cephfs/mantle.rst --- ceph-12.1.1/doc/cephfs/mantle.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/cephfs/mantle.rst 2017-08-01 17:55:40.000000000 +0000 @@ -35,7 +35,7 @@ Most of the time this guide will work but sometimes all MDSs lock up and you cannot actually see them spill. It is much better to run this on a cluster. -As a pre-requistie, we assume you've installed `mdtest +As a pre-requistie, we assume you have installed `mdtest `_ or pulled the `Docker image `_. We use mdtest because we need to generate enough load to get over the MIN_OFFLOAD threshold that is @@ -106,7 +106,7 @@ done -6. When you're done, you can kill all the clients with: +6. When you are done, you can kill all the clients with: :: @@ -197,7 +197,7 @@ in the MDBalancer. We do not want the error propagating up the call chain. The cls_lua class wants to handle the error itself because it must fail gracefully. For Mantle, we don't care if a Lua error crashes our balancer -- in that case, -we'll fall back to the original balancer. +we will fall back to the original balancer. The performance improvement of using `lua_call` over `lua_pcall` would not be leveraged here because the balancer is invoked every 10 seconds by default. diff -Nru ceph-12.1.1/doc/cephfs/multimds.rst ceph-12.1.2/doc/cephfs/multimds.rst --- ceph-12.1.1/doc/cephfs/multimds.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/cephfs/multimds.rst 2017-08-01 17:55:40.000000000 +0000 @@ -81,7 +81,7 @@ we have decreased max_mds, because max_mds only restricts creation of new ranks. -Next, use the ``ceph mds deactivate `` command to remove the +Next, use the ``ceph mds deactivate `` command to remove the unneeded rank: :: @@ -93,6 +93,9 @@ # fsmap e12: 1/1/1 up {0=a=up:active}, 1 up:standby # fsmap e13: 1/1/1 up {0=a=up:active}, 2 up:standby +See :doc:`/cephfs/administration` for more details which forms ```` can +take. + The deactivated rank will first enter the stopping state for a period of time while it hands off its share of the metadata to the remaining active daemons. This phase can take from seconds to minutes. If the diff -Nru ceph-12.1.1/doc/cephfs/troubleshooting.rst ceph-12.1.2/doc/cephfs/troubleshooting.rst --- ceph-12.1.1/doc/cephfs/troubleshooting.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/cephfs/troubleshooting.rst 2017-08-01 17:55:40.000000000 +0000 @@ -13,7 +13,7 @@ RADOS Health ============ -If part of the CephFS metadata or data pools is unavaible and CephFS isn't +If part of the CephFS metadata or data pools is unavaible and CephFS is not responding, it is probably because RADOS itself is unhealthy. Resolve those problems first (:doc:`../../rados/troubleshooting/index`). @@ -47,15 +47,15 @@ the operation off to the MDS log. If it is waiting on the OSDs, fix them. If operations are stuck on a specific inode, you probably have a client holding caps which prevent others from using it, either because the client is trying -to flush out dirty data or because you've encountered a bug in CephFS' +to flush out dirty data or because you have encountered a bug in CephFS' distributed file lock code (the file "capabilities" ["caps"] system). If it's a result of a bug in the capabilities code, restarting the MDS is likely to resolve the problem. 
-If there are no slow requests reported on the MDS, and it isn't reporting +If there are no slow requests reported on the MDS, and it is not reporting that clients are misbehaving, either the client has a problem or its -requests aren't reaching the MDS. +requests are not reaching the MDS. ceph-fuse debugging =================== @@ -101,7 +101,7 @@ * osdc: Dumps the current ops in-flight to OSDs (ie, file data IO) * osdmap: Dumps the current OSDMap epoch, pools, and OSDs -If there are no stuck requests but you have file IO which isn't progressing, +If there are no stuck requests but you have file IO which is not progressing, you might have a... Disconnected+Remounted FS @@ -109,7 +109,7 @@ Because CephFS has a "consistent cache", if your network connection is disrupted for a long enough time, the client will be forcibly disconnected from the system. At this point, the kernel client is in -a bind: it can't safely write back dirty data, and many applications +a bind: it cannot safely write back dirty data, and many applications do not handle IO errors correctly on close(). At the moment, the kernel client will remount the FS, but outstanding filesystem IO may or may not be satisfied. In these cases, you may need to reboot your diff -Nru ceph-12.1.1/doc/dev/ceph-disk.rst ceph-12.1.2/doc/dev/ceph-disk.rst --- ceph-12.1.1/doc/dev/ceph-disk.rst 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/doc/dev/ceph-disk.rst 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,61 @@ +========= +ceph-disk +========= + + +device-mapper crypt +=================== + +Settings +-------- + +``osd_dmcrypt_type`` + +:Description: this option specifies the mode in which ``cryptsetup`` works. It can be ``luks`` or ``plain``. It kicks in only if the ``--dmcrypt`` option is passed to ``ceph-disk``. See also `cryptsetup document `_ for more details. + +:Type: String +:Default: ``luks`` + + +``osd_dmcrypt_key_size`` + +:Description: the size of the random string in bytes used as the LUKS key. The string is read from ``/dev/urandom`` and then encoded using base64. It will be stored with the key of ``dm-crypt/osd/$uuid/luks`` using config-key. + +:Type: String +:Default: 1024 if ``osd_dmcrypt_type`` is ``luks``, 256 otherwise. + +lockbox +------- + +``ceph-disk`` supports dmcrypt (device-mapper crypt). If dmcrypt is enabled, the partitions will be encrypted using this machinary. For each OSD device, a lockbox is introduced for holding the information regarding how the dmcrypt key is stored. To prepare a lockbox, ``ceph-disk`` + +#. creates a dedicated lockbox partition on device, and +#. populates it with a tiny filesystem, then +#. automounts it at ``/var/lib/ceph/osd-lockbox/$uuid``, read-only. where the ``uuid`` is the lockbox's uuid. + +under which, settings are stored using plain files: + +- key-management-mode: ``ceph-mon v1`` +- osd-uuid: the OSD's uuid +- ceph_fsid: the fsid of the cluster +- keyring: the lockbox's allowing one to fetch the LUKS key +- block_uuid: the partition uuid for the block device +- journal_uuid: the partition uuid for the journal device +- block.db_uuid: the partition uuid for the block.db device +- block.wal_uuid: the partition uuid for the block.wal device +- magic: a magic string indicating that this partition is a lockbox. It's not used currently. +- ``${space_uuid}``: symbolic links named after the uuid of space partitions pointing to ``/var/lib/ceph/osd-lockbox/$uuid``. 
in the case of FileStore, the space partitions are ``data`` and ``journal`` partitions, for BlueStore, they are ``data``, ``block.db`` and ``block.wal``. + +Currently, ``ceph-mon v1`` is the only supported key-management-mode. In that case, the LUKS key is stored using the config-key in the monitor store with the key of ``dm-crypt/osd/$uuid/luks``. + + +partitions +========== + +``ceph-disk`` creates partitions for preparing a device for OSD deployment. Their partition numbers are hardcoded. For instance, data partition's partition number is always *1* : + +1. data partition +2. journal partition, if co-located with data +3. block.db for BlueStore, if co-located with data +4. block.wal for BlueStore, if co-located with data +5. lockbox diff -Nru ceph-12.1.1/doc/dev/config.rst ceph-12.1.2/doc/dev/config.rst --- ceph-12.1.1/doc/dev/config.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/dev/config.rst 2017-08-01 17:55:40.000000000 +0000 @@ -57,7 +57,7 @@ There are two ways for Ceph code to get configuration values. One way is to read it directly from a variable named "g_conf," or equivalently, "g_ceph_ctx->_conf." The other is to register an observer that will be called -every time the relevant configuration values changes. This observer will be +every time the relevant configuration values changes. This observer will be called soon after the initial configuration is read, and every time after that when one of the relevant values changes. Each observer tracks a set of keys and is invoked only when one of the relevant keys changes. @@ -88,3 +88,70 @@ Injectargs, parse_argv, and parse_env are three other functions which modify the configuration. Just like with set_val, you should call apply_changes after calling these functions to make sure your changes get applied. + + +Defining config options +======================= + +New-style config options are defined in common/options.cc. All new config +options should go here (and not into legacy_config_opts.h). + +Levels +------ + +The Option constructor takes a "level" value: + +* *LEVEL_BASIC* is for basic config options that a normal operator is likely to adjust. +* *LEVEL_ADVANCED* is for options that an operator *can* adjust, but should not touch unless they understand what they are doing. Adjusting advanced options poorly can lead to problems (performance or even data loss) if done incorrectly. +* *LEVEL_DEV* is for options in place for use by developers only, either for testing purposes, or to describe constants that no user should adjust but we prefer not to compile into the code. + +Description and long description +-------------------------------- + +Short description of the option. Sentence fragment. e.g.:: + + .set_description("Default checksum algorithm to use") + +The long description is complete sentences, perhaps even multiple +paragraphs, and may include other detailed information or notes.:: + + .set_long_description("crc32c, xxhash32, and xxhash64 are available. The _16 and _8 variants use only a subset of the bits for more compact (but less reliable) checksumming.") + +Default values +-------------- + +There is a default value for every config option. In some cases, there may +also be a *daemon default* that only applies to code that declares itself +as a daemon (in thise case, the regular default only applies to non-daemons). + +Safety +------ + +If an option can be safely changed at runtime:: + + .set_safe() + +Service +------- + +Service is a component name, like "common", "osd", "rgw", "mds", etc. 
It may +be a list of components, like:: + + .add_service("mon mds osd mgr") + +For example, the rocksdb options affect both the osd and mon. + +Tags +---- + +Tags identify options across services that relate in some way. Example include; + + - network -- options affecting network configuration + - mkfs -- options that only matter at mkfs time + +Enums +----- + +For options with a defined set of allowed values:: + + .set_enum_allowed({"none", "crc32c", "crc32c_16", "crc32c_8", "xxhash32", "xxhash64"}) diff -Nru ceph-12.1.1/doc/dev/development-workflow.rst ceph-12.1.2/doc/dev/development-workflow.rst --- ceph-12.1.1/doc/dev/development-workflow.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/dev/development-workflow.rst 2017-08-01 17:55:40.000000000 +0000 @@ -97,12 +97,9 @@ assign them a priority * The bugs with higher priority are worked on first -Each ``team`` is responsible for a project: +Each ``team`` is responsible for a project, managed by leads_. -* rgw lead is Yehuda Sadeh -* CephFS lead is John Spray -* rados lead is Samuel Just -* rbd lead is Jason Dillaman +.. _leads: index#Leads The ``developer`` assigned to an issue is responsible for it. The status of an open issue can be: diff -Nru ceph-12.1.1/doc/dev/documenting.rst ceph-12.1.2/doc/dev/documenting.rst --- ceph-12.1.1/doc/dev/documenting.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/dev/documenting.rst 2017-08-01 17:55:40.000000000 +0000 @@ -2,6 +2,28 @@ Documenting Ceph ================== +User documentation +================== + +The documentation on docs.ceph.com is generated from the restructuredText +sources in ``/doc/`` in the Ceph git repository. + +Please make sure that your changes are written in a way that is intended +for end users of the software, unless you are making additions in +``/doc/dev/``, which is the section for developers. + +All pull requests that modify user-facing functionality must +include corresponding updates to documentation: see +`Submitting Patches`_ for more detail. + +Check your .rst syntax is working as expected by using the "View" +button in the github user interface when looking at a diff on +an .rst file, or build the docs locally using the ``admin/build-doc`` +script. + +For more information about the Ceph documentation, see +:doc:`/start/documenting-ceph`. + Code Documentation ================== @@ -48,7 +70,7 @@ digraph "example" { foo -> bar; bar -> baz; - bar -> thud; + bar -> th } Most of the time, you'll want to put the actual DOT source in a @@ -106,3 +128,5 @@ SVG diagrams using Inkscape. HTML5 will support SVG inline. + +.. _Submitting Patches: /SubmittingPatches.rst diff -Nru ceph-12.1.1/doc/dev/index.rst ceph-12.1.2/doc/dev/index.rst --- ceph-12.1.1/doc/dev/index.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/dev/index.rst 2017-08-01 17:55:40.000000000 +0000 @@ -49,16 +49,16 @@ .. 
_github: https://github.com/ -========= =============== ============= -Scope Lead GitHub nick -========= =============== ============= -Ceph Sage Weil liewegas -RADOS Samuel Just athanatos -RGW Yehuda Sadeh yehudasa -RBD Jason Dillaman dillaman -CephFS John Spray jcsp -Build/Ops Ken Dreyer ktdreyer -========= =============== ============= +========= ================ ============= +Scope Lead GitHub nick +========= ================ ============= +Ceph Sage Weil liewegas +RADOS Samuel Just athanatos +RGW Yehuda Sadeh yehudasa +RBD Jason Dillaman dillaman +CephFS Patrick Donnelly batrick +Build/Ops Ken Dreyer ktdreyer +========= ================ ============= The Ceph-specific acronyms in the table are explained in :doc:`/architecture`. diff -Nru ceph-12.1.1/doc/dev/perf_counters.rst ceph-12.1.2/doc/dev/perf_counters.rst --- ceph-12.1.1/doc/dev/perf_counters.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/dev/perf_counters.rst 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ The perf counters provide generic internal infrastructure for gauges and counters. The counted values can be both integer and float. There is also an "average" type (normally float) that combines a sum and num counter which can be divided to provide an average. -The intention is that this data will be collected and aggregated by a tool like ``collectd`` or ``statsd`` and fed into a tool like ``graphite`` for graphing and analysis. +The intention is that this data will be collected and aggregated by a tool like ``collectd`` or ``statsd`` and fed into a tool like ``graphite`` for graphing and analysis. Also, note the :doc:`../mgr/prometheus`. Access ------ diff -Nru ceph-12.1.1/doc/dev/quick_guide.rst ceph-12.1.2/doc/dev/quick_guide.rst --- ceph-12.1.1/doc/dev/quick_guide.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/dev/quick_guide.rst 2017-08-01 17:55:40.000000000 +0000 @@ -34,35 +34,32 @@ $ MON=1 MDS=1 ../src/vstart.sh -d -n -x -The system creates three pools on startup: `cephfs_data`, `cephfs_metadata`, and `rbd`. Let's get some stats on +The system creates two pools on startup: `cephfs_data_a` and `cephfs_metadata_a`. Let's get some stats on the current pools: .. 
code:: $ bin/ceph osd pool stats *** DEVELOPER MODE: setting PATH, PYTHONPATH and LD_LIBRARY_PATH *** - pool rbd id 0 - nothing is going on - - pool cephfs_data id 1 + pool cephfs_data_a id 1 nothing is going on - pool cephfs_metadata id 2 + pool cephfs_metadata_a id 2 nothing is going on - $ bin/ceph osd pool stats cephfs_data + $ bin/ceph osd pool stats cephfs_data_a *** DEVELOPER MODE: setting PATH, PYTHONPATH and LD_LIBRARY_PATH *** - pool cephfs_data id 1 + pool cephfs_data_a id 1 nothing is going on - $ ./rados df - pool name category KB objects clones degraded unfound rd rd KB wr wr KB - rbd - 0 0 0 0 0 0 0 0 0 - cephfs_data - 0 0 0 0 0 0 0 0 0 - cephfs_metadata - 2 20 0 40 0 0 0 21 8 - total used 12771536 20 - total avail 3697045460 - total space 3709816996 + $ bin/rados df + POOL_NAME USED OBJECTS CLONES COPIES MISSING_ON_PRIMARY UNFOUND DEGRADED RD_OPS RD WR_OPS WR + cephfs_data_a 0 0 0 0 0 0 0 0 0 0 0 + cephfs_metadata_a 2246 21 0 63 0 0 0 0 0 42 8192 + + total_objects 21 + total_used 244G + total_space 1180G Make a pool and run some benchmarks against it: diff -Nru ceph-12.1.1/doc/install/install-ceph-gateway.rst ceph-12.1.2/doc/install/install-ceph-gateway.rst --- ceph-12.1.1/doc/install/install-ceph-gateway.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/install/install-ceph-gateway.rst 2017-08-01 17:55:40.000000000 +0000 @@ -169,8 +169,8 @@ Migrating from Apache to Civetweb --------------------------------- -If you're running the Ceph Object Gateway on Apache and FastCGI with Ceph -Storage v0.80 or above, you're already running Civetweb--it starts with the +If you are running the Ceph Object Gateway on Apache and FastCGI with Ceph +Storage v0.80 or above, you are already running Civetweb--it starts with the ``ceph-radosgw`` daemon and it's running on port 7480 by default so that it doesn't conflict with your Apache and FastCGI installation and other commonly used web service ports. Migrating to use Civetweb basically involves removing @@ -277,7 +277,7 @@ radosgw-admin region set < region.json -Once you've updated your region, update the region map. For example:: +Once you have updated your region, update the region map. For example:: radosgw-admin regionmap update --name client.rgw.ceph-client diff -Nru ceph-12.1.1/doc/install/manual-deployment.rst ceph-12.1.2/doc/install/manual-deployment.rst --- ceph-12.1.1/doc/install/manual-deployment.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/install/manual-deployment.rst 2017-08-01 17:55:40.000000000 +0000 @@ -162,7 +162,7 @@ #. Generate an administrator keyring, generate a ``client.admin`` user and add the user to the keyring. :: - sudo ceph-authtool --create-keyring /etc/ceph/ceph.client.admin.keyring --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow' + sudo ceph-authtool --create-keyring /etc/ceph/ceph.client.admin.keyring --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow' --cap mgr 'allow *' #. Add the ``client.admin`` key to the ``ceph.mon.keyring``. :: @@ -348,116 +348,71 @@ Without the benefit of any helper utilities, create an OSD and add it to the cluster and CRUSH map with the following procedure. To create the first two -OSDs with the long form procedure, execute the following on ``node2`` and -``node3``: +OSDs with the long form procedure, execute the following steps for each OSD. -#. Connect to the OSD host. :: - - ssh {node-name} - -#. Generate a UUID for the OSD. :: - - uuidgen +.. 
note:: This procedure does not describe deployment on top of dm-crypt + making use of the dm-crypt 'lockbox'. +#. Connect to the OSD host and become root. :: -#. Create the OSD. If no UUID is given, it will be set automatically when the - OSD starts up. The following command will output the OSD number, which you - will need for subsequent steps. :: - - ceph osd create [{uuid} [{id}]] + ssh {node-name} + sudo bash +#. Generate a UUID for the OSD. :: -#. Create the default directory on your new OSD. :: + UUID=$(uuidgen) - ssh {new-osd-host} - sudo mkdir /var/lib/ceph/osd/{cluster-name}-{osd-number} - - -#. If the OSD is for a drive other than the OS drive, prepare it - for use with Ceph, and mount it to the directory you just created:: +#. Generate a cephx key for the OSD. :: - ssh {new-osd-host} - sudo mkfs -t {fstype} /dev/{hdd} - sudo mount -o user_xattr /dev/{hdd} /var/lib/ceph/osd/{cluster-name}-{osd-number} + OSD_SECRET=$(ceph-authtool --gen-print-key) +#. Create the OSD. Note that an OSD ID can be provided as an + additional argument to ``ceph osd new`` if you need to reuse a + previously-destroyed OSD id. We assume that the + ``client.bootstrap-osd`` key is present on the machine. You may + alternatively execute this command as ``client.admin`` on a + different host where that key is present.:: -#. Initialize the OSD data directory. :: - - ssh {new-osd-host} - sudo ceph-osd -i {osd-num} --mkfs --mkkey --osd-uuid [{uuid}] - - The directory must be empty before you can run ``ceph-osd`` with the - ``--mkkey`` option. In addition, the ceph-osd tool requires specification - of custom cluster names with the ``--cluster`` option. - - -#. Register the OSD authentication key. The value of ``ceph`` for - ``ceph-{osd-num}`` in the path is the ``$cluster-$id``. If your - cluster name differs from ``ceph``, use your cluster name instead.:: - - sudo ceph auth add osd.{osd-num} osd 'allow *' mon 'allow profile osd' -i /var/lib/ceph/osd/{cluster-name}-{osd-num}/keyring - - -#. Add your Ceph Node to the CRUSH map. :: - - ceph [--cluster {cluster-name}] osd crush add-bucket {hostname} host - - For example:: + ID=$(echo "{\"cephx_secret\": \"$OSD_SECRET\"}" | \ + ceph osd new $UUID -i - \ + -n client.bootstrap-osd -k /var/lib/ceph/bootstrap-osd/ceph.keyring) - ceph osd crush add-bucket node1 host +#. Create the default directory on your new OSD. :: + mkdir /var/lib/ceph/osd/ceph-$ID -#. Place the Ceph Node under the root ``default``. :: +#. If the OSD is for a drive other than the OS drive, prepare it + for use with Ceph, and mount it to the directory you just created. :: - ceph osd crush move node1 root=default + mkfs.xfs /dev/{DEV} + mount /dev/{DEV} /var/lib/ceph/osd/ceph-$ID +#. Write the secret to the OSD keyring file. :: -#. Add the OSD to the CRUSH map so that it can begin receiving data. You may - also decompile the CRUSH map, add the OSD to the device list, add the host as a - bucket (if it's not already in the CRUSH map), add the device as an item in the - host, assign it a weight, recompile it and set it. :: + ceph-authtool --create-keyring /var/lib/ceph/osd/ceph-$ID/keyring \ + --name osd.$ID --add-key $OSD_SECRET - ceph [--cluster {cluster-name}] osd crush add {id-or-name} {weight} [{bucket-type}={bucket-name} ...] +#. Initialize the OSD data directory. :: - For example:: + ceph-osd -i $ID --mkfs --osd-uuid $UUID - ceph osd crush add osd.0 1.0 host=node1 +#. Fix ownership. :: + chown -R ceph:ceph /var/lib/ceph/osd/ceph-$ID #. After you add an OSD to Ceph, the OSD is in your configuration. 
However, - it is not yet running. The OSD is ``down`` and ``in``. You must start + it is not yet running. You must start your new OSD before it can begin receiving data. - For Ubuntu, use Upstart:: - - sudo start ceph-osd id={osd-num} [cluster={cluster-name}] + For modern systemd distributions:: + systemctl enable ceph-osd@$ID + systemctl start ceph-osd@$ID + For example:: - sudo start ceph-osd id=0 - sudo start ceph-osd id=1 - - For Debian/CentOS/RHEL, use sysvinit:: - - sudo /etc/init.d/ceph start osd.{osd-num} [--cluster {cluster-name}] - - For example:: - - sudo /etc/init.d/ceph start osd.0 - sudo /etc/init.d/ceph start osd.1 - - In this case, to allow the start of the daemon at each reboot you - must create an empty file like this:: - - sudo touch /var/lib/ceph/osd/{cluster-name}-{osd-num}/sysvinit - - For example:: - - sudo touch /var/lib/ceph/osd/ceph-0/sysvinit - sudo touch /var/lib/ceph/osd/ceph-1/sysvinit - - Once you start your OSD, it is ``up`` and ``in``. - + systemctl enable ceph-osd@12 + systemctl start ceph-osd@12 Adding MDS diff -Nru ceph-12.1.1/doc/install/manual-freebsd-deployment.rst ceph-12.1.2/doc/install/manual-freebsd-deployment.rst --- ceph-12.1.1/doc/install/manual-freebsd-deployment.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/install/manual-freebsd-deployment.rst 2017-08-01 17:55:40.000000000 +0000 @@ -211,7 +211,7 @@ #. Generate an administrator keyring, generate a ``client.admin`` user and add the user to the keyring. :: - sudo ceph-authtool --create-keyring /etc/ceph/ceph.client.admin.keyring --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow' + sudo ceph-authtool --create-keyring /etc/ceph/ceph.client.admin.keyring --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow' --cap mgr 'allow *' #. Add the ``client.admin`` key to the ``ceph.mon.keyring``. :: diff -Nru ceph-12.1.1/doc/man/8/ceph-create-keys.rst ceph-12.1.2/doc/man/8/ceph-create-keys.rst --- ceph-12.1.1/doc/man/8/ceph-create-keys.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/man/8/ceph-create-keys.rst 2017-08-01 17:55:40.000000000 +0000 @@ -30,7 +30,7 @@ To list all users in the cluster:: - ceph auth list + ceph auth ls Options diff -Nru ceph-12.1.1/doc/man/8/ceph-rest-api.rst ceph-12.1.2/doc/man/8/ceph-rest-api.rst --- ceph-12.1.1/doc/man/8/ceph-rest-api.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/man/8/ceph-rest-api.rst 2017-08-01 17:55:40.000000000 +0000 @@ -47,7 +47,7 @@ specifies the client 'name', which is used to find the client-specific configuration options in the config file, and also is the name used for authentication when connecting - to the cluster (the entity name appearing in ceph auth list output, + to the cluster (the entity name appearing in 'ceph auth ls' output, for example). The default is 'client.restapi'. .. 
option:: -i/--id id diff -Nru ceph-12.1.1/doc/man/8/ceph.rst ceph-12.1.2/doc/man/8/ceph.rst --- ceph-12.1.1/doc/man/8/ceph.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/man/8/ceph.rst 2017-08-01 17:55:40.000000000 +0000 @@ -39,7 +39,7 @@ | **ceph** **mon_status** -| **ceph** **osd** [ *blacklist* \| *blocked-by* \| *create* \| *new* \| *deep-scrub* \| *df* \| *down* \| *dump* \| *erasure-code-profile* \| *find* \| *getcrushmap* \| *getmap* \| *getmaxosd* \| *in* \| *lspools* \| *map* \| *metadata* \| *out* \| *pause* \| *perf* \| *pg-temp* \| *primary-affinity* \| *primary-temp* \| *repair* \| *reweight* \| *reweight-by-pg* \| *rm* \| *destroy* \| *purge* \| *scrub* \| *set* \| *setcrushmap* \| *setmaxosd* \| *stat* \| *tree* \| *unpause* \| *unset* ] ... +| **ceph** **osd** [ *blacklist* \| *blocked-by* \| *create* \| *new* \| *deep-scrub* \| *df* \| *down* \| *dump* \| *erasure-code-profile* \| *find* \| *getcrushmap* \| *getmap* \| *getmaxosd* \| *in* \| *lspools* \| *map* \| *metadata* \| *out* \| *pause* \| *perf* \| *pg-temp* \| *force-create-pg* \| *primary-affinity* \| *primary-temp* \| *repair* \| *reweight* \| *reweight-by-pg* \| *rm* \| *destroy* \| *purge* \| *scrub* \| *set* \| *setcrushmap* \| *setmaxosd* \| *stat* \| *tree* \| *unpause* \| *unset* ] ... | **ceph** **osd** **crush** [ *add* \| *add-bucket* \| *create-or-move* \| *dump* \| *get-tunable* \| *link* \| *move* \| *remove* \| *rename-bucket* \| *reweight* \| *reweight-all* \| *reweight-subtree* \| *rm* \| *rule* \| *set* \| *set-tunable* \| *show-tunables* \| *tunables* \| *unlink* ] ... @@ -143,11 +143,11 @@ ceph auth import -Subcommand ``list`` lists authentication state. +Subcommand ``ls`` lists authentication state. Usage:: - ceph auth list + ceph auth ls Subcommand ``print-key`` displays requested key. @@ -199,7 +199,7 @@ Usage:: - ceph config-key list + ceph config-key ls Subcommand ``dump`` dumps configuration keys and values. @@ -207,11 +207,11 @@ ceph config-key dump -Subcommand ``put`` puts configuration key and value. +Subcommand ``set`` puts configuration key and value. Usage:: - ceph config-key put {} + ceph config-key set {} daemon @@ -247,6 +247,22 @@ ceph df {detail} +.. _ceph features: + +features +-------- + +Show the releases and features of all connected daemons and clients connected +to the cluster, along with the numbers of them in each bucket grouped by the +corresponding features/releases. Each release of Ceph supports a different set +of features, expressed by the features bitmask. New cluster features require +that clients support the feature, or else they are not allowed to connect to +these new features. As new features or capabilities are enabled after an +upgrade, older clients are prevented from connecting. + +Usage:: + + ceph features fs -- @@ -442,6 +458,63 @@ ceph mon_status +mgr +--- + +Ceph manager daemon configuration and management. + +Subcommand ``dump`` dumps the latest MgrMap, which describes the active +and standby manager daemons. + +Usage:: + + ceph mgr dump + +Subcommand ``fail`` will mark a manager daemon as failed, removing it +from the manager map. If it is the active manager daemon a standby +will take its place. + +Usage:: + + ceph mgr fail + +Subcommand ``module ls`` will list currently enabled manager modules (plugins). + +Usage:: + + ceph mgr module ls + +Subcommand ``module enable`` will enable a manager module. Available modules are included in MgrMap and visible via ``mgr dump``. 
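The subcommand renames documented here (for example ``ceph auth list`` becoming ``ceph auth ls``) also matter to programs that drive the monitor directly; below is a minimal hedged sketch using the librados C++ ``mon_command`` call, assuming a reachable cluster and a readable ``ceph.conf`` and ``client.admin`` keyring in the default locations::

    // Sketch: issue the monitor command now spelled "ceph auth ls" as JSON.
    // Build with: g++ -std=c++11 auth_ls.cc -lrados   (filename is illustrative)
    #include <rados/librados.hpp>
    #include <iostream>
    #include <string>

    int main() {
      librados::Rados cluster;
      cluster.init2("client.admin", "ceph", 0);  // entity name, cluster name, flags
      cluster.conf_read_file(nullptr);           // default ceph.conf search path
      if (cluster.connect() < 0) {
        std::cerr << "could not connect to cluster" << std::endl;
        return 1;
      }
      librados::bufferlist inbl, outbl;
      std::string outs;
      int r = cluster.mon_command(
          R"({"prefix": "auth ls", "format": "json"})", inbl, &outbl, &outs);
      if (r == 0)
        std::cout << std::string(outbl.c_str(), outbl.length()) << std::endl;
      else
        std::cerr << "mon_command failed: " << outs << std::endl;
      cluster.shutdown();
      return 0;
    }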
+ +Usage:: + + ceph mgr module enable + +Subcommand ``module disable`` will disable an active manager module. + +Usage:: + + ceph mgr module disable + +Subcommand ``metadata`` will report metadata about all manager daemons or, if the name is specified, a single manager daemon. + +Usage:: + + ceph mgr metadata [name] + +Subcommand ``versions`` will report a count of running daemon versions. + +Usage:: + + ceph mgr versions + +Subcommand ``count-metadata`` will report a count of any daemon metadata field. + +Usage:: + + ceph mgr count-metadata + + osd --- @@ -625,12 +698,6 @@ ceph osd crush rule dump {} -Subcommand ``list`` lists crush rules. - -Usage:: - - ceph osd crush rule list - Subcommand ``ls`` lists crush rules. Usage:: @@ -826,6 +893,13 @@ ceph osd pg-temp { [...]} +Subcommand ``force-create-pg`` forces creation of pg . + +Usage:: + + ceph osd force-create-pg + + Subcommand ``pool`` is used for managing data pools. It uses some additional subcommands. @@ -962,7 +1036,8 @@ ceph osd reweight-by-utilization {} {--no-increasing} -Subcommand ``rm`` removes osd(s) [...] in the cluster. +Subcommand ``rm`` removes osd(s) [...] from the OSD map. + Usage:: @@ -1016,6 +1091,18 @@ ceph osd setmaxosd +Subcommand ``set-require-min-compat-client`` enforces the cluster to be backward +compatible with the specified client version. This subcommand prevents you from +making any changes (e.g., crush tunables, or using new features) that +would violate the current setting. Please note, This subcommand will fail if +any connected daemon or client is not compatible with the features offered by +the given . To see the features and releases of all clients connected +to cluster, please see `ceph features`_. + +Usage:: + + ceph osd set-require-min-compat-client + Subcommand ``stat`` prints summary of OSD map. Usage:: @@ -1130,12 +1217,6 @@ ceph pg dump_stuck {inactive|unclean|stale|undersized|degraded [inactive|unclean|stale|undersized|degraded...]} {} -Subcommand ``force_create_pg`` forces creation of pg . - -Usage:: - - ceph pg force_create_pg - Subcommand ``getmap`` gets binary pg map to -o/stdout. Usage:: diff -Nru ceph-12.1.1/doc/man/8/rados.rst ceph-12.1.2/doc/man/8/rados.rst --- ceph-12.1.1/doc/man/8/rados.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/man/8/rados.rst 2017-08-01 17:55:40.000000000 +0000 @@ -159,12 +159,12 @@ :command:`getomapval` [ --omap-key-file *file* ] *name* *key* [ *out-file* ] Dump the hexadecimal value of key in the object map of object name. - If the optional *out-file* argument isn't provided, the value will be + If the optional *out-file* argument is not provided, the value will be written to standard output. :command:`setomapval` [ --omap-key-file *file* ] *name* *key* [ *value* ] Set the value of key in the object map of object name. If the optional - *value* argument isn't provided, the value will be read from standard + *value* argument is not provided, the value will be read from standard input. :command:`rmomapkey` [ --omap-key-file *file* ] *name* *key* diff -Nru ceph-12.1.1/doc/man/8/rbdmap.rst ceph-12.1.2/doc/man/8/rbdmap.rst --- ceph-12.1.1/doc/man/8/rbdmap.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/man/8/rbdmap.rst 2017-08-01 17:55:40.000000000 +0000 @@ -55,7 +55,7 @@ be unmounted and unmapped. ``rbdmap unmap-all`` attempts to unmount and subsequently unmap all currently -mapped RBD images, regardless of whether or not they're listed in the +mapped RBD images, regardless of whether or not they are listed in the configuration file. 
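The ``getomapval``/``setomapval`` wording fixes above concern the CLI front end to object maps; the same operations can be scripted against the librados C++ API. A small sketch follows, with pool name ``rbd`` and object name ``example-obj`` as placeholders::

    // Sketch: set one omap key and read it back, mirroring
    // "rados setomapval" and "rados getomapval".
    #include <rados/librados.hpp>
    #include <iostream>
    #include <map>
    #include <set>
    #include <string>

    int main() {
      librados::Rados cluster;
      cluster.init2("client.admin", "ceph", 0);
      cluster.conf_read_file(nullptr);
      if (cluster.connect() < 0)
        return 1;

      librados::IoCtx io;
      if (cluster.ioctx_create("rbd", io) < 0)   // placeholder pool
        return 1;

      // Equivalent of: rados -p rbd setomapval example-obj mykey myvalue
      std::map<std::string, librados::bufferlist> kv;
      kv["mykey"].append("myvalue");
      librados::ObjectWriteOperation wr;
      wr.omap_set(kv);
      io.operate("example-obj", &wr);

      // Equivalent of: rados -p rbd getomapval example-obj mykey
      std::set<std::string> keys = {"mykey"};
      std::map<std::string, librados::bufferlist> vals;
      int prval = 0;
      librados::ObjectReadOperation rd;
      rd.omap_get_vals_by_keys(keys, &vals, &prval);
      librados::bufferlist ignored;
      io.operate("example-obj", &rd, &ignored);
      if (vals.count("mykey"))
        std::cout << std::string(vals["mykey"].c_str(),
                                 vals["mykey"].length()) << std::endl;
      cluster.shutdown();
      return 0;
    }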
If successful, the ``rbd map`` operation maps the image to a ``/dev/rbdX`` diff -Nru ceph-12.1.1/doc/man/8/rbd.rst ceph-12.1.2/doc/man/8/rbd.rst --- ceph-12.1.1/doc/man/8/rbd.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/man/8/rbd.rst 2017-08-01 17:55:40.000000000 +0000 @@ -181,7 +181,7 @@ associated snapshots within the specified pool. It can also be used against individual images and snapshots. - If the RBD fast-diff feature isn't enabled on images, this operation will + If the RBD fast-diff feature is not enabled on images, this operation will require querying the OSDs for every potential object within the image. :command:`info` *image-spec* | *snap-spec* @@ -217,20 +217,20 @@ This requires image format 2. :command:`resize` (-s | --size *size-in-M/G/T*) [--allow-shrink] *image-spec* - Resizes rbd image. The size parameter also needs to be specified. + Resize rbd image. The size parameter also needs to be specified. The --allow-shrink option lets the size be reduced. :command:`rm` *image-spec* - Deletes an rbd image (including all data blocks). If the image has + Delete an rbd image (including all data blocks). If the image has snapshots, this fails and nothing is deleted. :command:`export` [--export-format *format (1 or 2)*] (*image-spec* | *snap-spec*) [*dest-path*] - Exports image to dest path (use - for stdout). + Export image to dest path (use - for stdout). The --export-format accepts '1' or '2' currently. Format 2 allow us to export not only the content of image, but also the snapshots and other properties, such as image_order, features. :command:`import` [--export-format *format (1 or 2)*] [--image-format *format-id*] [--object-size *size-in-B/K/M*] [--stripe-unit *size-in-B/K/M* --stripe-count *num*] [--image-feature *feature-name*]... [--image-shared] *src-path* [*image-spec*] - Creates a new image and imports its data from path (use - for + Create a new image and imports its data from path (use - for stdin). The import operation will try to create sparse rbd images if possible. For import from stdin, the sparsification unit is the data block size of the destination image (object size). @@ -242,7 +242,7 @@ of image, but also the snapshots and other properties, such as image_order, features. :command:`export-diff` [--from-snap *snap-name*] [--whole-object] (*image-spec* | *snap-spec*) *dest-path* - Exports an incremental diff for an image to dest path (use - for stdout). If + Export an incremental diff for an image to dest path (use - for stdout). If an initial snapshot is specified, only changes since that snapshot are included; otherwise, any regions of the image that contain data are included. The end snapshot is specified using the standard --snap option or @snap syntax (see below). The image diff format includes @@ -258,7 +258,7 @@ currently only support the source incremental diff with stripe_count == 1 :command:`import-diff` *src-path* *image-spec* - Imports an incremental diff of an image and applies it to the current image. If the diff + Import an incremental diff of an image and applies it to the current image. If the diff was generated relative to a start snapshot, we verify that snapshot already exists before continuing. If there was an end snapshot we verify it does not already exist before applying the changes, and create the snapshot when we are done. @@ -270,11 +270,11 @@ whether the region is known to be zeros or may contain other data. 
:command:`cp` (*src-image-spec* | *src-snap-spec*) *dest-image-spec* - Copies the content of a src-image into the newly created dest-image. + Copy the content of a src-image into the newly created dest-image. dest-image will have the same size, object size, and image format as src-image. :command:`mv` *src-image-spec* *dest-image-spec* - Renames an image. Note: rename across pools is not supported. + Rename an image. Note: rename across pools is not supported. :command:`image-meta list` *image-spec* Show metadata held on the image. The first column is the key @@ -290,24 +290,24 @@ Remove metadata key with the value. :command:`object-map rebuild` *image-spec* | *snap-spec* - Rebuilds an invalid object map for the specified image. An image snapshot can be + Rebuild an invalid object map for the specified image. An image snapshot can be specified to rebuild an invalid object map for a snapshot. :command:`snap ls` *image-spec* - Dumps the list of snapshots inside a specific image. + Dump the list of snapshots inside a specific image. :command:`snap create` *snap-spec* - Creates a new snapshot. Requires the snapshot name parameter specified. + Create a new snapshot. Requires the snapshot name parameter specified. :command:`snap rollback` *snap-spec* Rollback image content to snapshot. This will iterate through the entire blocks array and update the data head content to the snapshotted version. :command:`snap rm` [--force] *snap-spec* - Removes the specified snapshot. + Remove the specified snapshot. :command:`snap purge` *image-spec* - Removes all snapshots from an image. + Remove all snapshots from an image. :command:`snap protect` *snap-spec* Protect a snapshot from deletion, so that clones can be made of it @@ -333,19 +333,19 @@ an image. :command:`map` [-o | --options *krbd-options* ] [--read-only] *image-spec* | *snap-spec* - Maps the specified image to a block device via the rbd kernel module. + Map the specified image to a block device via the rbd kernel module. :command:`unmap` [-o | --options *krbd-options* ] *image-spec* | *snap-spec* | *device-path* - Unmaps the block device that was mapped via the rbd kernel module. + Unmap the block device that was mapped via the rbd kernel module. :command:`showmapped` Show the rbd images that are mapped via the rbd kernel module. :command:`nbd map` [--device *device-path*] [--read-only] *image-spec* | *snap-spec* - Maps the specified image to a block device via the rbd-nbd tool. + Map the specified image to a block device via the rbd-nbd tool. :command:`nbd unmap` *device-path* - Unmaps the block device that was mapped via the rbd-nbd tool. + Unmap the block device that was mapped via the rbd-nbd tool. :command:`nbd list` Show the list of used nbd devices via the rbd-nbd tool. @@ -354,11 +354,11 @@ Show the status of the image, including which clients have it open. :command:`feature disable` *image-spec* *feature-name*... - Disables the specified feature on the specified image. Multiple features can + Disable the specified feature on the specified image. Multiple features can be specified. :command:`feature enable` *image-spec* *feature-name*... - Enables the specified feature on the specified image. Multiple features can + Enable the specified feature on the specified image. Multiple features can be specified. :command:`lock list` *image-spec* @@ -383,6 +383,21 @@ --io-total. Defaults are: --io-size 4096, --io-threads 16, --io-total 1G, --io-pattern seq. +:command:`trash ls` [*pool-name*] + List all entries from trash. 
+ +:command:`trash mv` *image-spec* + Move an image to the trash. Images, even ones actively in use by + clones, can be moved to the trash and deleted at a later time. + +:command:`trash rm` *image-id* + Delete an image from the trash. If the image's deferment time has not expired + it cannot be removed unless --force is used. An image that is actively in use by clones + or that has snapshots cannot be removed. + +:command:`trash restore` *image-id* + Restore an image from trash. + Image and snap specs ==================== @@ -561,6 +576,30 @@ rbd lock remove mypool/myimage mylockid client.2485 +To list images from trash:: + + rbd trash ls mypool + +To defer deletion of an image (use *--delay* to set the delay time; the default is 0):: + + rbd trash mv mypool/myimage + +To delete an image from trash (be careful!):: + + rbd trash rm mypool/myimage-id + +To force delete an image from trash (be careful!):: + + rbd trash rm mypool/myimage-id --force + +To restore an image from trash:: + + rbd trash restore mypool/myimage-id + +To restore an image from trash and rename it:: + + rbd trash restore mypool/myimage-id --image mynewimage + Availability ============ diff -Nru ceph-12.1.1/doc/mgr/administrator.rst ceph-12.1.2/doc/mgr/administrator.rst --- ceph-12.1.1/doc/mgr/administrator.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/mgr/administrator.rst 2017-08-01 17:55:40.000000000 +0000 @@ -55,17 +55,20 @@ Calling module commands ----------------------- -Where a module implements command line hooks, using the Ceph CLI's -``tell`` command to call them like this:: +Where a module implements command line hooks, the commands will +be accessible as ordinary Ceph commands:: - ceph tell mgr + ceph + +If you would like to see the list of commands handled by the +manager (where normal ``ceph help`` would show all mon and mgr commands), +you can send a command directly to the manager daemon:: + + ceph tell mgr help Note that it is not necessary to address a particular mgr instance, simply ``mgr`` will pick the current active daemon. -Use the ``help`` command to get a list of available commands from all -modules. - Configuration ------------- diff -Nru ceph-12.1.1/doc/mgr/dashboard.rst ceph-12.1.2/doc/mgr/dashboard.rst --- ceph-12.1.1/doc/mgr/dashboard.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/mgr/dashboard.rst 2017-08-01 17:55:40.000000000 +0000 @@ -23,8 +23,8 @@ also be necessary to configure them separately. The hostname and port can be changed via the configuration key facility:: - ceph config-key put mgr/dashboard/$name/server_addr $IP - ceph config-key put mgr/dashboard/$name/server_port $PORT + ceph config-key set mgr/dashboard/$name/server_addr $IP + ceph config-key set mgr/dashboard/$name/server_port $PORT where ``$name`` is the ID of the ceph-mgr instance that is hosting this dashboard web app. @@ -32,8 +32,8 @@ These settings can also be configured cluster-wide and not manager specific. For example,:: - ceph config-key put mgr/dashboard/server_addr $IP - ceph config-key put mgr/dashboard/server_port $PORT + ceph config-key set mgr/dashboard/server_addr $IP + ceph config-key set mgr/dashboard/server_port $PORT If the port is not configured, the web app will bind to port ``7000``.
If the address it not configured, the web app will bind to ``::``, diff -Nru ceph-12.1.1/doc/mgr/index.rst ceph-12.1.2/doc/mgr/index.rst --- ceph-12.1.1/doc/mgr/index.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/mgr/index.rst 2017-08-01 17:55:40.000000000 +0000 @@ -26,8 +26,9 @@ :maxdepth: 1 Installation and Configuration - Dashboard - RESTful - Zabbix + Dashboard plugin + RESTful plugin + Zabbix plugin + Prometheus plugin Writing plugins diff -Nru ceph-12.1.1/doc/mgr/prometheus.rst ceph-12.1.2/doc/mgr/prometheus.rst --- ceph-12.1.1/doc/mgr/prometheus.rst 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/doc/mgr/prometheus.rst 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,50 @@ +Prometheus plugin +================= + +Provides a Prometheus exporter to pass on Ceph performance counters +from the collection point in ceph-mgr. Ceph-mgr receives MMgrReport +messages from all MgrClient processes (mons and OSDs, for instance) +with performance counter schema data and actual counter data, and keeps +a circular buffer of the last N samples. This plugin creates an HTTP +endpoint (like all Prometheus exporters) and retrieves the latest sample +of every counter when polled (or "scraped" in Prometheus terminology). +The HTTP path and query parameters are ignored; all extant counters +for all reporting entities are returned in text exposition format. +(See the Prometheus `documentation `_.) + +Enabling +-------- + +The *prometheus* module is enabled with:: + + ceph mgr module enable prometheus + +Configuration +------------- + +By default the module will accept HTTP requests on port ``9283`` on all +IPv4 and IPv6 addresses on the host. The port and listen address are both +configurable with ``ceph config-key set``, with keys +``mgr/prometheus/server_addr`` and ``mgr/prometheus/server_port``. +This port is registered with Prometheus's `registry `_. + +Notes +----- + +Counters and gauges are exported; currently histograms and long-running +averages are not. It's possible that Ceph's 2-D histograms could be +reduced to two separate 1-D histograms, and that long-running averages +could be exported as Prometheus' Summary type. + +The names of the stats are exactly as Ceph names them, with +illegal characters ``.`` and ``-`` translated to ``_``. There is one +label applied, ``daemon``, and its value is the daemon.id for the +daemon in question (e.g. ``{daemon=mon.hosta}`` or ``{daemon=osd.11}``). + +Timestamps, as with many Prometheus exporters, are established by +the server's scrape time (Prometheus expects that it is polling the +actual counter process synchronously). It is possible to supply a +timestamp along with the stat report, but the Prometheus team strongly +advises against this. This means that timestamps will be delayed by +an unpredictable amount; it's not clear if this will be problematic, +but it's worth knowing about. diff -Nru ceph-12.1.1/doc/mgr/restful.rst ceph-12.1.2/doc/mgr/restful.rst --- ceph-12.1.1/doc/mgr/restful.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/mgr/restful.rst 2017-08-01 17:55:40.000000000 +0000 @@ -40,15 +40,15 @@ The ``restful.crt`` should then be signed by your organization's CA (certificate authority). 
Once that is done, you can set it with:: - ceph config-key put mgr/restful/$name/crt -i restful.crt - ceph config-key put mgr/restful/$name/key -i restful.key + ceph config-key set mgr/restful/$name/crt -i restful.crt + ceph config-key set mgr/restful/$name/key -i restful.key where ``$name`` is the name of the ``ceph-mgr`` instance (usually the hostname). If all manager instances are to share the same certificate, you can leave off the ``$name`` portion:: - ceph config-key put mgr/restful/crt -i restful.crt - ceph config-key put mgr/restful/key -i restful.key + ceph config-key set mgr/restful/crt -i restful.crt + ceph config-key set mgr/restful/key -i restful.key Configuring IP and port @@ -62,16 +62,16 @@ also be necessary to configure them separately. The IP and port can be changed via the configuration key facility:: - ceph config-key put mgr/restful/$name/server_addr $IP - ceph config-key put mgr/restful/$name/server_port $PORT + ceph config-key set mgr/restful/$name/server_addr $IP + ceph config-key set mgr/restful/$name/server_port $PORT where ``$name`` is the ID of the ceph-mgr daemon (usually the hostname). These settings can also be configured cluster-wide and not manager specific. For example,:: - ceph config-key put mgr/restful/server_addr $IP - ceph config-key put mgr/restful/server_port $PORT + ceph config-key set mgr/restful/server_addr $IP + ceph config-key set mgr/restful/server_port $PORT If the port is not configured, *restful* will bind to port ``8003``. If the address it not configured, the *restful* will bind to ``::``, diff -Nru ceph-12.1.1/doc/mgr/zabbix.rst ceph-12.1.2/doc/mgr/zabbix.rst --- ceph-12.1.1/doc/mgr/zabbix.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/mgr/zabbix.rst 2017-08-01 17:55:40.000000000 +0000 @@ -10,14 +10,14 @@ - Storage utilization Requirements -============ +------------ The plugin requires that the *zabbix_sender* executable is present on *all* machines running ceph-mgr. It can be installed on most distributions using the package manager. Dependencies ------------- +^^^^^^^^^^^^ Installing zabbix_sender can be done under Ubuntu or CentOS using either apt or dnf. @@ -35,7 +35,7 @@ Enabling -======== +-------- Add this to your ceph.conf on nodes where you run ceph-mgr: @@ -50,7 +50,7 @@ Configuration -============= +------------- Two configuration keys are mandatory for the module to work: @@ -71,25 +71,25 @@ - mgr/zabbix/zabbix_sender: /usr/bin/zabbix_sender - mgr/zabbix/interval: 60 -Configurations keys -------------------- +Configuration keys +^^^^^^^^^^^^^^^^^^^ Configuration keys can be set on any machine with the proper cephx credentials, these are usually Monitors where the *client.admin* key is present. :: - ceph config-key put + ceph config-key set For example: :: - ceph config-key put mgr/zabbix/zabbix_host zabbix.localdomain - ceph config-key put mgr/zabbix/identifier ceph.eu-ams02.local + ceph config-key set mgr/zabbix/zabbix_host zabbix.localdomain + ceph config-key set mgr/zabbix/identifier ceph.eu-ams02.local Debugging -========= +--------- Should you want to debug the Zabbix module increase the logging level for ceph-mgr and check the logs. 
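Putting the Zabbix settings above together, a minimal end-to-end configuration might look like the following. This is only a sketch: it assumes the module can be enabled with ``ceph mgr module enable`` (as the other manager plugins in this release are, rather than via ``ceph.conf``), and ``zabbix.example.com`` and ``ceph.eu-ams02.local`` are placeholder values for your own Zabbix server and cluster identifier::

    ceph mgr module enable zabbix
    ceph config-key set mgr/zabbix/zabbix_host zabbix.example.com
    ceph config-key set mgr/zabbix/identifier ceph.eu-ams02.local
    # optional: send data every 120 seconds instead of the default 60
    ceph config-key set mgr/zabbix/interval 120

Any keys that are not set fall back to the defaults listed above.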
diff -Nru ceph-12.1.1/doc/rados/api/librados.rst ceph-12.1.2/doc/rados/api/librados.rst --- ceph-12.1.1/doc/rados/api/librados.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/api/librados.rst 2017-08-01 17:55:40.000000000 +0000 @@ -69,7 +69,7 @@ exit(1); } -In the end, you'll want to close your IO context and connection to RADOS with :c:func:`rados_ioctx_destroy()` and :c:func:`rados_shutdown()`:: +In the end, you will want to close your IO context and connection to RADOS with :c:func:`rados_ioctx_destroy()` and :c:func:`rados_shutdown()`:: rados_ioctx_destroy(io); rados_shutdown(cluster); diff -Nru ceph-12.1.1/doc/rados/configuration/ceph-conf.rst ceph-12.1.2/doc/rados/configuration/ceph-conf.rst --- ceph-12.1.1/doc/rados/configuration/ceph-conf.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/configuration/ceph-conf.rst 2017-08-01 17:55:40.000000000 +0000 @@ -457,6 +457,104 @@ ceph daemon osd.0 config show | less +Reading Configuration Metadata at Runtime +========================================= + +Information about the available configuration options is available via +the ``config help`` command: + +:: + + ceph daemon {daemon-type}.{id} config help | less + + +This metadata is primarily intended to be used when integrating other +software with Ceph, such as graphical user interfaces. The output is +a list of JSON objects, for example: + +:: + + { + "name": "mon_host", + "type": "std::string", + "level": "basic", + "desc": "list of hosts or addresses to search for a monitor", + "long_desc": "This is a comma, whitespace, or semicolon separated list of IP addresses or hostnames. Hostnames are resolved via DNS and all A or AAAA records are included in the search list.", + "default": "", + "daemon_default": "", + "tags": [], + "services": [ + "common" + ], + "see_also": [], + "enum_values": [], + "min": "", + "max": "" + } + +type +____ + +The type of the setting, given as a C++ type name. + +level +_____ + +One of `basic`, `advanced`, `dev`. The `dev` options are not intended +for use outside of development and testing. + +desc +____ + +A short description -- this is a sentence fragment suitable for display +in small spaces like a single line in a list. + +long_desc +_________ + +A full description of what the setting does, this may be as long as needed. + +default +_______ + +The default value, if any. + +daemon_default +______________ + +An alternative default used for daemons (services) as opposed to clients. + +tags +____ + +A list of strings indicating topics to which this setting relates. Examples +of tags are `performance` and `networking`. + +services +________ + +A list of strings indicating which Ceph services the setting relates to, such +as `osd`, `mds`, `mon`. For settings that are relevant to any Ceph client +or server, `common` is used. + +see_also +________ + +A list of strings indicating other configuration options that may also +be of interest to a user setting this option. + +enum_values +___________ + +Optional: a list of strings indicating the valid settings. + +min, max +________ + +Optional: upper and lower (inclusive) bounds on valid settings. 
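Because the output described above is machine-readable, it can be filtered with ordinary JSON tooling. The following is only a sketch; it assumes the ``jq`` utility is installed on the admin host and that the daemon returns the options as a single JSON array, as shown above::

    # show the name and short description of every basic-level option
    ceph daemon osd.0 config help | \
        jq -r '.[] | select(.level == "basic") | "\(.name): \(.desc)"'

``osd.0`` is just an example target; any daemon type and id reachable through ``ceph daemon`` can be queried the same way.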
+ + + Running Multiple Clusters ========================= diff -Nru ceph-12.1.1/doc/rados/configuration/mon-config-ref.rst ceph-12.1.2/doc/rados/configuration/mon-config-ref.rst --- ceph-12.1.1/doc/rados/configuration/mon-config-ref.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/configuration/mon-config-ref.rst 2017-08-01 17:55:40.000000000 +0000 @@ -144,10 +144,10 @@ If Ceph Monitors discovered each other through the Ceph configuration file instead of through the monmap, it would introduce additional risks because the -Ceph configuration files aren't updated and distributed automatically. Ceph +Ceph configuration files are not updated and distributed automatically. Ceph Monitors might inadvertently use an older Ceph configuration file, fail to recognize a Ceph Monitor, fall out of a quorum, or develop a situation where -`Paxos`_ isn't able to determine the current state of the system accurately. +`Paxos`_ is not able to determine the current state of the system accurately. .. index:: Ceph Monitor; bootstrapping monitors @@ -877,7 +877,7 @@ .. tip:: You SHOULD install NTP on your Ceph monitor hosts to ensure that the monitor cluster operates with synchronized clocks. -Clock drift may still be noticeable with NTP even though the discrepancy isn't +Clock drift may still be noticeable with NTP even though the discrepancy is not yet harmful. Ceph's clock drift / clock skew warnings may get triggered even though NTP maintains a reasonable level of synchronization. Increasing your clock drift may be tolerable under such circumstances; however, a number of diff -Nru ceph-12.1.1/doc/rados/configuration/ms-ref.rst ceph-12.1.2/doc/rados/configuration/ms-ref.rst --- ceph-12.1.1/doc/rados/configuration/ms-ref.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/configuration/ms-ref.rst 2017-08-01 17:55:40.000000000 +0000 @@ -102,7 +102,7 @@ :Description: Initial number of worker threads used by each Async Messenger instance. Should be at least equal to highest number of replicas, but you can - decrease it if you're low on CPU core count and/or you host a lot of + decrease it if you are low on CPU core count and/or you host a lot of OSDs on single server. :Type: 64-bit Unsigned Integer :Required: No @@ -135,7 +135,7 @@ workers #1 and #2 to CPU cores #0 and #2, respectively. NOTE: when manually setting affinity, make sure to not assign workers to processors that are virtual CPUs created as an effect of Hyperthreading - or similar technology, because they're slower than regular CPU cores. + or similar technology, because they are slower than regular CPU cores. :Type: String :Required: No :Default: ``(empty)`` diff -Nru ceph-12.1.1/doc/rados/configuration/network-config-ref.rst ceph-12.1.2/doc/rados/configuration/network-config-ref.rst --- ceph-12.1.1/doc/rados/configuration/network-config-ref.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/configuration/network-config-ref.rst 2017-08-01 17:55:40.000000000 +0000 @@ -184,7 +184,7 @@ .. note:: Ceph uses `CIDR`_ notation for subnets (e.g., ``10.0.0.0/24``). -When you've configured your networks, you may restart your cluster or restart +When you have configured your networks, you may restart your cluster or restart each daemon. Ceph daemons bind dynamically, so you do not have to restart the entire cluster at once if you change your network configuration. @@ -370,7 +370,7 @@ ``ms bind ipv6`` :Description: Enables Ceph daemons to bind to IPv6 addresses. 
Currently the - messenger *either* uses IPv4 or IPv6, but it can't do both. + messenger *either* uses IPv4 or IPv6, but it cannot do both. :Type: Boolean :Default: ``false`` :Required: No diff -Nru ceph-12.1.1/doc/rados/configuration/osd-config-ref.rst ceph-12.1.2/doc/rados/configuration/osd-config-ref.rst --- ceph-12.1.1/doc/rados/configuration/osd-config-ref.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/configuration/osd-config-ref.rst 2017-08-01 17:55:40.000000000 +0000 @@ -79,7 +79,7 @@ ``osd client message size cap`` :Description: The largest client data message allowed in memory. -:Type: 64-bit Integer Unsigned +:Type: 64-bit Unsigned Integer :Default: 500MB default. ``500*1024L*1024L`` @@ -377,13 +377,18 @@ token bucket system which when there are sufficient tokens will dequeue high priority queues first. If there are not enough tokens available, queues are dequeued low priority to high priority. - The new WeightedPriorityQueue (``wpq``) dequeues all priorities in + The WeightedPriorityQueue (``wpq``) dequeues all priorities in relation to their priorities to prevent starvation of any queue. WPQ should help in cases where a few OSDs are more overloaded - than others. Requires a restart. + than others. The new mClock based OpClassQueue + (``mclock_opclass``) prioritizes operations based on which class + they belong to (recovery, scrub, snaptrim, client op, osd subop). + And, the mClock based ClientQueue (``mclock_client``) also + incorporates the client identifier in order to promote fairness + between clients. See `QoS Based on mClock`_. Requires a restart. :Type: String -:Valid Choices: prio, wpq +:Valid Choices: prio, wpq, mclock_opclass, mclock_client :Default: ``prio`` @@ -523,6 +528,274 @@ :Type: 32-bit Integer :Default: ``5`` + +QoS Based on mClock +------------------- + +Ceph's use of mClock is currently in the experimental phase and should +be approached with an exploratory mindset. + +Core Concepts +````````````` + +The QoS support of Ceph is implemented using a queueing scheduler +based on `the dmClock algorithm`_. This algorithm allocates the I/O +resources of the Ceph cluster in proportion to weights, and enforces +the constraints of minimum reservation and maximum limitation, so that +the services can compete for the resources fairly. Currently the +*mclock_opclass* operation queue divides Ceph services involving I/O +resources into the following buckets: + +- client op: the iops issued by clients +- osd subop: the iops issued by the primary OSD +- snap trim: the snap trimming related requests +- pg recovery: the recovery related requests +- pg scrub: the scrub related requests + +The resources are partitioned using the following three sets of tags. In other +words, the share of each type of service is controlled by three tags: + +#. reservation: the minimum IOPS allocated for the service. +#. limitation: the maximum IOPS allocated for the service. +#. weight: the proportional share of capacity if there is extra capacity or the + system is oversubscribed. + +In Ceph, operations are graded with a "cost", and the resources allocated +for serving the various services are consumed by these "costs". So, for +example, the more reservation a service has, the more resource it is +guaranteed to possess, as long as it requires it.
For example, assume there are two +services, recovery and client ops: + +- recovery: (r:1, l:5, w:1) +- client ops: (r:2, l:0, w:9) + +The settings above ensure that the recovery won't get more than 5 +requests per second serviced, even if it requires more (see CURRENT +IMPLEMENTATION NOTE below), and no other services are competing with +it. But if the clients start to issue a large amount of I/O requests, +they will not exhaust all the I/O resources either. One request per second +is always allocated for recovery jobs as long as there are any such +requests. So the recovery jobs won't be starved even in a cluster with +high load. In the meantime, the client ops can enjoy a larger +portion of the I/O resource, because their weight is "9", while their +competitor's is "1". Client ops are not clamped by the +limit setting, so they can make use of all the resources if there is no +recovery ongoing. + +Along with *mclock_opclass*, another mClock operation queue named +*mclock_client* is available. It divides operations based on category +but also divides them based on the client making the request. This +helps not only manage the distribution of resources spent on different +classes of operations but also tries to ensure fairness among clients. + +CURRENT IMPLEMENTATION NOTE: the current experimental implementation +does not enforce the limit values. As a first approximation we decided +not to prevent operations that would otherwise enter the operation +sequencer from doing so. + +Subtleties of mClock +```````````````````` + +The reservation and limit values have a unit of requests per +second. The weight, however, does not technically have a unit and the +weights are relative to one another. So if one class of requests has a +weight of 1 and another a weight of 9, then the latter class of +requests should get executed at a 9 to 1 ratio relative to the first class. +However, that will only happen once the reservations are met and those +values include the operations executed under the reservation phase. + +Even though the weights do not have units, one must be careful in +choosing their values due to how the algorithm assigns weight tags to +requests. If the weight is *W*, then for a given class of requests, +the next one that comes in will have a weight tag of *1/W* plus the +previous weight tag or the current time, whichever is larger. That +means if *W* is sufficiently large and therefore *1/W* is sufficiently +small, the calculated tag may never be assigned as it will get a value +of the current time. The ultimate lesson is that values for weight +should not be too large. They should be under the number of requests +one expects to be serviced each second. + +Caveats +``````` + +There are some factors that can reduce the impact of the mClock op +queues within Ceph. First, requests to an OSD are sharded by their +placement group identifier. Each shard has its own mClock queue and +these queues neither interact nor share information among them. The +number of shards can be controlled with the configuration options +``osd_op_num_shards``, ``osd_op_num_shards_hdd``, and +``osd_op_num_shards_ssd``. A lower number of shards will increase the +impact of the mClock queues, but may have other deleterious effects. + +Second, requests are transferred from the operation queue to the +operation sequencer, in which they go through the phases of +execution. The operation queue is where mClock resides and mClock +determines the next op to transfer to the operation sequencer.
The +number of operations allowed in the operation sequencer is a complex +issue. In general we want to keep enough operations in the sequencer +so it's always getting work done on some operations while it's waiting +for disk and network access to complete on other operations. On the +other hand, once an operation is transferred to the operation +sequencer, mClock no longer has control over it. Therefore to maximize +the impact of mClock, we want to keep as few operations in the +operation sequencer as possible. So we have an inherent tension. + +The configuration options that influence the number of operations in +the operation sequencer are ``bluestore_throttle_bytes``, +``bluestore_throttle_deferred_bytes``, +``bluestore_throttle_cost_per_io``, +``bluestore_throttle_cost_per_io_hdd``, and +``bluestore_throttle_cost_per_io_ssd``. + +A third factor that affects the impact of the mClock algorithm is that +we're using a distributed system, where requests are made to multiple +OSDs and each OSD has (can have) multiple shards. Yet we're currently +using the mClock algorithm, which is not distributed (note: dmClock is +the distributed version of mClock). + +Various organizations and individuals are currently experimenting with +mClock as it exists in this code base along with their modifications +to the code base. We hope you'll share you're experiences with your +mClock and dmClock experiments in the ceph-devel mailing list. + + +``osd push per object cost`` + +:Description: the overhead for serving a push op + +:Type: Unsigned Integer +:Default: 1000 + +``osd recovery max chunk`` + +:Description: the maximum total size of data chunks a recovery op can carry. + +:Type: Unsigned Integer +:Default: 8 MiB + + +``osd op queue mclock client op res`` + +:Description: the reservation of client op. + +:Type: Float +:Default: 1000.0 + + +``osd op queue mclock client op wgt`` + +:Description: the weight of client op. + +:Type: Float +:Default: 500.0 + + +``osd op queue mclock client op lim`` + +:Description: the limit of client op. + +:Type: Float +:Default: 1000.0 + + +``osd op queue mclock osd subop res`` + +:Description: the reservation of osd subop. + +:Type: Float +:Default: 1000.0 + + +``osd op queue mclock osd subop wgt`` + +:Description: the weight of osd subop. + +:Type: Float +:Default: 500.0 + + +``osd op queue mclock osd subop lim`` + +:Description: the limit of osd subop. + +:Type: Float +:Default: 0.0 + + +``osd op queue mclock snap res`` + +:Description: the reservation of snap trimming. + +:Type: Float +:Default: 0.0 + + +``osd op queue mclock snap wgt`` + +:Description: the weight of snap trimming. + +:Type: Float +:Default: 1.0 + + +``osd op queue mclock snap lim`` + +:Description: the limit of snap trimming. + +:Type: Float +:Default: 0.001 + + +``osd op queue mclock recov res`` + +:Description: the reservation of recovery. + +:Type: Float +:Default: 0.0 + + +``osd op queue mclock recov wgt`` + +:Description: the weight of recovery. + +:Type: Float +:Default: 1.0 + + +``osd op queue mclock recov lim`` + +:Description: the limit of recovery. + +:Type: Float +:Default: 0.001 + + +``osd op queue mclock scrub res`` + +:Description: the reservation of scrub jobs. + +:Type: Float +:Default: 0.0 + + +``osd op queue mclock scrub wgt`` + +:Description: the weight of scrub jobs. + +:Type: Float +:Default: 1.0 + + +``osd op queue mclock scrub lim`` + +:Description: the limit of scrub jobs. + +:Type: Float +:Default: 0.001 + +.. 
_the dmClock algorithm: https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Gulati.pdf + + .. index:: OSD; backfilling Backfilling @@ -660,7 +933,7 @@ ``osd recovery max chunk`` :Description: The maximum size of a recovered chunk of data to push. -:Type: 64-bit Integer Unsigned +:Type: 64-bit Unsigned Integer :Default: ``8 << 20`` @@ -668,7 +941,7 @@ :Description: The maximum number of recovery operations per OSD that will be newly started when an OSD is recovering. -:Type: 64-bit Integer Unsigned +:Type: 64-bit Unsigned Integer :Default: ``1`` @@ -690,12 +963,30 @@ ``osd recovery sleep`` -:Description: Time to sleep before next recovery. Increasing this value will - slow down recovery operation while client operations will be - less impacted. +:Description: Time in seconds to sleep before next recovery or backfill op. + Increasing this value will slow down recovery operation while + client operations will be less impacted. + +:Type: Float +:Default: ``0`` + + +``osd recovery sleep hdd`` + +:Description: Time in seconds to sleep before next recovery or backfill op + for HDDs. + +:Type: Float +:Default: ``0.1`` + + +``osd recovery sleep ssd`` + +:Description: Time in seconds to sleep before next recovery or backfill op + for SSDs. :Type: Float -:Default: ``0.01`` +:Default: ``0`` Tiering ======= @@ -739,7 +1030,7 @@ ``osd default notify timeout`` :Description: The OSD default notification timeout (in seconds). -:Type: 32-bit Integer Unsigned +:Type: 32-bit Unsigned Integer :Default: ``30`` diff -Nru ceph-12.1.1/doc/rados/deployment/ceph-deploy-keys.rst ceph-12.1.2/doc/rados/deployment/ceph-deploy-keys.rst --- ceph-12.1.1/doc/rados/deployment/ceph-deploy-keys.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/deployment/ceph-deploy-keys.rst 2017-08-01 17:55:40.000000000 +0000 @@ -18,7 +18,7 @@ .. note:: If you have specified multiple monitors in the setup of the cluster, make sure, that all monitors are up and running. If the monitors haven't - formed quorum, ``ceph-create-keys`` will not finish and the keys aren't + formed quorum, ``ceph-create-keys`` will not finish and the keys are not generated. Forget Keys diff -Nru ceph-12.1.1/doc/rados/operations/add-or-rm-mons.rst ceph-12.1.2/doc/rados/operations/add-or-rm-mons.rst --- ceph-12.1.1/doc/rados/operations/add-or-rm-mons.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/add-or-rm-mons.rst 2017-08-01 17:55:40.000000000 +0000 @@ -243,9 +243,9 @@ If monitors discovered each other through the Ceph configuration file instead of through the monmap, it would introduce additional risks because the Ceph -configuration files aren't updated and distributed automatically. Monitors +configuration files are not updated and distributed automatically. Monitors might inadvertently use an older ``ceph.conf`` file, fail to recognize a -monitor, fall out of a quorum, or develop a situation where `Paxos`_ isn't able +monitor, fall out of a quorum, or develop a situation where `Paxos`_ is not able to determine the current state of the system accurately. Consequently, making changes to an existing monitor's IP address must be done with great care. diff -Nru ceph-12.1.1/doc/rados/operations/add-or-rm-osds.rst ceph-12.1.2/doc/rados/operations/add-or-rm-osds.rst --- ceph-12.1.1/doc/rados/operations/add-or-rm-osds.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/add-or-rm-osds.rst 2017-08-01 17:55:40.000000000 +0000 @@ -165,6 +165,32 @@ subsequent releases. 
+Replacing an OSD +---------------- + +When disks fail, or if an administrator wants to reprovision OSDs with a new +backend (for instance, when switching from FileStore to BlueStore), OSDs need to +be replaced. Unlike `Removing the OSD`_, the replaced OSD's id and CRUSH map entry +need to be kept intact after the OSD is destroyed for replacement. + +#. Destroy the OSD first:: + + ceph osd destroy {id} --yes-i-really-mean-it + +#. Zap the disk for the new OSD, if the disk was used before for other purposes. + This is not necessary for a new disk:: + + ceph-disk zap /dev/sdX + +#. Prepare the disk for replacement by using the previously destroyed OSD id:: + + ceph-disk prepare --bluestore /dev/sdX --osd-id {id} --osd-uuid `uuidgen` + +#. Activate the OSD:: + + ceph-disk activate /dev/sdX1 + + Starting the OSD ---------------- @@ -260,7 +286,7 @@ After that, you can observe the data migration which should come to its end. The difference between marking ``out`` the OSD and reweighting it to 0 is that in the first case the weight of the bucket which contains - the OSD isn't changed whereas in the second case the weight of the bucket + the OSD is not changed whereas in the second case the weight of the bucket is updated (and decreased by the OSD weight). The reweight command could sometimes be favoured in the case of a "small" cluster. @@ -287,6 +313,32 @@ ``ceph.conf`` file. If your host has multiple drives, you may need to remove an OSD for each drive by repeating this procedure. +#. Let the cluster forget the OSD first. This step removes the OSD from the CRUSH + map, removes its authentication key, and removes it from the OSD map as + well. Please note that the `purge subcommand`_ was introduced in Luminous; for older + versions, see below :: + + ceph osd purge {id} --yes-i-really-mean-it + +#. Navigate to the host where you keep the master copy of the cluster's + ``ceph.conf`` file. :: + + ssh {admin-host} + cd /etc/ceph + vim ceph.conf + +#. Remove the OSD entry from your ``ceph.conf`` file (if it exists). :: + + [osd.1] + host = {hostname} + +#. From the host where you keep the master copy of the cluster's ``ceph.conf`` file, + copy the updated ``ceph.conf`` file to the ``/etc/ceph`` directory of other + hosts in your cluster. + +If your Ceph cluster is older than Luminous, instead of using ``ceph osd purge``, +you need to perform this step manually: + #. Remove the OSD from the CRUSH map so that it no longer receives data. You may also decompile the CRUSH map, remove the OSD from the device list, remove the @@ -308,23 +360,7 @@ ceph osd rm {osd-num} #for example ceph osd rm 1 - -#. Navigate to the host where you keep the master copy of the cluster's - ``ceph.conf`` file. :: - ssh {admin-host} - cd /etc/ceph - vim ceph.conf - -#. Remove the OSD entry from your ``ceph.conf`` file (if it exists). :: - - [osd.1] - host = {hostname} - -#. From the host where you keep the master copy of the cluster's ``ceph.conf`` file, - copy the updated ``ceph.conf`` file to the ``/etc/ceph`` directory of other - hosts in your cluster. - - .. _Remove an OSD: ../crush-map#removeosd +..
_purge subcommand: /man/8/ceph#osd diff -Nru ceph-12.1.1/doc/rados/operations/control.rst ceph-12.1.2/doc/rados/operations/control.rst --- ceph-12.1.1/doc/rados/operations/control.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/control.rst 2017-08-01 17:55:40.000000000 +0000 @@ -50,7 +50,7 @@ To list the cluster's keys and their capabilities, execute the following:: - ceph auth list + ceph auth ls Placement Group Subsystem @@ -141,16 +141,12 @@ Move an existing bucket from one position in the hierarchy to another. :: - ceph osd crush move {id} {loc1} [{loc2} ...] + ceph osd crush move {id} {loc1} [{loc2} ...] Set the weight of the item given by ``{name}`` to ``{weight}``. :: ceph osd crush reweight {name} {weight} -Create a cluster snapshot. :: - - ceph osd cluster_snap {name} - Mark an OSD as lost. This may result in permanent data loss. Use with caution. :: ceph osd lost {id} [--yes-i-really-mean-it] @@ -189,10 +185,6 @@ ceph osd in {osd-num} -List classes that are loaded in the ceph cluster. :: - - ceph class list - Set or clear the pause flags in the OSD map. If set, no IO requests will be sent to any OSD. Clearing the flags via unpause results in resending pending requests. :: @@ -207,7 +199,7 @@ and forces CRUSH to re-place (1-weight) of the data that would otherwise live on this drive. It does not change the weights assigned to the buckets above the OSD in the crush map, and is a corrective -measure in case the normal CRUSH distribution isn't working out quite +measure in case the normal CRUSH distribution is not working out quite right. For instance, if one of your OSDs is at 90% and the others are at 50%, you could reduce this weight to try and compensate for it. :: diff -Nru ceph-12.1.1/doc/rados/operations/crush-map-edits.rst ceph-12.1.2/doc/rados/operations/crush-map-edits.rst --- ceph-12.1.1/doc/rados/operations/crush-map-edits.rst 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/crush-map-edits.rst 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,654 @@ +Manually editing a CRUSH Map +============================ + +.. note:: Manually editing the CRUSH map is considered an advanced + administrator operation. All CRUSH changes that are + necessary for the overwhelming majority of installations are + possible via the standard ceph CLI and do not require manual + CRUSH map edits. If you have identified a use case where + manual edits *are* necessary, consider contacting the Ceph + developers so that future versions of Ceph can make this + unnecessary. + +To edit an existing CRUSH map: + +#. `Get the CRUSH map`_. +#. `Decompile`_ the CRUSH map. +#. Edit at least one of `Devices`_, `Buckets`_ and `Rules`_. +#. `Recompile`_ the CRUSH map. +#. `Set the CRUSH map`_. + +To activate CRUSH map rules for a specific pool, identify the common ruleset +number for those rules and specify that ruleset number for the pool. See `Set +Pool Values`_ for details. + +.. _Get the CRUSH map: #getcrushmap +.. _Decompile: #decompilecrushmap +.. _Devices: #crushmapdevices +.. _Buckets: #crushmapbuckets +.. _Rules: #crushmaprules +.. _Recompile: #compilecrushmap +.. _Set the CRUSH map: #setcrushmap +.. _Set Pool Values: ../pools#setpoolvalues + +.. _getcrushmap: + +Get a CRUSH Map +--------------- + +To get the CRUSH map for your cluster, execute the following:: + + ceph osd getcrushmap -o {compiled-crushmap-filename} + +Ceph will output (-o) a compiled CRUSH map to the filename you specified. 
Since +the CRUSH map is in a compiled form, you must decompile it first before you can +edit it. + +.. _decompilecrushmap: + +Decompile a CRUSH Map +--------------------- + +To decompile a CRUSH map, execute the following:: + + crushtool -d {compiled-crushmap-filename} -o {decompiled-crushmap-filename} + + +Sections +-------- + +There are six main sections to a CRUSH Map. + +#. **tunables:** The preamble at the top of the map describes any *tunables* + for CRUSH behavior that vary from the historical/legacy CRUSH behavior. These + correct for old bugs, optimizations, or other changes in behavior that have + been made over the years to improve CRUSH's behavior. + +#. **devices:** Devices are individual ``ceph-osd`` daemons that can + store data. + +#. **types**: Bucket ``types`` define the types of buckets used in + your CRUSH hierarchy. Buckets consist of a hierarchical aggregation + of storage locations (e.g., rows, racks, chassis, hosts, etc.) and + their assigned weights. + +#. **buckets:** Once you define bucket types, you must define each node + in the hierarchy, its type, and which devices or other nodes it + contains. + +#. **rules:** Rules define policy about how data is distributed across + devices in the hierarchy. + +#. **choose_args:** Choose_args are alternative weights associated with + the hierarchy that have been adjusted to optimize data placement. A single + choose_args map can be used for the entire cluster, or one can be + created for each individual pool. + + +.. _crushmapdevices: + +CRUSH Map Devices +----------------- + +Devices are individual ``ceph-osd`` daemons that can store data. You +will normally have one defined here for each OSD daemon in your +cluster. Devices are identified by an id (a non-negative integer) and +a name, normally ``osd.N`` where ``N`` is the device id. + +Devices may also have a *device class* associated with them (e.g., +``hdd`` or ``ssd``), allowing them to be conveniently targeted by a +crush rule. + +:: + + # devices + device {num} {osd.name} [class {class}] + +For example:: + + # devices + device 0 osd.0 class ssd + device 1 osd.1 class hdd + device 2 osd.2 + device 3 osd.3 + +In most cases, each device maps to a single ``ceph-osd`` daemon. This +is normally a single storage device, a pair of devices (for example, +one for data and one for a journal or metadata), or in some cases a +small RAID device. + + + + + +CRUSH Map Bucket Types +---------------------- + +The second list in the CRUSH map defines 'bucket' types. Buckets facilitate +a hierarchy of nodes and leaves. Node (or non-leaf) buckets typically represent +physical locations in a hierarchy. Nodes aggregate other nodes or leaves. +Leaf buckets represent ``ceph-osd`` daemons and their corresponding storage +media. + +.. tip:: The term "bucket" used in the context of CRUSH means a node in + the hierarchy, i.e. a location or a piece of physical hardware. It + is a different concept from the term "bucket" when used in the + context of RADOS Gateway APIs. + +To add a bucket type to the CRUSH map, create a new line under your list of +bucket types. Enter ``type`` followed by a unique numeric ID and a bucket name.
+By convention, there is one leaf bucket and it is ``type 0``; however, you may +give it any name you like (e.g., osd, disk, drive, storage, etc.):: + + #types + type {num} {bucket-name} + +For example:: + + # types + type 0 osd + type 1 host + type 2 chassis + type 3 rack + type 4 row + type 5 pdu + type 6 pod + type 7 room + type 8 datacenter + type 9 region + type 10 root + + + +.. _crushmapbuckets: + +CRUSH Map Bucket Hierarchy +-------------------------- + +The CRUSH algorithm distributes data objects among storage devices according +to a per-device weight value, approximating a uniform probability distribution. +CRUSH distributes objects and their replicas according to the hierarchical +cluster map you define. Your CRUSH map represents the available storage +devices and the logical elements that contain them. + +To map placement groups to OSDs across failure domains, a CRUSH map defines a +hierarchical list of bucket types (i.e., under ``#types`` in the generated CRUSH +map). The purpose of creating a bucket hierarchy is to segregate the +leaf nodes by their failure domains, such as hosts, chassis, racks, power +distribution units, pods, rows, rooms, and data centers. With the exception of +the leaf nodes representing OSDs, the rest of the hierarchy is arbitrary, and +you may define it according to your own needs. + +We recommend adapting your CRUSH map to your firms's hardware naming conventions +and using instances names that reflect the physical hardware. Your naming +practice can make it easier to administer the cluster and troubleshoot +problems when an OSD and/or other hardware malfunctions and the administrator +need access to physical hardware. + +In the following example, the bucket hierarchy has a leaf bucket named ``osd``, +and two node buckets named ``host`` and ``rack`` respectively. + +.. ditaa:: + +-----------+ + | {o}rack | + | Bucket | + +-----+-----+ + | + +---------------+---------------+ + | | + +-----+-----+ +-----+-----+ + | {o}host | | {o}host | + | Bucket | | Bucket | + +-----+-----+ +-----+-----+ + | | + +-------+-------+ +-------+-------+ + | | | | + +-----+-----+ +-----+-----+ +-----+-----+ +-----+-----+ + | osd | | osd | | osd | | osd | + | Bucket | | Bucket | | Bucket | | Bucket | + +-----------+ +-----------+ +-----------+ +-----------+ + +.. note:: The higher numbered ``rack`` bucket type aggregates the lower + numbered ``host`` bucket type. + +Since leaf nodes reflect storage devices declared under the ``#devices`` list +at the beginning of the CRUSH map, you do not need to declare them as bucket +instances. The second lowest bucket type in your hierarchy usually aggregates +the devices (i.e., it's usually the computer containing the storage media, and +uses whatever term you prefer to describe it, such as "node", "computer", +"server," "host", "machine", etc.). In high density environments, it is +increasingly common to see multiple hosts/nodes per chassis. You should account +for chassis failure too--e.g., the need to pull a chassis if a node fails may +result in bringing down numerous hosts/nodes and their OSDs. + +When declaring a bucket instance, you must specify its type, give it a unique +name (string), assign it a unique ID expressed as a negative integer (optional), +specify a weight relative to the total capacity/capability of its item(s), +specify the bucket algorithm (usually ``straw``), and the hash (usually ``0``, +reflecting hash algorithm ``rjenkins1``). A bucket may have one or more items. 
+The items may consist of node buckets or leaves. Items may have a weight that +reflects the relative weight of the item. + +You may declare a node bucket with the following syntax:: + + [bucket-type] [bucket-name] { + id [a unique negative numeric ID] + weight [the relative capacity/capability of the item(s)] + alg [the bucket type: uniform | list | tree | straw ] + hash [the hash type: 0 by default] + item [item-name] weight [weight] + } + +For example, using the diagram above, we would define two host buckets +and one rack bucket. The OSDs are declared as items within the host buckets:: + + host node1 { + id -1 + alg straw + hash 0 + item osd.0 weight 1.00 + item osd.1 weight 1.00 + } + + host node2 { + id -2 + alg straw + hash 0 + item osd.2 weight 1.00 + item osd.3 weight 1.00 + } + + rack rack1 { + id -3 + alg straw + hash 0 + item node1 weight 2.00 + item node2 weight 2.00 + } + +.. note:: In the foregoing example, note that the rack bucket does not contain + any OSDs. Rather it contains lower level host buckets, and includes the + sum total of their weight in the item entry. + +.. topic:: Bucket Types + + Ceph supports four bucket types, each representing a tradeoff between + performance and reorganization efficiency. If you are unsure of which bucket + type to use, we recommend using a ``straw`` bucket. For a detailed + discussion of bucket types, refer to + `CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_, + and more specifically to **Section 3.4**. The bucket types are: + + #. **Uniform:** Uniform buckets aggregate devices with **exactly** the same + weight. For example, when firms commission or decommission hardware, they + typically do so with many machines that have exactly the same physical + configuration (e.g., bulk purchases). When storage devices have exactly + the same weight, you may use the ``uniform`` bucket type, which allows + CRUSH to map replicas into uniform buckets in constant time. With + non-uniform weights, you should use another bucket algorithm. + + #. **List**: List buckets aggregate their content as linked lists. Based on + the :abbr:`RUSH (Replication Under Scalable Hashing)` :sub:`P` algorithm, + a list is a natural and intuitive choice for an **expanding cluster**: + either an object is relocated to the newest device with some appropriate + probability, or it remains on the older devices as before. The result is + optimal data migration when items are added to the bucket. Items removed + from the middle or tail of the list, however, can result in a significant + amount of unnecessary movement, making list buckets most suitable for + circumstances in which they **never (or very rarely) shrink**. + + #. **Tree**: Tree buckets use a binary search tree. They are more efficient + than list buckets when a bucket contains a larger set of items. Based on + the :abbr:`RUSH (Replication Under Scalable Hashing)` :sub:`R` algorithm, + tree buckets reduce the placement time to O(log :sub:`n`), making them + suitable for managing much larger sets of devices or nested buckets. + + #. **Straw:** List and Tree buckets use a divide and conquer strategy + in a way that either gives certain items precedence (e.g., those + at the beginning of a list) or obviates the need to consider entire + subtrees of items at all. That improves the performance of the replica + placement process, but can also introduce suboptimal reorganization + behavior when the contents of a bucket change due an addition, removal, + or re-weighting of an item. 
The straw bucket type allows all items to + fairly “compete” against each other for replica placement through a + process analogous to a draw of straws. + +.. topic:: Hash + + Each bucket uses a hash algorithm. Currently, Ceph supports ``rjenkins1``. + Enter ``0`` as your hash setting to select ``rjenkins1``. + + +.. _weightingbucketitems: + +.. topic:: Weighting Bucket Items + + Ceph expresses bucket weights as doubles, which allows for fine + weighting. A weight is the relative difference between device capacities. We + recommend using ``1.00`` as the relative weight for a 1TB storage device. + In such a scenario, a weight of ``0.5`` would represent approximately 500GB, + and a weight of ``3.00`` would represent approximately 3TB. Higher level + buckets have a weight that is the sum total of the leaf items aggregated by + the bucket. + + A bucket item weight is one dimensional, but you may also calculate your + item weights to reflect the performance of the storage drive. For example, + if you have many 1TB drives where some have relatively low data transfer + rate and the others have a relatively high data transfer rate, you may + weight them differently, even though they have the same capacity (e.g., + a weight of 0.80 for the first set of drives with lower total throughput, + and 1.20 for the second set of drives with higher total throughput). + + +.. _crushmaprules: + +CRUSH Map Rules +--------------- + +CRUSH maps support the notion of 'CRUSH rules', which are the rules that +determine data placement for a pool. For large clusters, you will likely create +many pools where each pool may have its own CRUSH ruleset and rules. The default +CRUSH map has a rule for each pool, and one ruleset assigned to each of the +default pools. + +.. note:: In most cases, you will not need to modify the default rules. When + you create a new pool, its default ruleset is ``0``. + + +CRUSH rules define placement and replication strategies or distribution policies +that allow you to specify exactly how CRUSH places object replicas. For +example, you might create a rule selecting a pair of targets for 2-way +mirroring, another rule for selecting three targets in two different data +centers for 3-way mirroring, and yet another rule for erasure coding over six +storage devices. For a detailed discussion of CRUSH rules, refer to +`CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_, +and more specifically to **Section 3.2**. + +A rule takes the following form:: + + rule { + + ruleset + type [ replicated | erasure ] + min_size + max_size + step take [class ] + step [choose|chooseleaf] [firstn|indep] + step emit + } + + +``ruleset`` + +:Description: A means of classifying a rule as belonging to a set of rules. + Activated by `setting the ruleset in a pool`_. + +:Purpose: A component of the rule mask. +:Type: Integer +:Required: Yes +:Default: 0 + +.. _setting the ruleset in a pool: ../pools#setpoolvalues + + +``type`` + +:Description: Describes a rule for either a storage drive (replicated) + or a RAID. + +:Purpose: A component of the rule mask. +:Type: String +:Required: Yes +:Default: ``replicated`` +:Valid Values: Currently only ``replicated`` and ``erasure`` + +``min_size`` + +:Description: If a pool makes fewer replicas than this number, CRUSH will + **NOT** select this rule. + +:Type: Integer +:Purpose: A component of the rule mask. 
+:Required: Yes +:Default: ``1`` + +``max_size`` + +:Description: If a pool makes more replicas than this number, CRUSH will + **NOT** select this rule. + +:Type: Integer +:Purpose: A component of the rule mask. +:Required: Yes +:Default: 10 + + +``step take [class ]`` + +:Description: Takes a bucket name, and begins iterating down the tree. + If the ``device-class`` is specified, it must match + a class previously used when defining a device. All + devices that do not belong to the class are excluded. +:Purpose: A component of the rule. +:Required: Yes +:Example: ``step take data`` + + +``step choose firstn {num} type {bucket-type}`` + +:Description: Selects the number of buckets of the given type. The number is + usually the number of replicas in the pool (i.e., pool size). + + - If ``{num} == 0``, choose ``pool-num-replicas`` buckets (all available). + - If ``{num} > 0 && < pool-num-replicas``, choose that many buckets. + - If ``{num} < 0``, it means ``pool-num-replicas - {num}``. + +:Purpose: A component of the rule. +:Prerequisite: Follows ``step take`` or ``step choose``. +:Example: ``step choose firstn 1 type row`` + + +``step chooseleaf firstn {num} type {bucket-type}`` + +:Description: Selects a set of buckets of ``{bucket-type}`` and chooses a leaf + node from the subtree of each bucket in the set of buckets. The + number of buckets in the set is usually the number of replicas in + the pool (i.e., pool size). + + - If ``{num} == 0``, choose ``pool-num-replicas`` buckets (all available). + - If ``{num} > 0 && < pool-num-replicas``, choose that many buckets. + - If ``{num} < 0``, it means ``pool-num-replicas - {num}``. + +:Purpose: A component of the rule. Usage removes the need to select a device using two steps. +:Prerequisite: Follows ``step take`` or ``step choose``. +:Example: ``step chooseleaf firstn 0 type row`` + + + +``step emit`` + +:Description: Outputs the current value and empties the stack. Typically used + at the end of a rule, but may also be used to pick from different + trees in the same rule. + +:Purpose: A component of the rule. +:Prerequisite: Follows ``step choose``. +:Example: ``step emit`` + +.. important:: To activate one or more rules with a common ruleset number to a + pool, set the ruleset number of the pool. + + +Placing Different Pools on Different OSDS: +========================================== + +Suppose you want to have most pools default to OSDs backed by large hard drives, +but have some pools mapped to OSDs backed by fast solid-state drives (SSDs). +It's possible to have multiple independent CRUSH hierarchies within the same +CRUSH map. 
Define two hierarchies with two different root nodes--one for hard +disks (e.g., "root platter") and one for SSDs (e.g., "root ssd") as shown +below:: + + device 0 osd.0 + device 1 osd.1 + device 2 osd.2 + device 3 osd.3 + device 4 osd.4 + device 5 osd.5 + device 6 osd.6 + device 7 osd.7 + + host ceph-osd-ssd-server-1 { + id -1 + alg straw + hash 0 + item osd.0 weight 1.00 + item osd.1 weight 1.00 + } + + host ceph-osd-ssd-server-2 { + id -2 + alg straw + hash 0 + item osd.2 weight 1.00 + item osd.3 weight 1.00 + } + + host ceph-osd-platter-server-1 { + id -3 + alg straw + hash 0 + item osd.4 weight 1.00 + item osd.5 weight 1.00 + } + + host ceph-osd-platter-server-2 { + id -4 + alg straw + hash 0 + item osd.6 weight 1.00 + item osd.7 weight 1.00 + } + + root platter { + id -5 + alg straw + hash 0 + item ceph-osd-platter-server-1 weight 2.00 + item ceph-osd-platter-server-2 weight 2.00 + } + + root ssd { + id -6 + alg straw + hash 0 + item ceph-osd-ssd-server-1 weight 2.00 + item ceph-osd-ssd-server-2 weight 2.00 + } + + rule data { + ruleset 0 + type replicated + min_size 2 + max_size 2 + step take platter + step chooseleaf firstn 0 type host + step emit + } + + rule metadata { + ruleset 1 + type replicated + min_size 0 + max_size 10 + step take platter + step chooseleaf firstn 0 type host + step emit + } + + rule rbd { + ruleset 2 + type replicated + min_size 0 + max_size 10 + step take platter + step chooseleaf firstn 0 type host + step emit + } + + rule platter { + ruleset 3 + type replicated + min_size 0 + max_size 10 + step take platter + step chooseleaf firstn 0 type host + step emit + } + + rule ssd { + ruleset 4 + type replicated + min_size 0 + max_size 4 + step take ssd + step chooseleaf firstn 0 type host + step emit + } + + rule ssd-primary { + ruleset 5 + type replicated + min_size 5 + max_size 10 + step take ssd + step chooseleaf firstn 1 type host + step emit + step take platter + step chooseleaf firstn -1 type host + step emit + } + +You can then set a pool to use the SSD rule by:: + + ceph osd pool set crush_ruleset 4 + +Similarly, using the ``ssd-primary`` rule will cause each placement group in the +pool to be placed with an SSD as the primary and platters as the replicas. + + +Tuning CRUSH, the hard way +-------------------------- + +If you can ensure that all clients are running recent code, you can +adjust the tunables by extracting the CRUSH map, modifying the values, +and reinjecting it into the cluster. + +* Extract the latest CRUSH map:: + + ceph osd getcrushmap -o /tmp/crush + +* Adjust tunables. These values appear to offer the best behavior + for both large and small clusters we tested with. You will need to + additionally specify the ``--enable-unsafe-tunables`` argument to + ``crushtool`` for this to work. Please use this option with + extreme care.:: + + crushtool -i /tmp/crush --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 -o /tmp/crush.new + +* Reinject modified map:: + + ceph osd setcrushmap -i /tmp/crush.new + +Legacy values +------------- + +For reference, the legacy values for the CRUSH tunables can be set +with:: + + crushtool -i /tmp/crush --set-choose-local-tries 2 --set-choose-local-fallback-tries 5 --set-choose-total-tries 19 --set-chooseleaf-descend-once 0 --set-chooseleaf-vary-r 0 -o /tmp/crush.legacy + +Again, the special ``--enable-unsafe-tunables`` option is required. 
+Further, as noted above, be careful running old versions of the +``ceph-osd`` daemon after reverting to legacy values as the feature +bit is not perfectly enforced. diff -Nru ceph-12.1.1/doc/rados/operations/crush-map.rst ceph-12.1.2/doc/rados/operations/crush-map.rst --- ceph-12.1.1/doc/rados/operations/crush-map.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/crush-map.rst 2017-08-01 17:55:40.000000000 +0000 @@ -27,37 +27,23 @@ replicas are on devices using different shelves, racks, power supplies, controllers, and/or physical locations. -When you create a configuration file and deploy Ceph with ``ceph-deploy``, Ceph -generates a default CRUSH map for your configuration. The default CRUSH map is -fine for your Ceph sandbox environment. However, when you deploy a large-scale -data cluster, you should give significant consideration to developing a custom -CRUSH map, because it will help you manage your Ceph cluster, improve -performance and ensure data safety. - -For example, if an OSD goes down, a CRUSH map can help you to locate -the physical data center, room, row and rack of the host with the failed OSD in -the event you need to use onsite support or replace hardware. - -Similarly, CRUSH may help you identify faults more quickly. For example, if all -OSDs in a particular rack go down simultaneously, the fault may lie with a -network switch or power to the rack rather than the OSDs themselves. - -A custom CRUSH map can also help you identify the physical locations where -Ceph stores redundant copies of data when the placement group(s) associated -with a failed host are in a degraded state. - -.. note:: Lines of code in example boxes may extend past the edge of the box. - Please scroll when reading or copying longer examples. +When you deploy OSDs they are automatically placed within the CRUSH map under a +``host`` node named with the hostname for the host they are running on. This, +combined with the default CRUSH failure domain, ensures that replicas or erasure +code shards are separated across hosts and a single host failure will not +affect availability. For larger clusters, however, administrators should carefully consider their choice of failure domain. Separating replicas across racks, +for example, is common for mid- to large-sized clusters. CRUSH Location ============== -The location of an OSD in terms of the CRUSH map's hierarchy is referred to -as a 'crush location'. This location specifier takes the form of a list of -key and value pairs describing a position. For example, if an OSD is in a -particular row, rack, chassis and host, and is part of the 'default' CRUSH -tree, its crush location could be described as:: +The location of an OSD in terms of the CRUSH map's hierarchy is +referred to as a ``crush location``. This location specifier takes the +form of a list of key and value pairs describing a position. For +example, if an OSD is in a particular row, rack, chassis and host, and +is part of the 'default' CRUSH tree (this is the case for the vast +majority of clusters), its crush location could be described as:: root=default row=a rack=a2 chassis=a2a host=a2a1 @@ -72,38 +58,33 @@ automatically sets a ``ceph-osd`` daemon's location to be ``root=default host=HOSTNAME`` (based on the output from ``hostname -s``). -ceph-crush-location hook ------------------------- - -By default, the ``ceph-crush-location`` utility will generate a CRUSH -location string for a given daemon. The location is based on, in order of -preference: - -#. 
A ``TYPE crush location`` option in ceph.conf. For example, this - is ``osd crush location`` for OSD daemons. -#. A ``crush location`` option in ceph.conf. -#. A default of ``root=default host=HOSTNAME`` where the hostname is - generated with the ``hostname -s`` command. - -In a typical deployment scenario, provisioning software (or the system -administrator) can simply set the 'crush location' field in a host's -ceph.conf to describe that machine's location within the datacenter or -cluster. This will provide location awareness to both Ceph daemons -and clients alike. - -It is possible to manage the CRUSH map entirely manually by toggling -the hook off in the configuration:: +The crush location for an OSD is normally expressed via the ``crush location`` +config option being set in the ``ceph.conf`` file. Each time the OSD starts, +it verifies it is in the correct location in the CRUSH map and, if it is not, +it moved itself. To disable this automatic CRUSH map management, add the +following to your configuration file in the ``[osd]`` section:: osd crush update on start = false + Custom location hooks --------------------- -A customized location hook can be used in place of the generic hook for OSD -daemon placement in the hierarchy. (On startup, each OSD ensures its position is -correct.):: +A customized location hook can be used to generate a more complete +crush location on startup. The sample ``ceph-crush-location`` utility +will generate a CRUSH location string for a given daemon. The +location is based on, in order of preference: - osd crush location hook = /path/to/script +#. A ``crush location`` option in ceph.conf. +#. A default of ``root=default host=HOSTNAME`` where the hostname is + generated with the ``hostname -s`` command. + +This is not useful by itself, as the OSD itself has the exact same +behavior. However, the script can be modified to provide additional +location fields (for example, the rack or datacenter), and then the +hook enabled via the config option:: + + crush location hook = /path/to/customized-ceph-crush-location This hook is passed several arguments (below) and should output a single line to stdout with the CRUSH location description.:: @@ -114,686 +95,185 @@ identifier (the OSD number), and the daemon type is typically ``osd``. -Editing a CRUSH Map -=================== - -To edit an existing CRUSH map: - -#. `Get the CRUSH map`_. -#. `Decompile`_ the CRUSH map. -#. Edit at least one of `Devices`_, `Buckets`_ and `Rules`_. -#. `Recompile`_ the CRUSH map. -#. `Set the CRUSH map`_. - -To activate CRUSH map rules for a specific pool, identify the common ruleset -number for those rules and specify that ruleset number for the pool. See `Set -Pool Values`_ for details. - -.. _Get the CRUSH map: #getcrushmap -.. _Decompile: #decompilecrushmap -.. _Devices: #crushmapdevices -.. _Buckets: #crushmapbuckets -.. _Rules: #crushmaprules -.. _Recompile: #compilecrushmap -.. _Set the CRUSH map: #setcrushmap -.. _Set Pool Values: ../pools#setpoolvalues - -.. _getcrushmap: - -Get a CRUSH Map ---------------- - -To get the CRUSH map for your cluster, execute the following:: - - ceph osd getcrushmap -o {compiled-crushmap-filename} - -Ceph will output (-o) a compiled CRUSH map to the filename you specified. Since -the CRUSH map is in a compiled form, you must decompile it first before you can -edit it. - -.. 
_decompilecrushmap: - -Decompile a CRUSH Map ---------------------- - -To decompile a CRUSH map, execute the following:: - - crushtool -d {compiled-crushmap-filename} -o {decompiled-crushmap-filename} - -Ceph will decompile (-d) the compiled CRUSH map and output (-o) it to the -filename you specified. - - -.. _compilecrushmap: - -Compile a CRUSH Map -------------------- - -To compile a CRUSH map, execute the following:: - - crushtool -c {decompiled-crush-map-filename} -o {compiled-crush-map-filename} - -Ceph will store a compiled CRUSH map to the filename you specified. - - -.. _setcrushmap: - -Set a CRUSH Map ---------------- - -To set the CRUSH map for your cluster, execute the following:: - - ceph osd setcrushmap -i {compiled-crushmap-filename} - -Ceph will input the compiled CRUSH map of the filename you specified as the -CRUSH map for the cluster. - - - -CRUSH Map Parameters -==================== - -There are four main sections to a CRUSH Map. - -#. **Devices:** Devices consist of any object storage device--i.e., the storage - drive corresponding to a ``ceph-osd`` daemon. You should have a device for - each OSD daemon in your Ceph configuration file. - -#. **Bucket Types**: Bucket ``types`` define the types of buckets used in your - CRUSH hierarchy. Buckets consist of a hierarchical aggregation of storage - locations (e.g., rows, racks, chassis, hosts, etc.) and their assigned - weights. - -#. **Bucket Instances:** Once you define bucket types, you must declare bucket - instances for your hosts, and any other failure domain partitioning - you choose. - -#. **Rules:** Rules consist of the manner of selecting buckets. - -If you launched Ceph using one of our Quick Start guides, you'll notice -that you didn't need to create a CRUSH map. Ceph's deployment tools generate -a default CRUSH map that lists devices from the OSDs you defined in your -Ceph configuration file, and it declares a bucket for each host you specified -in the ``[osd]`` sections of your Ceph configuration file. You should create -your own CRUSH maps with buckets that reflect your cluster's failure domains -to better ensure data safety and availability. - -.. note:: The generated CRUSH map doesn't take your larger grained failure - domains into account. So you should modify your CRUSH map to account for - larger grained failure domains such as chassis, racks, rows, data - centers, etc. - - +CRUSH structure +=============== -.. _crushmapdevices: +The CRUSH map consists of, loosely speaking, a hierarchy describing +the physical topology of the cluster, and a set of rules defining +policy about how we place data on those devices. The hierarchy has +devices (``ceph-osd`` daemons) at the leaves, and internal nodes +corresponding to other physical features or groupings: hosts, racks, +rows, datacenters, and so on. The rules describe how replicas are +placed in terms of that hierarchy (e.g., 'three replicas in different +racks'). + +Devices +------- + +Devices are individual ``ceph-osd`` daemons that can store data. You +will normally have one defined here for each OSD daemon in your +cluster. Devices are identified by an id (a non-negative integer) and +a name, normally ``osd.N`` where ``N`` is the device id. + +Devices may also have a *device class* associated with them (e.g., +``hdd`` or ``ssd``), allowing them to be conveniently targetted by a +crush rule. 
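As an illustrative sketch (assuming a Luminous-era build that includes the device class management commands; ``osd.0`` and ``ssd`` below are only examples), you can list the classes currently known to the cluster and explicitly assign a class to an OSD, clearing any existing class first::

    ceph osd crush class ls
    ceph osd crush rm-device-class osd.0
    ceph osd crush set-device-class ssd osd.0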
-CRUSH Map Devices +Types and Buckets ----------------- -To map placement groups to OSDs, a CRUSH map requires a list of OSD devices -(i.e., the names of the OSD daemons from the Ceph configuration file). The list -of devices appears first in the CRUSH map. To declare a device in the CRUSH map, -create a new line under your list of devices, enter ``device`` followed by a -unique numeric ID, followed by the corresponding ``ceph-osd`` daemon instance. -The device class can optionaly be added to group devices so they can be -conveniently targetted by a crush rule. - -:: - - #devices - device {num} {osd.name} [class {class}] - -For example:: - - #devices - device 0 osd.0 class ssd - device 1 osd.1 class hdd - device 2 osd.2 - device 3 osd.3 - -As a general rule, an OSD daemon maps to a single storage drive or to a RAID. - - -CRUSH Map Bucket Types ----------------------- - -The second list in the CRUSH map defines 'bucket' types. Buckets facilitate -a hierarchy of nodes and leaves. Node (or non-leaf) buckets typically represent -physical locations in a hierarchy. Nodes aggregate other nodes or leaves. -Leaf buckets represent ``ceph-osd`` daemons and their corresponding storage -media. - -.. tip:: The term "bucket" used in the context of CRUSH means a node in - the hierarchy, i.e. a location or a piece of physical hardware. It - is a different concept from the term "bucket" when used in the - context of RADOS Gateway APIs. - -To add a bucket type to the CRUSH map, create a new line under your list of -bucket types. Enter ``type`` followed by a unique numeric ID and a bucket name. -By convention, there is one leaf bucket and it is ``type 0``; however, you may -give it any name you like (e.g., osd, disk, drive, storage, etc.):: - - #types - type {num} {bucket-name} - -For example:: - - # types - type 0 osd - type 1 host - type 2 chassis - type 3 rack - type 4 row - type 5 pdu - type 6 pod - type 7 room - type 8 datacenter - type 9 region - type 10 root - - - -.. _crushmapbuckets: - -CRUSH Map Bucket Hierarchy --------------------------- - -The CRUSH algorithm distributes data objects among storage devices according -to a per-device weight value, approximating a uniform probability distribution. -CRUSH distributes objects and their replicas according to the hierarchical -cluster map you define. Your CRUSH map represents the available storage -devices and the logical elements that contain them. - -To map placement groups to OSDs across failure domains, a CRUSH map defines a -hierarchical list of bucket types (i.e., under ``#types`` in the generated CRUSH -map). The purpose of creating a bucket hierarchy is to segregate the -leaf nodes by their failure domains, such as hosts, chassis, racks, power -distribution units, pods, rows, rooms, and data centers. With the exception of -the leaf nodes representing OSDs, the rest of the hierarchy is arbitrary, and -you may define it according to your own needs. - -We recommend adapting your CRUSH map to your firms's hardware naming conventions -and using instances names that reflect the physical hardware. Your naming -practice can make it easier to administer the cluster and troubleshoot -problems when an OSD and/or other hardware malfunctions and the administrator -need access to physical hardware. - -In the following example, the bucket hierarchy has a leaf bucket named ``osd``, -and two node buckets named ``host`` and ``rack`` respectively. - -.. 
ditaa:: - +-----------+ - | {o}rack | - | Bucket | - +-----+-----+ +A bucket is the CRUSH term for internal nodes in the hierarchy: hosts, +racks, rows, etc. The CRUSH map defines a series of *types* that are +used to describe these nodes. By default, these types include: + +- osd (or device) +- host +- chassis +- rack +- row +- pdu +- pod +- room +- datacenter +- region +- root + +Most clusters make use of only a handful of these types, and others +can be defined as needed. + +The hierarchy is built with devices (normally type ``osd``) at the +leaves, interior nodes with non-device types, and a root node of type +``root``. For example, + +.. ditaa:: + + +-----------------+ + | {o}root default | + +--------+--------+ | +---------------+---------------+ | | - +-----+-----+ +-----+-----+ - | {o}host | | {o}host | - | Bucket | | Bucket | - +-----+-----+ +-----+-----+ + +-------+-------+ +-----+-------+ + | {o}host foo | | {o}host bar | + +-------+-------+ +-----+-------+ | | +-------+-------+ +-------+-------+ | | | | +-----+-----+ +-----+-----+ +-----+-----+ +-----+-----+ - | osd | | osd | | osd | | osd | - | Bucket | | Bucket | | Bucket | | Bucket | + | osd.0 | | osd.1 | | osd.2 | | osd.3 | +-----------+ +-----------+ +-----------+ +-----------+ -.. note:: The higher numbered ``rack`` bucket type aggregates the lower - numbered ``host`` bucket type. - -Since leaf nodes reflect storage devices declared under the ``#devices`` list -at the beginning of the CRUSH map, you do not need to declare them as bucket -instances. The second lowest bucket type in your hierarchy usually aggregates -the devices (i.e., it's usually the computer containing the storage media, and -uses whatever term you prefer to describe it, such as "node", "computer", -"server," "host", "machine", etc.). In high density environments, it is -increasingly common to see multiple hosts/nodes per chassis. You should account -for chassis failure too--e.g., the need to pull a chassis if a node fails may -result in bringing down numerous hosts/nodes and their OSDs. - -When declaring a bucket instance, you must specify its type, give it a unique -name (string), assign it a unique ID expressed as a negative integer (optional), -specify a weight relative to the total capacity/capability of its item(s), -specify the bucket algorithm (usually ``straw``), and the hash (usually ``0``, -reflecting hash algorithm ``rjenkins1``). A bucket may have one or more items. -The items may consist of node buckets or leaves. Items may have a weight that -reflects the relative weight of the item. - -You may declare a node bucket with the following syntax:: - - [bucket-type] [bucket-name] { - id [a unique negative numeric ID] - weight [the relative capacity/capability of the item(s)] - alg [the bucket type: uniform | list | tree | straw ] - hash [the hash type: 0 by default] - item [item-name] weight [weight] - } - -For example, using the diagram above, we would define two host buckets -and one rack bucket. The OSDs are declared as items within the host buckets:: - - host node1 { - id -1 - alg straw - hash 0 - item osd.0 weight 1.00 - item osd.1 weight 1.00 - } - - host node2 { - id -2 - alg straw - hash 0 - item osd.2 weight 1.00 - item osd.3 weight 1.00 - } - - rack rack1 { - id -3 - alg straw - hash 0 - item node1 weight 2.00 - item node2 weight 2.00 - } - -.. note:: In the foregoing example, note that the rack bucket does not contain - any OSDs. 
Rather it contains lower level host buckets, and includes the - sum total of their weight in the item entry. - -.. topic:: Bucket Types - - Ceph supports four bucket types, each representing a tradeoff between - performance and reorganization efficiency. If you are unsure of which bucket - type to use, we recommend using a ``straw`` bucket. For a detailed - discussion of bucket types, refer to - `CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_, - and more specifically to **Section 3.4**. The bucket types are: +Each node (device or bucket) in the hierarchy has a *weight* +associated with it, indicating the relative proportion of the total +data that device or hierarchy subtree should store. Weights are set +at the leaves, indicating the size of the device, and automatically +sum up the tree from there, such that the weight of the default node +will be the total of all devices contained beneath it. Normally +weights are in units of terabytes (TB). + +You can get a simple view the CRUSH hierarchy for your cluster, +including the weights, with:: + + ceph osd crush tree + +Rules +----- + +Rules define policy about how data is distributed across the devices +in the hierarchy. + +CRUSH rules define placement and replication strategies or +distribution policies that allow you to specify exactly how CRUSH +places object replicas. For example, you might create a rule selecting +a pair of targets for 2-way mirroring, another rule for selecting +three targets in two different data centers for 3-way mirroring, and +yet another rule for erasure coding over six storage devices. For a +detailed discussion of CRUSH rules, refer to `CRUSH - Controlled, +Scalable, Decentralized Placement of Replicated Data`_, and more +specifically to **Section 3.2**. + +In almost all cases, CRUSH rules can be created via the CLI by +specifying the *pool type* they will be used for (replicated or +erasure coded), the *failure domain*, and optionally a *device class*. +In rare cases rules must be written by hand by manually editing the +CRUSH map. - #. **Uniform:** Uniform buckets aggregate devices with **exactly** the same - weight. For example, when firms commission or decommission hardware, they - typically do so with many machines that have exactly the same physical - configuration (e.g., bulk purchases). When storage devices have exactly - the same weight, you may use the ``uniform`` bucket type, which allows - CRUSH to map replicas into uniform buckets in constant time. With - non-uniform weights, you should use another bucket algorithm. - - #. **List**: List buckets aggregate their content as linked lists. Based on - the :abbr:`RUSH (Replication Under Scalable Hashing)` :sub:`P` algorithm, - a list is a natural and intuitive choice for an **expanding cluster**: - either an object is relocated to the newest device with some appropriate - probability, or it remains on the older devices as before. The result is - optimal data migration when items are added to the bucket. Items removed - from the middle or tail of the list, however, can result in a significant - amount of unnecessary movement, making list buckets most suitable for - circumstances in which they **never (or very rarely) shrink**. - - #. **Tree**: Tree buckets use a binary search tree. They are more efficient - than list buckets when a bucket contains a larger set of items. 
Based on - the :abbr:`RUSH (Replication Under Scalable Hashing)` :sub:`R` algorithm, - tree buckets reduce the placement time to O(log :sub:`n`), making them - suitable for managing much larger sets of devices or nested buckets. - - #. **Straw:** List and Tree buckets use a divide and conquer strategy - in a way that either gives certain items precedence (e.g., those - at the beginning of a list) or obviates the need to consider entire - subtrees of items at all. That improves the performance of the replica - placement process, but can also introduce suboptimal reorganization - behavior when the contents of a bucket change due an addition, removal, - or re-weighting of an item. The straw bucket type allows all items to - fairly “compete” against each other for replica placement through a - process analogous to a draw of straws. - -.. topic:: Hash - - Each bucket uses a hash algorithm. Currently, Ceph supports ``rjenkins1``. - Enter ``0`` as your hash setting to select ``rjenkins1``. - - -.. _weightingbucketitems: - -.. topic:: Weighting Bucket Items - - Ceph expresses bucket weights as doubles, which allows for fine - weighting. A weight is the relative difference between device capacities. We - recommend using ``1.00`` as the relative weight for a 1TB storage device. - In such a scenario, a weight of ``0.5`` would represent approximately 500GB, - and a weight of ``3.00`` would represent approximately 3TB. Higher level - buckets have a weight that is the sum total of the leaf items aggregated by - the bucket. - - A bucket item weight is one dimensional, but you may also calculate your - item weights to reflect the performance of the storage drive. For example, - if you have many 1TB drives where some have relatively low data transfer - rate and the others have a relatively high data transfer rate, you may - weight them differently, even though they have the same capacity (e.g., - a weight of 0.80 for the first set of drives with lower total throughput, - and 1.20 for the second set of drives with higher total throughput). - - -.. _crushmaprules: - -CRUSH Map Rules ---------------- - -CRUSH maps support the notion of 'CRUSH rules', which are the rules that -determine data placement for a pool. For large clusters, you will likely create -many pools where each pool may have its own CRUSH ruleset and rules. The default -CRUSH map has a rule for each pool, and one ruleset assigned to each of the -default pools. - -.. note:: In most cases, you will not need to modify the default rules. When - you create a new pool, its default ruleset is ``0``. - - -CRUSH rules define placement and replication strategies or distribution policies -that allow you to specify exactly how CRUSH places object replicas. For -example, you might create a rule selecting a pair of targets for 2-way -mirroring, another rule for selecting three targets in two different data -centers for 3-way mirroring, and yet another rule for erasure coding over six -storage devices. For a detailed discussion of CRUSH rules, refer to -`CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_, -and more specifically to **Section 3.2**. - -A rule takes the following form:: - - rule { - - ruleset - type [ replicated | erasure ] - min_size - max_size - step take [class ] - step [choose|chooseleaf] [firstn|indep] - step emit - } - - -``ruleset`` - -:Description: A means of classifying a rule as belonging to a set of rules. - Activated by `setting the ruleset in a pool`_. - -:Purpose: A component of the rule mask. 
-:Type: Integer -:Required: Yes -:Default: 0 - -.. _setting the ruleset in a pool: ../pools#setpoolvalues - - -``type`` - -:Description: Describes a rule for either a storage drive (replicated) - or a RAID. - -:Purpose: A component of the rule mask. -:Type: String -:Required: Yes -:Default: ``replicated`` -:Valid Values: Currently only ``replicated`` and ``erasure`` - -``min_size`` - -:Description: If a pool makes fewer replicas than this number, CRUSH will - **NOT** select this rule. - -:Type: Integer -:Purpose: A component of the rule mask. -:Required: Yes -:Default: ``1`` - -``max_size`` - -:Description: If a pool makes more replicas than this number, CRUSH will - **NOT** select this rule. - -:Type: Integer -:Purpose: A component of the rule mask. -:Required: Yes -:Default: 10 - - -``step take [class ]`` +You can see what rules are defined for your cluster with:: -:Description: Takes a bucket name, and begins iterating down the tree. - If the ``device-class`` is specified, it must match - a class previously used when defining a device. All - devices that do not belong to the class are excluded. -:Purpose: A component of the rule. -:Required: Yes -:Example: ``step take data`` - - -``step choose firstn {num} type {bucket-type}`` - -:Description: Selects the number of buckets of the given type. The number is - usually the number of replicas in the pool (i.e., pool size). - - - If ``{num} == 0``, choose ``pool-num-replicas`` buckets (all available). - - If ``{num} > 0 && < pool-num-replicas``, choose that many buckets. - - If ``{num} < 0``, it means ``pool-num-replicas - {num}``. - -:Purpose: A component of the rule. -:Prerequisite: Follows ``step take`` or ``step choose``. -:Example: ``step choose firstn 1 type row`` - - -``step chooseleaf firstn {num} type {bucket-type}`` - -:Description: Selects a set of buckets of ``{bucket-type}`` and chooses a leaf - node from the subtree of each bucket in the set of buckets. The - number of buckets in the set is usually the number of replicas in - the pool (i.e., pool size). - - - If ``{num} == 0``, choose ``pool-num-replicas`` buckets (all available). - - If ``{num} > 0 && < pool-num-replicas``, choose that many buckets. - - If ``{num} < 0``, it means ``pool-num-replicas - {num}``. - -:Purpose: A component of the rule. Usage removes the need to select a device using two steps. -:Prerequisite: Follows ``step take`` or ``step choose``. -:Example: ``step chooseleaf firstn 0 type row`` - - - -``step emit`` - -:Description: Outputs the current value and empties the stack. Typically used - at the end of a rule, but may also be used to pick from different - trees in the same rule. - -:Purpose: A component of the rule. -:Prerequisite: Follows ``step choose``. -:Example: ``step emit`` - -.. important:: To activate one or more rules with a common ruleset number to a - pool, set the ruleset number of the pool. - - - -Primary Affinity -================ - -When a Ceph Client reads or writes data, it always contacts the primary OSD in -the acting set. For set ``[2, 3, 4]``, ``osd.2`` is the primary. Sometimes an -OSD isn't well suited to act as a primary compared to other OSDs (e.g., it has -a slow disk or a slow controller). To prevent performance bottlenecks -(especially on read operations) while maximizing utilization of your hardware, -you can set a Ceph OSD's primary affinity so that CRUSH is less likely to use -the OSD as a primary in an acting set. 
:: - - ceph osd primary-affinity + ceph osd crush rule ls -Primary affinity is ``1`` by default (*i.e.,* an OSD may act as a primary). You -may set the OSD primary range from ``0-1``, where ``0`` means that the OSD may -**NOT** be used as a primary and ``1`` means that an OSD may be used as a -primary. When the weight is ``< 1``, it is less likely that CRUSH will select -the Ceph OSD Daemon to act as a primary. +You can view the contents of the rules with:: + ceph osd crush rule dump -Placing Different Pools on Different OSDS: -========================================== -Suppose you want to have most pools default to OSDs backed by large hard drives, -but have some pools mapped to OSDs backed by fast solid-state drives (SSDs). -It's possible to have multiple independent CRUSH hierarchies within the same -CRUSH map. Define two hierarchies with two different root nodes--one for hard -disks (e.g., "root platter") and one for SSDs (e.g., "root ssd") as shown -below:: - - device 0 osd.0 - device 1 osd.1 - device 2 osd.2 - device 3 osd.3 - device 4 osd.4 - device 5 osd.5 - device 6 osd.6 - device 7 osd.7 - - host ceph-osd-ssd-server-1 { - id -1 - alg straw - hash 0 - item osd.0 weight 1.00 - item osd.1 weight 1.00 - } - - host ceph-osd-ssd-server-2 { - id -2 - alg straw - hash 0 - item osd.2 weight 1.00 - item osd.3 weight 1.00 - } - - host ceph-osd-platter-server-1 { - id -3 - alg straw - hash 0 - item osd.4 weight 1.00 - item osd.5 weight 1.00 - } - - host ceph-osd-platter-server-2 { - id -4 - alg straw - hash 0 - item osd.6 weight 1.00 - item osd.7 weight 1.00 - } - - root platter { - id -5 - alg straw - hash 0 - item ceph-osd-platter-server-1 weight 2.00 - item ceph-osd-platter-server-2 weight 2.00 - } - - root ssd { - id -6 - alg straw - hash 0 - item ceph-osd-ssd-server-1 weight 2.00 - item ceph-osd-ssd-server-2 weight 2.00 - } - - rule data { - ruleset 0 - type replicated - min_size 2 - max_size 2 - step take platter - step chooseleaf firstn 0 type host - step emit - } - - rule metadata { - ruleset 1 - type replicated - min_size 0 - max_size 10 - step take platter - step chooseleaf firstn 0 type host - step emit - } - - rule rbd { - ruleset 2 - type replicated - min_size 0 - max_size 10 - step take platter - step chooseleaf firstn 0 type host - step emit - } - - rule platter { - ruleset 3 - type replicated - min_size 0 - max_size 10 - step take platter - step chooseleaf firstn 0 type host - step emit - } - - rule ssd { - ruleset 4 - type replicated - min_size 0 - max_size 4 - step take ssd - step chooseleaf firstn 0 type host - step emit - } - - rule ssd-primary { - ruleset 5 - type replicated - min_size 5 - max_size 10 - step take ssd - step chooseleaf firstn 1 type host - step emit - step take platter - step chooseleaf firstn -1 type host - step emit - } +Weights sets +------------ -You can then set a pool to use the SSD rule by:: +A *weight set* is an alternative set of weights to use when +calculating data placement. The normal weights associated with each +device in the CRUSH map are set based on the device size and indicate +how much data we *should* be storing where. However, because CRUSH is +based on a pseudorandom placement process, there is always some +variation from this ideal distribution, the same way that rolling a +dice sixty times will not result in rolling exactly 10 ones and 10 +sixes. Weight sets allow the cluster to do a numerical optimization +based on the specifics of your cluster (hierarchy, pools, etc.) to achieve +a balanced distribution. 
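To get a sense of how far your cluster currently deviates from that ideal distribution, you can inspect per-OSD utilization; the ``VAR`` column in the output shows each OSD's utilization relative to the cluster average (a quick check that does not depend on weight sets)::

    ceph osd df tree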
+ +There are two types of weight sets supported: + + #. A **compat** weight set is a single alternative set of weights for + each device and node in the cluster. This is not well-suited for + correcting for all anomalies (for example, placement groups for + different pools may be different sizes and have different load + levels, but will be mostly treated the same by the balancer). + However, compat weight sets have the huge advantage that they are + *backward compatible* with previous versions of Ceph, which means + that even though weight sets were first introduced in Luminous + v12.2.z, older clients (e.g., firefly) can still connect to the + cluster when a compat weight set is being used to balance data. + #. A **per-pool** weight set is more flexible in that it allows + placement to be optimized for each data pool. Additionally, + weights can be adjusted for each position of placement, allowing + the optimizer to correct for a suble skew of data toward devices + with small weights relative to their peers (and effect that is + usually only apparently in very large clusters but which can cause + balancing problems). + +When weight sets are in use, the weights associated with each node in +the hierarchy is visible as a separate column (labeled either +``(compat)`` or the pool name) from the command:: + + ceph osd crush tree + +When both *compat* and *per-pool* weight sets are in use, data +placement for a particular pool will use its own per-pool weight set +if present. If not, it will use the compat weight set if present. If +neither are present, it will use the normal CRUSH weights. + +Although weight sets can be set up and manipulated by hand, it is +recommended that the *balancer* module be enabled to do so +automatically. - ceph osd pool set crush_ruleset 4 -Similarly, using the ``ssd-primary`` rule will cause each placement group in the -pool to be placed with an SSD as the primary and platters as the replicas. +Modifying the CRUSH map +======================= .. _addosd: Add/Move an OSD -=============== +--------------- -To add or move an OSD in the CRUSH map of a running cluster, execute the -``ceph osd crush set``. For Argonaut (v 0.48), execute the following:: +.. note: OSDs are normally automatically added to the CRUSH map when + the OSD is created. This command is rarely needed. - ceph osd crush set {id} {name} {weight} pool={pool-name} [{bucket-type}={bucket-name} ...] - -For Bobtail (v 0.56), execute the following:: +To add or move an OSD in the CRUSH map of a running cluster:: - ceph osd crush set {id-or-name} {weight} root={pool-name} [{bucket-type}={bucket-name} ...] + ceph osd crush set {name} {weight} root={root} [{bucket-type}={bucket-name} ...] Where: -``id`` - -:Description: The numeric ID of the OSD. -:Type: Integer -:Required: Yes -:Example: ``0`` - - ``name`` :Description: The full name of the OSD. @@ -804,7 +284,7 @@ ``weight`` -:Description: The CRUSH weight for the OSD. +:Description: The CRUSH weight for the OSD, normally its size measure in terabytes (TB). :Type: Double :Required: Yes :Example: ``2.0`` @@ -812,7 +292,7 @@ ``root`` -:Description: The root of the tree in which the OSD resides. +:Description: The root node of the tree in which the OSD resides (normally ``default``) :Type: Key/value pair. :Required: Yes :Example: ``root=default`` @@ -826,19 +306,23 @@ :Example: ``datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1`` -The following example adds ``osd.0`` to the hierarchy, or moves the OSD from a -previous location. 
:: +The following example adds ``osd.0`` to the hierarchy, or moves the +OSD from a previous location. :: + + ceph osd crush set osd.0 1.0 root=default datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1 - ceph osd crush set osd.0 1.0 root=default datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1 +Adjust OSD weight +----------------- -Adjust an OSD's CRUSH Weight -============================ +.. note: Normally OSDs automatically add themselves to the CRUSH map + with the correct weight when they are created. This command + is rarely needed. To adjust an OSD's crush weight in the CRUSH map of a running cluster, execute the following:: - ceph osd crush reweight {name} {weight} + ceph osd crush reweight {name} {weight} Where: @@ -861,11 +345,15 @@ .. _removeosd: Remove an OSD -============= +------------- + +.. note: OSDs are normally removed from the CRUSH as part of the + ``ceph osd purge`` command. This command is rarely needed. -To remove an OSD from the CRUSH map of a running cluster, execute the following:: +To remove an OSD from the CRUSH map of a running cluster, execute the +following:: - ceph osd crush remove {name} + ceph osd crush remove {name} Where: @@ -876,12 +364,21 @@ :Required: Yes :Example: ``osd.0`` + Add a Bucket -============ +------------ -To add a bucket in the CRUSH map of a running cluster, execute the ``ceph osd crush add-bucket`` command:: +.. note: Buckets are normally implicitly created when an OSD is added + that specifies a ``{bucket-type}={bucket-name}`` as part of its + location and a bucket with that name does not already exist. This + command is typically used when manually adjusting the structure of the + hierarchy after OSDs have been created (for example, to move a + series of hosts underneath a new rack-level bucket). - ceph osd crush add-bucket {bucket-name} {bucket-type} +To add a bucket in the CRUSH map of a running cluster, execute the +``ceph osd crush add-bucket`` command:: + + ceph osd crush add-bucket {bucket-name} {bucket-type} Where: @@ -903,15 +400,15 @@ The following example adds the ``rack12`` bucket to the hierarchy:: - ceph osd crush add-bucket rack12 rack + ceph osd crush add-bucket rack12 rack Move a Bucket -============= +------------- -To move a bucket to a different location or position in the CRUSH map hierarchy, -execute the following:: +To move a bucket to a different location or position in the CRUSH map +hierarchy, execute the following:: - ceph osd crush move {bucket-name} {bucket-type}={bucket-name}, [...] + ceph osd crush move {bucket-name} {bucket-type}={bucket-name}, [...] Where: @@ -930,11 +427,11 @@ :Example: ``datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1`` Remove a Bucket -=============== +--------------- To remove a bucket from the CRUSH map hierarchy, execute the following:: - ceph osd crush remove {bucket-name} + ceph osd crush remove {bucket-name} .. note:: A bucket must be empty before removing it from the CRUSH hierarchy. @@ -949,7 +446,180 @@ The following example removes the ``rack12`` bucket from the hierarchy:: - ceph osd crush remove rack12 + ceph osd crush remove rack12 + +Creating a compat weight set +---------------------------- + +.. note: This step is normally done automatically by the ``balancer`` + module when enabled. 
+ +To create a *compat* weight set:: + + ceph osd crush weight-set create-compat + +Weights for the compat weight set can be adjusted with:: + + ceph osd crush weight-set reweight-compat {name} {weight} + +The compat weight set can be destroyed with:: + + ceph osd crush weight-set rm-compat + +Creating per-pool weight sets +----------------------------- + +To create a weight set for a specific pool,:: + + ceph osd crush weight-set create {pool-name} {mode} + +.. note:: Per-pool weight sets require that all servers and daemons + run Luminous v12.2.z or later. + +Where: + +``pool-name`` + +:Description: The name of a RADOS pool +:Type: String +:Required: Yes +:Example: ``rbd`` + +``mode`` + +:Description: Either ``flat`` or ``positional``. A *flat* weight set + has a single weight for each device or bucket. A + *positional* weight set has a potentially different + weight for each position in the resulting placement + mapping. For example, if a pool has a replica count of + 3, then a positional weight set will have three weights + for each device and bucket. +:Type: String +:Required: Yes +:Example: ``flat`` + +To adjust the weight of an item in a weight set:: + + ceph osd crush weight-set reweight {pool-name} {item-name} {weight [...]} + +To list existing weight sets,:: + + ceph osd crush weight-set ls + +To remove a weight set,:: + + ceph osd crush weight-set rm {pool-name} + +Creating a rule for a replicated pool +------------------------------------- + +For a replicated pool, the primary decision when creating the CRUSH +rule is what the failure domain is going to be. For example, if a +failure domain of ``host`` is selected, then CRUSH will ensure that +each replica of the data is stored on a different host. If ``rack`` +is selected, then each replica will be stored in a different rack. +What failure domain you choose primarily depends on the size of your +cluster and how your hierarchy is structured. + +Normally, the entire cluster hierarchy is nested beneath a root node +named ``default``. If you have customized your hierarchy, you may +want to create a rule nested at some other node in the hierarchy. It +doesn't matter what type is associated with that node (it doesn't have +to be a ``root`` node). + +It is also possible to create a rule that restricts data placement to +a specific *class* of device. By default, Ceph OSDs automatically +classify themselves as either ``hdd`` or ``ssd``, depending on the +underlying type of device being used. These classes can also be +customized. + +To create a replicated rule,:: + + ceph osd crush rule create-replicated {name} {root} {failure-domain-type} [{class}] + +Where: + +``name`` + +:Description: The name of the rule +:Type: String +:Required: Yes +:Example: ``rbd-rule`` + +``root`` + +:Description: The name of the node under which data should be placed. +:Type: String +:Required: Yes +:Example: ``default`` + +``failure-domain-type`` + +:Description: The type of CRUSH nodes across which we should separate replicas. +:Type: String +:Required: Yes +:Example: ``rack`` + +``class`` + +:Description: The device class data should be placed on. +:Type: String +:Required: No +:Example: ``ssd`` + +Creating a rule for an erasure coded pool +----------------------------------------- + +For an erasure-coded pool, the same basic decisions need to be made as +with a replicated pool: what is the failure domain, what node in the +hierarchy will data be placed under (usually ``default``), and will +placement be restricted to a specific device class. 
Erasure code +pools are created a bit differently, however, because they need to be +constructed carefully based on the erasure code being used. For this reason, +you must include this information in the *erasure code profile*. A CRUSH +rule will then be created from that either explicitly or automatically when +the profile is used to create a pool. + +The erasure code profiles can be listed with:: + + ceph osd erasure-code-profile ls + +An existing profile can be viewed with:: + + ceph osd erasure-code-profile get {profile-name} + +Normally profiles should never be modified; instead, a new profile +should be created and used when creating a new pool or creating a new +rule for an existing pool. + +An erasure code profile consists of a set of key=value pairs. Most of +these control the behavior of the erasure code that is encoding data +in the pool. Those that begin with ``crush-``, however, affect the +CRUSH rule that is created. + +The erasure code profile properties of interest are: + + * **crush-root**: the name of the CRUSH node to place data under [default: ``default``]. + * **crush-failure-domain**: the CRUSH type to separate erasure-coded shards across [default: ``host``]. + * **crush-device-class**: the device class to place data on [default: none, meaning all devices are used]. + * **k** and **m** (and, for the ``lrc`` plugin, **l**): these determine the number of erasure code shards, affecting the resulting CRUSH rule. + +Once a profile is defined, you can create a CRUSH rule with:: + + ceph osd crush rule create-erasure {name} {profile-name} + +.. note: When creating a new pool, it is not actually necessary to + explicitly create the rule. If the erasure code profile alone is + specified and the rule argument is left off then Ceph will create + the CRUSH rule automatically. + +Deleting rules +-------------- + +Rules that are not in use by pools can be deleted with:: + + ceph osd crush rule rm {rule-name} + Tunables ======== @@ -1206,6 +876,8 @@ * ``argonaut``: the legacy values supported by the original argonaut release * ``bobtail``: the values supported by the bobtail release * ``firefly``: the values supported by the firefly release + * ``hammer``: the values supported by the hammer release + * ``jewel``: the values supported by the jewel release * ``optimal``: the best (ie optimal) values of the current version of Ceph * ``default``: the default values of a new cluster installed from scratch. These values, which depend on the current version of Ceph, @@ -1221,40 +893,27 @@ Note that this may result in some data movement. -Tuning CRUSH, the hard way --------------------------- - -If you can ensure that all clients are running recent code, you can -adjust the tunables by extracting the CRUSH map, modifying the values, -and reinjecting it into the cluster. - -* Extract the latest CRUSH map:: - - ceph osd getcrushmap -o /tmp/crush - -* Adjust tunables. These values appear to offer the best behavior - for both large and small clusters we tested with. You will need to - additionally specify the ``--enable-unsafe-tunables`` argument to - ``crushtool`` for this to work. Please use this option with - extreme care.:: +.. 
_CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.com/wp-content/uploads/2016/08/weil-crush-sc06.pdf - crushtool -i /tmp/crush --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 -o /tmp/crush.new -* Reinject modified map:: +Primary Affinity +================ - ceph osd setcrushmap -i /tmp/crush.new +When a Ceph Client reads or writes data, it always contacts the primary OSD in +the acting set. For set ``[2, 3, 4]``, ``osd.2`` is the primary. Sometimes an +OSD is not well suited to act as a primary compared to other OSDs (e.g., it has +a slow disk or a slow controller). To prevent performance bottlenecks +(especially on read operations) while maximizing utilization of your hardware, +you can set a Ceph OSD's primary affinity so that CRUSH is less likely to use +the OSD as a primary in an acting set. :: -Legacy values -------------- + ceph osd primary-affinity -For reference, the legacy values for the CRUSH tunables can be set -with:: +Primary affinity is ``1`` by default (*i.e.,* an OSD may act as a primary). You +may set the OSD primary range from ``0-1``, where ``0`` means that the OSD may +**NOT** be used as a primary and ``1`` means that an OSD may be used as a +primary. When the weight is ``< 1``, it is less likely that CRUSH will select +the Ceph OSD Daemon to act as a primary. - crushtool -i /tmp/crush --set-choose-local-tries 2 --set-choose-local-fallback-tries 5 --set-choose-total-tries 19 --set-chooseleaf-descend-once 0 --set-chooseleaf-vary-r 0 -o /tmp/crush.legacy -Again, the special ``--enable-unsafe-tunables`` option is required. -Further, as noted above, be careful running old versions of the -``ceph-osd`` daemon after reverting to legacy values as the feature -bit is not perfectly enforced. -.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.com/wp-content/uploads/2016/08/weil-crush-sc06.pdf diff -Nru ceph-12.1.1/doc/rados/operations/health-checks.rst ceph-12.1.2/doc/rados/operations/health-checks.rst --- ceph-12.1.1/doc/rados/operations/health-checks.rst 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/health-checks.rst 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,545 @@ + +============= +Health checks +============= + +Overview +======== + +There is a finite set of possible health messages that a Ceph cluster can +raise -- these are defined as *health checks* which have unique identifiers. + +The identifier is a terse pseudo-human-readable (i.e. like a variable name) +string. It is intended to enable tools (such as UIs) to make sense of +health checks, and present them in a way that reflects their meaning. + +This page lists the health checks that are raised by the monitor and manager +daemons. In addition to these, you may also see health checks that originate +from MDS daemons (see :doc:`/cephfs/health-messages`), and health checks +that are defined by ceph-mgr python modules. + +Definitions +=========== + + +OSDs +---- + +OSD_DOWN +________ + +One or more OSDs are marked down. The ceph-osd daemon may have been +stopped, or peer OSDs may be unable to reach the OSD over the network. +Common causes include a stopped or crashed daemon, a down host, or a +network outage. + +Verify the host is healthy, the daemon is started, and network is +functioning. If the daemon has crashed, the daemon log file +(``/var/log/ceph/ceph-osd.*``) may contain debugging information. + +OSD__DOWN +_____________________ + +(e.g. 
OSD_HOST_DOWN, OSD_ROOT_DOWN) + +All the OSDs within a particular CRUSH subtree are marked down, for example +all OSDs on a host. + +OSD_ORPHAN +__________ + +An OSD is referenced in the CRUSH map hierarchy but does not exist. + +The OSD can be removed from the CRUSH hierarchy with:: + + ceph osd crush rm osd. + +OSD_OUT_OF_ORDER_FULL +_____________________ + +The utilization thresholds for `backfillfull`, `nearfull`, `full`, +and/or `failsafe_full` are not ascending. In particular, we expect +`backfillfull < nearfull`, `nearfull < full`, and `full < +failsafe_full`. + +The thresholds can be adjusted with:: + + ceph osd set-backfillfull-ratio + ceph osd set-nearfull-ratio + ceph osd set-full-ratio + + +OSD_FULL +________ + +One or more OSDs has exceeded the `full` threshold and is preventing +the cluster from servicing writes. + +Utilization by pool can be checked with:: + + ceph df + +The currently defined `full` ratio can be seen with:: + + ceph osd dump | grep full_ratio + +A short-term workaround to restore write availability is to raise the full +threshold by a small amount:: + + ceph osd set-full-ratio + +New storage should be added to the cluster by deploying more OSDs or +existing data should be deleted in order to free up space. + +OSD_BACKFILLFULL +________________ + +One or more OSDs has exceeded the `backfillfull` threshold, which will +prevent data from being allowed to rebalance to this device. This is +an early warning that rebalancing may not be able to complete and that +the cluster is approaching full. + +Utilization by pool can be checked with:: + + ceph df + +OSD_NEARFULL +____________ + +One or more OSDs has exceeded the `nearfull` threshold. This is an early +warning that the cluster is approaching full. + +Utilization by pool can be checked with:: + + ceph df + +OSDMAP_FLAGS +____________ + +One or more cluster flags of interest has been set. These flags include: + +* *full* - the cluster is flagged as full and cannot service writes +* *pauserd*, *pausewr* - paused reads or writes +* *noup* - OSDs are not allowed to start +* *nodown* - OSD failure reports are being ignored, such that the + monitors will not mark OSDs `down` +* *noin* - OSDs that were previously marked `out` will not be marked + back `in` when they start +* *noout* - down OSDs will not automatically be marked out after the + configured interval +* *nobackfill*, *norecover*, *norebalance* - recovery or data + rebalancing is suspended +* *noscrub*, *nodeep_scrub* - scrubbing is disabled +* *notieragent* - cache tiering activity is suspended + +With the exception of *full*, these flags can be set or cleared with:: + + ceph osd set + ceph osd unset + +OSD_FLAGS +_________ + +One or more OSDs has a per-OSD flag of interest set. These flags include: + +* *noup*: OSD is not allowed to start +* *nodown*: failure reports for this OSD will be ignored +* *noin*: if this OSD was previously marked `out` automatically + after a failure, it will not be marked in when it stats +* *noout*: if this OSD is down it will not automatically be marked + `out` after the configured interval + +Per-OSD flags can be set and cleared with:: + + ceph osd add- + ceph osd rm- + +For example, :: + + ceph osd rm-nodown osd.123 + +OLD_CRUSH_TUNABLES +__________________ + +The CRUSH map is using very old settings and should be updated. 
The +oldest tunables that can be used (i.e., the oldest client version that +can connect to the cluster) without triggering this health warning is +determined by the ``mon_crush_min_required_version`` config option. +See :doc:`/rados/operations/crush-map/#tunables` for more information. + +OLD_CRUSH_STRAW_CALC_VERSION +____________________________ + +The CRUSH map is using an older, non-optimal method for calculating +intermediate weight values for ``straw`` buckets. + +The CRUSH map should be updated to use the newer method +(``straw_calc_version=1``). See +:doc:`/rados/operations/crush-map/#tunables` for more information. + +CACHE_POOL_NO_HIT_SET +_____________________ + +One or more cache pools is not configured with a *hit set* to track +utilization, which will prevent the tiering agent from identifying +cold objects to flush and evict from the cache. + +Hit sets can be configured on the cache pool with:: + + ceph osd pool set <pool-name> hit_set_type <type> + ceph osd pool set <pool-name> hit_set_period <period-in-seconds> + ceph osd pool set <pool-name> hit_set_count <number-of-hitsets> + ceph osd pool set <pool-name> hit_set_fpp <target-false-positive-rate> + +OSD_NO_SORTBITWISE +__________________ + +No pre-luminous v12.y.z OSDs are running but the ``sortbitwise`` flag has not +been set. + +The ``sortbitwise`` flag must be set before luminous v12.y.z or newer +OSDs can start. You can safely set the flag with:: + + ceph osd set sortbitwise + +POOL_FULL +_________ + +One or more pools has reached its quota and is no longer allowing writes. + +Pool quotas and utilization can be seen with:: + + ceph df detail + +You can either raise the pool quota with:: + + ceph osd pool set-quota <pool-name> max_objects <num-objects> + ceph osd pool set-quota <pool-name> max_bytes <num-bytes> + +or delete some existing data to reduce utilization. + + +Data health (pools & placement groups) +-------------------------------------- + +PG_AVAILABILITY +_______________ + +Data availability is reduced, meaning that the cluster is unable to +service potential read or write requests for some data in the cluster. +Specifically, one or more PGs is in a state that does not allow IO +requests to be serviced. Problematic PG states include *peering*, +*stale*, *incomplete*, and the lack of *active* (if those conditions do not clear +quickly). + +Detailed information about which PGs are affected is available from:: + + ceph health detail + +In most cases the root cause is that one or more OSDs is currently +down; see the discussion for ``OSD_DOWN`` above. + +The state of specific problematic PGs can be queried with:: + + ceph tell <pgid> query + +PG_DEGRADED +___________ + +Data redundancy is reduced for some data, meaning the cluster does not +have the desired number of replicas for all data (for replicated +pools) or erasure code fragments (for erasure coded pools). +Specifically, one or more PGs: + +* has the *degraded* or *undersized* flag set, meaning there are not + enough instances of that placement group in the cluster; +* has not had the *clean* flag set for some time. + +Detailed information about which PGs are affected is available from:: + + ceph health detail + +In most cases the root cause is that one or more OSDs is currently +down; see the discussion for ``OSD_DOWN`` above. + +The state of specific problematic PGs can be queried with:: + + ceph tell <pgid> query + + +PG_DEGRADED_FULL +________________ + +Data redundancy may be reduced or at risk for some data due to a lack +of free space in the cluster.
Specifically, one or more PGs has the +*backfill_toofull* or *recovery_toofull* flag set, meaning that the +cluster is unable to migrate or recover data because one or more OSDs +is above the *backfillfull* threshold. + +See the discussion for *OSD_BACKFILLFULL* or *OSD_FULL* above for +steps to resolve this condition. + +PG_DAMAGED +__________ + +Data scrubbing has discovered some problems with data consistency in +the cluster. Specifically, one or more PGs has the *inconsistent* or +*snaptrim_error* flag set, indicating an earlier scrub operation +found a problem, or that the *repair* flag is set, meaning a repair +for such an inconsistency is currently in progress. + +See :doc:`pg-repair` for more information. + +OSD_SCRUB_ERRORS +________________ + +Recent OSD scrubs have uncovered inconsistencies. This error is generally +paired with *PG_DAMAGED* (see above). + +See :doc:`pg-repair` for more information. + +CACHE_POOL_NEAR_FULL +____________________ + +A cache tier pool is nearly full. Full in this context is determined +by the ``target_max_bytes`` and ``target_max_objects`` properties on +the cache pool. Once the pool reaches the target threshold, write +requests to the pool may block while data is flushed and evicted +from the cache, a state that normally leads to very high latencies and +poor performance. + +The cache pool target size can be adjusted with:: + + ceph osd pool set <cache-pool-name> target_max_bytes <bytes> + ceph osd pool set <cache-pool-name> target_max_objects <objects> + +Normal cache flush and evict activity may also be throttled due to reduced +availability or performance of the base tier, or overall cluster load. + +TOO_FEW_PGS +___________ + +The number of PGs in use in the cluster is below the configurable +threshold of ``mon_pg_warn_min_per_osd`` PGs per OSD. This can lead +to suboptimal distribution and balance of data across the OSDs in +the cluster, and similarly reduce overall performance. + +This may be an expected condition if data pools have not yet been +created. + +The PG count for existing pools can be increased or new pools can be +created. Please refer to +:doc:`placement-groups#Choosing-the-number-of-Placement-Groups` for +more information. + +TOO_MANY_PGS +____________ + +The number of PGs in use in the cluster is above the configurable +threshold of ``mon_pg_warn_max_per_osd`` PGs per OSD. This can lead +to higher memory utilization for OSD daemons, slower peering after +cluster state changes (like OSD restarts, additions, or removals), and +higher load on the Manager and Monitor daemons. + +The ``pg_num`` value for existing pools cannot currently be reduced. +However, the ``pgp_num`` value can, which effectively collocates some +PGs on the same sets of OSDs, mitigating some of the negative impacts +described above. The ``pgp_num`` value can be adjusted with:: + + ceph osd pool set <pool-name> pgp_num <value> + +Please refer to +:doc:`placement-groups#Choosing-the-number-of-Placement-Groups` for +more information. + +SMALLER_PGP_NUM +_______________ + +One or more pools has a ``pgp_num`` value less than ``pg_num``. This +is normally an indication that the PG count was increased without +also increasing the number of placement targets (``pgp_num``). + +This is sometimes done deliberately to separate out the `split` step +when the PG count is adjusted from the data migration that is needed +when ``pgp_num`` is changed.
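To check whether a given pool is affected, you can compare the two values directly (``<pool-name>`` is a placeholder for the pool in question)::

    ceph osd pool get <pool-name> pg_num
    ceph osd pool get <pool-name> pgp_num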
+ +This is normally resolved by setting ``pgp_num`` to match ``pg_num``, +triggering the data migration, with:: + + ceph osd pool set <pool-name> pgp_num <pg-num-value> + + +MANY_OBJECTS_PER_PG +___________________ + +One or more pools has an average number of objects per PG that is +significantly higher than the overall cluster average. The specific +threshold is controlled by the ``mon_pg_warn_max_object_skew`` +configuration value. + +This is usually an indication that the pool(s) containing most of the +data in the cluster have too few PGs, and/or that other pools that do +not contain as much data have too many PGs. See the discussion of +*TOO_MANY_PGS* above. + +The threshold can be raised to silence the health warning by adjusting +the ``mon_pg_warn_max_object_skew`` config option on the monitors. + +POOL_APP_NOT_ENABLED +____________________ + +A pool exists that contains one or more objects but has not been +tagged for use by a particular application. + +Resolve this warning by labeling the pool for use by an application. For +example, if the pool is used by RBD,:: + + rbd pool init <pool-name> + +If the pool is being used by a custom application 'foo', you can also label +via the low-level command:: + + ceph osd pool application enable <pool-name> foo + +For more information, see :doc:`pools.rst#associate-pool-to-application`. + +POOL_FULL +_________ + +One or more pools has reached (or is very close to reaching) its +quota. The threshold to trigger this error condition is controlled by +the ``mon_pool_quota_crit_threshold`` configuration option. + +Pool quotas can be adjusted up or down (or removed) with:: + + ceph osd pool set-quota <pool-name> max_bytes <bytes> + ceph osd pool set-quota <pool-name> max_objects <objects> + +Setting the quota value to 0 will disable the quota. + +POOL_NEAR_FULL +______________ + +One or more pools is approaching its quota. The threshold to trigger +this warning condition is controlled by the +``mon_pool_quota_warn_threshold`` configuration option. + +Pool quotas can be adjusted up or down (or removed) with:: + + ceph osd pool set-quota <pool-name> max_bytes <bytes> + ceph osd pool set-quota <pool-name> max_objects <objects> + +Setting the quota value to 0 will disable the quota. + +OBJECT_MISPLACED +________________ + +One or more objects in the cluster is not stored on the node the +cluster would like it to be stored on. This is an indication that +data migration due to some recent cluster change has not yet completed. + +Misplaced data is not a dangerous condition in and of itself; data +consistency is never at risk, and old copies of objects are never +removed until the desired number of new copies (in the desired +locations) are present. + +OBJECT_UNFOUND +______________ + +One or more objects in the cluster cannot be found. Specifically, the +OSDs know that a new or updated copy of an object should exist, but a +copy of that version of the object has not been found on OSDs that are +currently online. + +Read or write requests to unfound objects will block. + +Ideally, a down OSD can be brought back online that has the more +recent copy of the unfound object. Candidate OSDs can be identified from the +peering state for the PG(s) responsible for the unfound object:: + + ceph tell <pgid> query + +If the latest copy of the object is not available, the cluster can be +told to roll back to a previous version of the object. See +:doc:`troubleshooting-pg#Unfound-objects` for more information. + +REQUEST_SLOW +____________ + +One or more OSD requests is taking a long time to process. This can +be an indication of extreme load, a slow storage device, or a software +bug.
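Which OSDs are implicated, and roughly how long requests have been blocked, is normally summarized in the detailed health output::

    ceph health detail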
+ +The request queue on the OSD(s) in question can be queried with the +following command, executed from the OSD host:: + + ceph daemon osd. ops + +A summary of the slowest recent requests can be seen with:: + + ceph daemon osd. dump_historic_ops + +The location of an OSD can be found with:: + + ceph osd find osd. + +REQUEST_STUCK +_____________ + +One or more OSD requests has been blocked for an extremely long time. +This is an indication that either the cluster has been unhealthy for +an extended period of time (e.g., not enough running OSDs) or there is +some internal problem with the OSD. See the dicussion of +*REQUEST_SLOW* above. + +PG_NOT_SCRUBBED +_______________ + +One or more PGs has not been scrubbed recently. PGs are normally +scrubbed every ``mon_scrub_interval`` seconds, and this warning +triggers when ``mon_warn_not_scrubbed`` such intervals have elapsed +without a scrub. + +PGs will not scrub if they are not flagged as *clean*, which may +happen if they are misplaced or degraded (see *PG_AVAILABILITY* and +*PG_DEGRADED* above). + +You can manually initiate a scrub of a clean PG with:: + + ceph pg scrub + +PG_NOT_DEEP_SCRUBBED +____________________ + +One or more PGs has not been deep scrubbed recently. PGs are normally +scrubbed every ``osd_deep_mon_scrub_interval`` seconds, and this warning +triggers when ``mon_warn_not_deep_scrubbed`` such intervals have elapsed +without a scrub. + +PGs will not (deep) scrub if they are not flagged as *clean*, which may +happen if they are misplaced or degraded (see *PG_AVAILABILITY* and +*PG_DEGRADED* above). + +You can manually initiate a scrub of a clean PG with:: + + ceph pg deep-scrub + +CephFS +------ + +FS_WITH_FAILED_MDS +__________________ + + +FS_DEGRADED +___________ + + +MDS_INSUFFICIENT_STANDBY +________________________ + + +MDS_DAMAGED +___________ + + diff -Nru ceph-12.1.1/doc/rados/operations/index.rst ceph-12.1.2/doc/rados/operations/index.rst --- ceph-12.1.1/doc/rados/operations/index.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/index.rst 2017-08-01 17:55:40.000000000 +0000 @@ -14,6 +14,7 @@ :maxdepth: 1 operating + health-checks monitoring monitoring-osd-pg user-management @@ -35,7 +36,9 @@ erasure-code cache-tiering placement-groups + upmap crush-map + crush-map-edits diff -Nru ceph-12.1.1/doc/rados/operations/monitoring-osd-pg.rst ceph-12.1.2/doc/rados/operations/monitoring-osd-pg.rst --- ceph-12.1.1/doc/rados/operations/monitoring-osd-pg.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/monitoring-osd-pg.rst 2017-08-01 17:55:40.000000000 +0000 @@ -10,7 +10,7 @@ finding the `placement group`_ and the underlying OSDs at root of the problem. .. tip:: A fault in one part of the cluster may prevent you from accessing a - particular object, but that doesn't mean that you can't access other objects. + particular object, but that doesn't mean that you cannot access other objects. When you run into a fault, don't panic. Just follow the steps for monitoring your OSDs and placement groups. Then, begin troubleshooting. @@ -73,7 +73,7 @@ If the number of OSDs that are ``in`` the cluster is more than the number of OSDs that are ``up``, execute the following command to identify the ``ceph-osd`` -daemons that aren't running:: +daemons that are not running:: ceph osd tree @@ -221,7 +221,7 @@ few cases: - You are reaching your ``near full ratio`` or ``full ratio``. 
-- Your data isn't getting distributed across the cluster due to an +- Your data is not getting distributed across the cluster due to an error in your CRUSH configuration. @@ -458,7 +458,7 @@ current state. During that time period, the OSD may reflect a ``recovering`` state. -Recovery isn't always trivial, because a hardware failure might cause a +Recovery is not always trivial, because a hardware failure might cause a cascading failure of multiple OSDs. For example, a network switch for a rack or cabinet may fail, which can cause the OSDs of a number of host machines to fall behind the current state of the cluster. Each one of the OSDs must recover once @@ -487,11 +487,11 @@ requests when it is ready. During the backfill operations, you may see one of several states: -``backfill_wait`` indicates that a backfill operation is pending, but isn't +``backfill_wait`` indicates that a backfill operation is pending, but is not underway yet; ``backfill`` indicates that a backfill operation is underway; and, ``backfill_too_full`` indicates that a backfill operation was requested, but couldn't be completed due to insufficient storage capacity. When a -placement group can't be backfilled, it may be considered ``incomplete``. +placement group cannot be backfilled, it may be considered ``incomplete``. Ceph provides a number of settings to manage the load spike associated with reassigning placement groups to an OSD (especially a new OSD). By default, @@ -519,7 +519,7 @@ ----- While Ceph uses heartbeats to ensure that hosts and daemons are running, the -``ceph-osd`` daemons may also get into a ``stuck`` state where they aren't +``ceph-osd`` daemons may also get into a ``stuck`` state where they are not reporting statistics in a timely manner (e.g., a temporary network fault). By default, OSD daemons report their placement group, up thru, boot and failure statistics every half second (i.e., ``0.5``), which is more frequent than the @@ -537,8 +537,8 @@ Identifying Troubled PGs ======================== -As previously noted, a placement group isn't necessarily problematic just -because its state isn't ``active+clean``. Generally, Ceph's ability to self +As previously noted, a placement group is not necessarily problematic just +because its state is not ``active+clean``. Generally, Ceph's ability to self repair may not be working when placement groups get stuck. The stuck states include: diff -Nru ceph-12.1.1/doc/rados/operations/monitoring.rst ceph-12.1.2/doc/rados/operations/monitoring.rst --- ceph-12.1.1/doc/rados/operations/monitoring.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/monitoring.rst 2017-08-01 17:55:40.000000000 +0000 @@ -6,8 +6,11 @@ cluster. Monitoring a cluster typically involves checking OSD status, monitor status, placement group status and metadata server status. -Interactive Mode -================ +Using the command line +====================== + +Interactive mode +---------------- To run the ``ceph`` tool in interactive mode, type ``ceph`` at the command line with no arguments. For example:: @@ -17,72 +20,58 @@ ceph> status ceph> quorum_status ceph> mon_status - -Checking Cluster Health -======================= - -After you start your cluster, and before you start reading and/or -writing data, check your cluster's health first. 
You can check on the -health of your Ceph cluster with the following:: - - ceph health +Non-default paths +----------------- If you specified non-default locations for your configuration or keyring, you may specify their locations:: ceph -c /path/to/conf -k /path/to/keyring health -Upon starting the Ceph cluster, you will likely encounter a health -warning such as ``HEALTH_WARN XXX num placement groups stale``. Wait a few moments and check -it again. When your cluster is ready, ``ceph health`` should return a message -such as ``HEALTH_OK``. At that point, it is okay to begin using the cluster. +Checking a Cluster's Status +=========================== -Watching a Cluster -================== +After you start your cluster, and before you start reading and/or +writing data, check your cluster's status first. -To watch the cluster's ongoing events, open a new terminal. Then, enter:: +To check a cluster's status, execute the following:: - ceph -w + ceph status + +Or:: -Ceph will print each event. For example, a tiny Ceph cluster consisting of -one monitor, and two OSDs may print the following:: + ceph -s + +In interactive mode, type ``status`` and press **Enter**. :: + + ceph> status + +Ceph will print the cluster status. For example, a tiny Ceph demonstration +cluster with one of each service may print the following: + +:: + + cluster: + id: 477e46f1-ae41-4e43-9c8f-72c918ab0a20 + health: HEALTH_OK + + services: + mon: 1 daemons, quorum a + mgr: x(active) + mds: 1/1/1 up {0=a=up:active} + osd: 1 osds: 1 up, 1 in + + data: + pools: 2 pools, 16 pgs + objects: 21 objects, 2246 bytes + usage: 546 GB used, 384 GB / 931 GB avail + pgs: 16 active+clean - cluster b370a29d-9287-4ca3-ab57-3d824f65e339 - health HEALTH_OK - monmap e1: 1 mons at {ceph1=10.0.0.8:6789/0}, election epoch 2, quorum 0 ceph1 - osdmap e63: 2 osds: 2 up, 2 in - pgmap v41338: 952 pgs, 20 pools, 17130 MB data, 2199 objects - 115 GB used, 167 GB / 297 GB avail - 952 active+clean - - 2014-06-02 15:45:21.655871 osd.0 [INF] 17.71 deep-scrub ok - 2014-06-02 15:45:47.880608 osd.1 [INF] 1.0 scrub ok - 2014-06-02 15:45:48.865375 osd.1 [INF] 1.3 scrub ok - 2014-06-02 15:45:50.866479 osd.1 [INF] 1.4 scrub ok - 2014-06-02 15:45:01.345821 mon.0 [INF] pgmap v41339: 952 pgs: 952 active+clean; 17130 MB data, 115 GB used, 167 GB / 297 GB avail - 2014-06-02 15:45:05.718640 mon.0 [INF] pgmap v41340: 952 pgs: 1 active+clean+scrubbing+deep, 951 active+clean; 17130 MB data, 115 GB used, 167 GB / 297 GB avail - 2014-06-02 15:45:53.997726 osd.1 [INF] 1.5 scrub ok - 2014-06-02 15:45:06.734270 mon.0 [INF] pgmap v41341: 952 pgs: 1 active+clean+scrubbing+deep, 951 active+clean; 17130 MB data, 115 GB used, 167 GB / 297 GB avail - 2014-06-02 15:45:15.722456 mon.0 [INF] pgmap v41342: 952 pgs: 952 active+clean; 17130 MB data, 115 GB used, 167 GB / 297 GB avail - 2014-06-02 15:46:06.836430 osd.0 [INF] 17.75 deep-scrub ok - 2014-06-02 15:45:55.720929 mon.0 [INF] pgmap v41343: 952 pgs: 1 active+clean+scrubbing+deep, 951 active+clean; 17130 MB data, 115 GB used, 167 GB / 297 GB avail - - -The output provides: - -- Cluster ID -- Cluster health status -- The monitor map epoch and the status of the monitor quorum -- The OSD map epoch and the status of OSDs -- The placement group map version -- The number of placement groups and pools -- The *notional* amount of data stored and the number of objects stored; and, -- The total amount of data stored. .. topic:: How Ceph Calculates Data Usage - The ``used`` value reflects the *actual* amount of raw storage used. 
The + The ``usage`` value reflects the *actual* amount of raw storage used. The ``xxx GB / xxx GB`` value means the amount available (the lesser number) of the overall storage capacity of the cluster. The notional number reflects the size of the stored data before it is replicated, cloned or snapshotted. @@ -91,6 +80,96 @@ storage capacity for cloning and snapshotting. +Watching a Cluster +================== + +In addition to local logging by each daemon, Ceph clusters maintain +a *cluster log* that records high level events about the whole system. +This is logged to disk on monitor servers (as ``/var/log/ceph/ceph.log`` by +default), but can also be monitored via the command line. + +To follow the cluster log, use the following command + +:: + + ceph -w + +Ceph will print the status of the system, followed by each log message as it +is emitted. For example: + +:: + + cluster: + id: 477e46f1-ae41-4e43-9c8f-72c918ab0a20 + health: HEALTH_OK + + services: + mon: 1 daemons, quorum a + mgr: x(active) + mds: 1/1/1 up {0=a=up:active} + osd: 1 osds: 1 up, 1 in + + data: + pools: 2 pools, 16 pgs + objects: 21 objects, 2246 bytes + usage: 546 GB used, 384 GB / 931 GB avail + pgs: 16 active+clean + + + 2017-07-24 08:15:11.329298 mon.a mon.0 172.21.9.34:6789/0 23 : cluster [INF] osd.0 172.21.9.34:6806/20527 boot + 2017-07-24 08:15:14.258143 mon.a mon.0 172.21.9.34:6789/0 39 : cluster [INF] Activating manager daemon x + 2017-07-24 08:15:15.446025 mon.a mon.0 172.21.9.34:6789/0 47 : cluster [INF] Manager daemon x is now available + + +In addition to using ``ceph -w`` to print log lines as they are emitted, +use ``ceph log last [n]`` to see the most recent ``n`` lines from the cluster +log. + +Monitoring Health Checks +======================== + +Ceph continously runs various *health checks* against its own status. When +a health check fails, this is reflected in the output of ``ceph status`` (or +``ceph health``). In addition, messages are sent to the cluster log to +indicate when a check fails, and when the cluster recovers. 
+ +For example, when an OSD goes down, the ``health`` section of the status +output may be updated as follows: + +:: + + health: HEALTH_WARN + 1 osds down + Degraded data redundancy: 21/63 objects degraded (33.333%), 16 pgs unclean, 16 pgs degraded + +At this time, cluster log messages are also emitted to record the failure of the +health checks: + +:: + + 2017-07-25 10:08:58.265945 mon.a mon.0 172.21.9.34:6789/0 91 : cluster [WRN] Health check failed: 1 osds down (OSD_DOWN) + 2017-07-25 10:09:01.302624 mon.a mon.0 172.21.9.34:6789/0 94 : cluster [WRN] Health check failed: Degraded data redundancy: 21/63 objects degraded (33.333%), 16 pgs unclean, 16 pgs degraded (PG_DEGRADED) + +When the OSD comes back online, the cluster log records the cluster's return +to a health state: + +:: + + 2017-07-25 10:11:11.526841 mon.a mon.0 172.21.9.34:6789/0 109 : cluster [WRN] Health check update: Degraded data redundancy: 2 pgs unclean, 2 pgs degraded, 2 pgs undersized (PG_DEGRADED) + 2017-07-25 10:11:13.535493 mon.a mon.0 172.21.9.34:6789/0 110 : cluster [INF] Health check cleared: PG_DEGRADED (was: Degraded data redundancy: 2 pgs unclean, 2 pgs degraded, 2 pgs undersized) + 2017-07-25 10:11:13.535577 mon.a mon.0 172.21.9.34:6789/0 111 : cluster [INF] Cluster is now healthy + + +Detecting configuration issues +============================== + +In addition to the health checks that Ceph continuously runs on its +own status, there are some configuration issues that may only be detected +by an external tool. + +Use the `ceph-medic`_ tool to run these additional checks on your Ceph +cluster's configuration. + Checking a Cluster's Usage Stats ================================ @@ -138,33 +217,6 @@ mon_osd_full_ratio. -Checking a Cluster's Status -=========================== - -To check a cluster's status, execute the following:: - - ceph status - -Or:: - - ceph -s - -In interactive mode, type ``status`` and press **Enter**. :: - - ceph> status - -Ceph will print the cluster status. For example, a tiny Ceph cluster consisting -of one monitor, and two OSDs may print the following:: - - cluster b370a29d-9287-4ca3-ab57-3d824f65e339 - health HEALTH_OK - monmap e1: 1 mons at {ceph1=10.0.0.8:6789/0}, election epoch 2, quorum 0 ceph1 - osdmap e63: 2 osds: 2 up, 2 in - pgmap v41332: 952 pgs, 20 pools, 17130 MB data, 2199 objects - 115 GB used, 167 GB / 297 GB avail - 1 active+clean+scrubbing+deep - 951 active+clean - Checking OSD Status =================== @@ -296,3 +348,4 @@ .. _Viewing a Configuration at Runtime: ../../configuration/ceph-conf#ceph-runtime-config .. _Storage Capacity: ../../configuration/mon-config-ref#storage-capacity +.. _ceph-medic: http://docs.ceph.com/ceph-medic/master/ diff -Nru ceph-12.1.1/doc/rados/operations/pg-repair.rst ceph-12.1.2/doc/rados/operations/pg-repair.rst --- ceph-12.1.1/doc/rados/operations/pg-repair.rst 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/pg-repair.rst 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,4 @@ +Repairing PG inconsistencies +============================ + + diff -Nru ceph-12.1.1/doc/rados/operations/pg-states.rst ceph-12.1.2/doc/rados/operations/pg-states.rst --- ceph-12.1.1/doc/rados/operations/pg-states.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/pg-states.rst 2017-08-01 17:55:40.000000000 +0000 @@ -38,11 +38,17 @@ *Recovering* Ceph is migrating/synchronizing objects and their replicas. +*Forced-Recovery* + High recovery priority of that PG is enforced by user. 
+ *Backfill* Ceph is scanning and synchronizing the entire contents of a placement group instead of inferring what contents need to be synchronized from the logs of recent operations. *Backfill* is a special case of recovery. +*Forced-Backfill* + High backfill priority of that PG is enforced by user. + *Wait-backfill* The placement group is waiting in line to start backfill. diff -Nru ceph-12.1.1/doc/rados/operations/placement-groups.rst ceph-12.1.2/doc/rados/operations/placement-groups.rst --- ceph-12.1.1/doc/rados/operations/placement-groups.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/placement-groups.rst 2017-08-01 17:55:40.000000000 +0000 @@ -306,7 +306,7 @@ To set the number of placement groups in a pool, you must specify the number of placement groups at the time you create the pool. -See `Create a Pool`_ for details. Once you've set placement groups for a +See `Create a Pool`_ for details. Once you have set placement groups for a pool, you may increase the number of placement groups (but you cannot decrease the number of placement groups). To increase the number of placement groups, execute the following:: @@ -403,6 +403,36 @@ match, a final semantic sweep ensures that all of the snapshot-related object metadata is consistent. Errors are reported via logs. +Prioritize backfill/recovery of a Placement Group(s) +==================================================== + +You may run into a situation where a bunch of placement groups will require +recovery and/or backfill, and some particular groups hold data more important +than others (for example, those PGs may hold data for images used by running +machines and other PGs may be used by inactive machines/less relevant data). +In that case, you may want to prioritize recovery of those groups so +performance and/or availability of data stored on those groups is restored +earlier. To do this (mark particular placement group(s) as prioritized during +backfill or recovery), execute the following:: + + ceph pg force-recovery {pg-id} [{pg-id #2}] [{pg-id #3} ...] + ceph pg force-backfill {pg-id} [{pg-id #2}] [{pg-id #3} ...] + +This will cause Ceph to perform recovery or backfill on specified placement +groups first, before other placement groups. This does not interrupt currently +ongoing backfills or recovery, but causes specified PGs to be processed +as soon as possible. If you change your mind or prioritize wrong groups, +use:: + + ceph pg cancel-force-recovery {pg-id} [{pg-id #2}] [{pg-id #3} ...] + ceph pg cancel-force-backfill {pg-id} [{pg-id #2}] [{pg-id #3} ...] + +This will remove "force" flag from those PGs and they will be processed +in default order. Again, this doesn't affect currently processed placement +group, only those that are still queued. + +The "force" flag is cleared automatically after recovery or backfill of group +is done. Revert Lost =========== diff -Nru ceph-12.1.1/doc/rados/operations/pools.rst ceph-12.1.2/doc/rados/operations/pools.rst --- ceph-12.1.1/doc/rados/operations/pools.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/pools.rst 2017-08-01 17:55:40.000000000 +0000 @@ -49,6 +49,10 @@ groups in your Ceph configuration file, as the default is NOT ideal. For details on placement group numbers refer to `setting the number of placement groups`_ +.. note:: Starting with Luminous, all pools need to be associated to the + application using the pool. See `Associate Pool to Application`_ below for + more information. 
+ For example:: osd pool default pg num = 100 @@ -153,6 +157,23 @@ :Required: No. :Default: 0, no splitting at the pool creation time. +Associate Pool to Application +============================= + +Pools need to be associated with an application before use. Pools that will be +used with CephFS or pools that are automatically created by RGW are +automatically associated. Pools that are intended for use with RBD should be +initialized using the ``rbd`` tool (see `Block Device Commands`_ for more +information). + +For other cases, you can manually associate a free-form application name to +a pool.:: + + ceph osd pool application enable {pool-name} {application-name} + +.. note:: CephFS uses the application name ``cephfs``, RBD uses the + application name ``rbd``, and RGW uses the application name ``rgw``. + Set Pool Quotas =============== @@ -198,7 +219,7 @@ If you created users with permissions strictly for a pool that no longer exists, you should consider deleting those users too:: - ceph auth list | grep -C 5 {pool-name} + ceph auth ls | grep -C 5 {pool-name} ceph auth del {user} @@ -742,3 +763,5 @@ .. _Bloom Filter: http://en.wikipedia.org/wiki/Bloom_filter .. _setting the number of placement groups: ../placement-groups#set-the-number-of-placement-groups .. _Erasure Coding with Overwrites: ../erasure-code#erasure-coding-with-overwrites +.. _Block Device Commands: ../../../rbd/rados-rbd-cmds/#create-a-block-device-pool + diff -Nru ceph-12.1.1/doc/rados/operations/upmap.rst ceph-12.1.2/doc/rados/operations/upmap.rst --- ceph-12.1.1/doc/rados/operations/upmap.rst 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/upmap.rst 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,75 @@ +Using the pg-upmap +================== + +Starting in Luminous v12.2.z there is a new *pg-upmap* exception table +in the OSDMap that allows the cluster to explicitly map specific PGs to +specific OSDs. This allows the cluster to fine-tune the data +distribution to, in most cases, perfectly distributed PGs across OSDs. + +The key caveat to this new mechanism is that it requires that all +clients understand the new *pg-upmap* structure in the OSDMap. + +Enabling +-------- + +To allow use of the feature, you must tell the cluster that it only +needs to support luminous (and newer) clients with:: + + ceph osd set-require-min-compat-client luminous + +This command will fail if any pre-luminous clients or daemons are +connected to the monitors. You can see what client versions are in +use with:: + + ceph features + +A word of caution +----------------- + +This is a new feature and not very user friendly. At the time of this +writing we are working on a new `balancer` module for ceph-mgr that +will eventually do all of this automatically. + +Until then, + +Offline optimization +-------------------- + +Upmap entries are updated with an offline optimizer built into ``osdmaptool``. + +#. Grab the latest copy of your osdmap:: + + ceph osd getmap -o om + +#. Run the optimizer:: + + osdmaptool om --upmap out.txt [--upmap-pool ] [--upmap-max ] [--upmap-deviation ] + + It is highly recommended that optimization be done for each pool + individually, or for sets of similarly-utilized pools. You can + specify the ``--upmap-pool`` option multiple times. "Similar pools" + means pools that are mapped to the same devices and store the same + kind of data (e.g., RBD image pools, yes; RGW index pool and RGW + data pool, no). + + The ``max-count`` value is the maximum number of upmap entries to + identify in the run. 
The default is 100, but you may want to make + this a smaller number so that the tool completes more quickly (but + does less work). If it cannot find any additional changes to make + it will stop early (i.e., when the pool distribution is perfect). + + The ``max-deviation`` value defaults to `.01` (i.e., 1%). If an OSD + utilization varies from the average by less than this amount it + will be considered perfect. + +#. The proposed changes are written to the output file ``out.txt`` in + the example above. These are normal ceph CLI commands that can be + run to apply the changes to the cluster. This can be done with:: + + source out.txt + +The above steps can be repeated as many times as necessary to achieve +a perfect distribution of PGs for each set of pools. + +You can see some (gory) details about what the tool is doing by +passing ``--debug-osd 10`` to ``osdmaptool``. diff -Nru ceph-12.1.1/doc/rados/operations/user-management.rst ceph-12.1.2/doc/rados/operations/user-management.rst --- ceph-12.1.1/doc/rados/operations/user-management.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/operations/user-management.rst 2017-08-01 17:55:40.000000000 +0000 @@ -98,29 +98,29 @@ Capability syntax follows the form:: - {daemon-type} 'allow {capability}' [{daemon-type} 'allow {capability}'] + {daemon-type} '{capspec}[, {capspec} ...]' - -- **Monitor Caps:** Monitor capabilities include ``r``, ``w``, ``x`` and - ``allow profile {cap}``. For example:: +- **Monitor Caps:** Monitor capabilities include ``r``, ``w``, ``x`` access + settings or ``profile {name}``. For example:: mon 'allow rwx' - mon 'allow profile osd' + mon 'profile osd' -- **OSD Caps:** OSD capabilities include ``r``, ``w``, ``x``, ``class-read``, - ``class-write`` and ``profile osd``. Additionally, OSD capabilities also - allow for pool and namespace settings. :: +- **OSD Caps:** OSD capabilities include ``r``, ``w``, ``x``, ``class-read``, + ``class-write`` access settings or ``profile {name}``. Additionally, OSD + capabilities also allow for pool and namespace settings. :: - osd 'allow {capability}' [pool={poolname}] [namespace={namespace-name}] + osd 'allow {access} [pool={pool-name} [namespace={namespace-name}]]' + osd 'profile {name} [pool={pool-name} [namespace={namespace-name}]]' - **Metadata Server Caps:** Metadata server capability simply requires ``allow``, or blank and does not parse anything further. :: - + mds 'allow' .. note:: The Ceph Object Gateway daemon (``radosgw``) is a client of the - Ceph Storage Cluster, so it isn't represented as a Ceph Storage + Ceph Storage Cluster, so it is not represented as a Ceph Storage Cluster daemon type. The following entries describe each capability. @@ -168,20 +168,20 @@ admin commands. -``profile osd`` +``profile osd`` (Monitor only) :Description: Gives a user permissions to connect as an OSD to other OSDs or monitors. Conferred on OSDs to enable OSDs to handle replication heartbeat traffic and status reporting. -``profile mds`` +``profile mds`` (Monitor only) :Description: Gives a user permissions to connect as a MDS to other MDSs or monitors. -``profile bootstrap-osd`` +``profile bootstrap-osd`` (Monitor only) :Description: Gives a user permissions to bootstrap an OSD. Conferred on deployment tools such as ``ceph-disk``, ``ceph-deploy``, etc. @@ -189,13 +189,23 @@ bootstrapping an OSD. -``profile bootstrap-mds`` +``profile bootstrap-mds`` (Monitor only) :Description: Gives a user permissions to bootstrap a metadata server. 
Conferred on deployment tools such as ``ceph-deploy``, etc. so they have permissions to add keys, etc. when bootstrapping a metadata server. +``profile rbd`` (Monitor and OSD) + +:Description: Gives a user permissions to manipulate RBD images. When used + as a Monitor cap, it provides the minimal privileges required + by an RBD client application. When used as an OSD cap, it + provides read-write access to an RBD client application. + +``profile rbd-read-only`` (OSD only) + +:Description: Gives a user read-only permissions to an RBD image. Pool @@ -218,10 +228,10 @@ namespace. Objects written to a namespace within the pool can only be accessed by users who have access to the namespace. -.. note:: Currently (i.e., ``firefly``), namespaces are only useful for - applications written on top of ``librados``. Ceph clients such as block - device, object storage and file system do not currently support this - feature. +.. note:: Namespaces are primarily useful for applications written on top of + ``librados`` where the logical grouping can alleviate the need to create + different pools. Ceph Object Gateway (from ``luminous``) uses namespaces for various + metadata objects. The rationale for namespaces is that pools can be a computationally expensive method of segregating data sets for the purposes of authorizing separate sets @@ -251,10 +261,10 @@ To list the users in your cluster, execute the following:: - ceph auth list + ceph auth ls Ceph will list out all users in your cluster. For example, in a two-node -exemplary cluster, ``ceph auth list`` will output something that looks like +exemplary cluster, ``ceph auth ls`` will output something that looks like this:: installed auth entries: @@ -286,7 +296,7 @@ Note also that each entry has a ``key: `` entry, and one or more ``caps:`` entries. -You may use the ``-o {filename}`` option with ``ceph auth list`` to +You may use the ``-o {filename}`` option with ``ceph auth ls`` to save the output to a file. @@ -308,7 +318,7 @@ ceph auth export {TYPE.ID} The ``auth export`` command is identical to ``auth get``, but also prints -out the internal ``auid``, which isn't relevant to end users. +out the internal ``auid``, which is not relevant to end users. diff -Nru ceph-12.1.1/doc/rados/troubleshooting/troubleshooting-mon.rst ceph-12.1.2/doc/rados/troubleshooting/troubleshooting-mon.rst --- ceph-12.1.1/doc/rados/troubleshooting/troubleshooting-mon.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/troubleshooting/troubleshooting-mon.rst 2017-08-01 17:55:40.000000000 +0000 @@ -172,7 +172,7 @@ Second, make sure you are able to connect to ``mon.a``'s server from the other monitors' servers. Check the ports as well. Check ``iptables`` on - all your monitor nodes and make sure you're not dropping/rejecting + all your monitor nodes and make sure you are not dropping/rejecting connections. If this initial troubleshooting doesn't solve your problems, then it's @@ -204,7 +204,7 @@ If you have a quorum, however, the monitor should be able to find the remaining monitors pretty fast, as long as they can be reached. If your - monitor is stuck probing and you've gone through with all the communication + monitor is stuck probing and you have gone through with all the communication troubleshooting, then there is a fair chance that the monitor is trying to reach the other monitors on a wrong address. ``mon_status`` outputs the ``monmap`` known to the monitor: check if the other monitor's locations @@ -224,7 +224,7 @@ `Clock Skews`_ for more infos on that. 
If all your clocks are properly synchronized, it is best if you prepare some logs and reach out to the community. This is not a state that is likely to persist and aside from - (*really*) old bugs there isn't an obvious reason besides clock skews on + (*really*) old bugs there is not an obvious reason besides clock skews on why this would happen. What if state is ``synchronizing``? @@ -246,7 +246,7 @@ What if state is ``leader`` or ``peon``? This should not happen. There is a chance this might happen however, and - it has a lot to do with clock skews -- see `Clock Skews`_. If you're not + it has a lot to do with clock skews -- see `Clock Skews`_. If you are not suffering from clock skews, then please prepare your logs (see `Preparing your logs`_) and reach out to us. @@ -300,12 +300,12 @@ $ ceph mon getmap -o /tmp/monmap 2. No quorum? Grab the monmap directly from another monitor (this - assumes the monitor you're grabbing the monmap from has id ID-FOO + assumes the monitor you are grabbing the monmap from has id ID-FOO and has been stopped):: $ ceph-mon -i ID-FOO --extract-monmap /tmp/monmap - 3. Stop the monitor you're going to inject the monmap into. + 3. Stop the monitor you are going to inject the monmap into. 4. Inject the monmap:: @@ -434,9 +434,9 @@ # keyring with the caps, and there is no need to pass the "--keyring" option. # i.e. just use "ceph-monstore-tool /tmp/mon-store rebuild" instead ceph-authtool /path/to/admin.keyring -n mon. \ - --cap mon allow 'allow *' + --cap mon 'allow *' ceph-authtool /path/to/admin.keyring -n client.admin \ - --cap mon allow 'allow *' --cap osd 'allow *' --cap mds 'allow *' + --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow *' ceph-monstore-tool /tmp/mon-store rebuild -- --keyring /path/to/admin.keyring # backup corrupted store.db just in case mv /var/lib/ceph/mon/mon.0/store.db /var/lib/ceph/mon/mon.0/store.db.corrupted diff -Nru ceph-12.1.1/doc/rados/troubleshooting/troubleshooting-osd.rst ceph-12.1.2/doc/rados/troubleshooting/troubleshooting-osd.rst --- ceph-12.1.1/doc/rados/troubleshooting/troubleshooting-osd.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/troubleshooting/troubleshooting-osd.rst 2017-08-01 17:55:40.000000000 +0000 @@ -162,7 +162,7 @@ to ensure you have addressed any issues related to your kernel. - **Segment Fault:** If there is a segment fault, turn your logging up - (if it isn't already), and try again. If it segment faults again, + (if it is not already), and try again. If it segment faults again, contact the ceph-devel email list and provide your Ceph configuration file, your monitor output and the contents of your log file(s). @@ -256,7 +256,7 @@ .. tip:: Newer versions of Ceph provide better recovery handling by preventing recovering OSDs from using up system resources so that ``up`` and ``in`` - OSDs aren't available or are otherwise slow. + OSDs are not available or are otherwise slow. Networking Issues @@ -479,7 +479,7 @@ We recommend using both a public (front-end) network and a cluster (back-end) network so that you can better meet the capacity requirements of object replication. Another advantage is that you can run a cluster network such that -it isn't connected to the internet, thereby preventing some denial of service +it is not connected to the internet, thereby preventing some denial of service attacks. When OSDs peer and check heartbeats, they use the cluster (back-end) network when it's available. See `Monitor/OSD Interaction`_ for details. 
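
As a hedged illustration of that public/cluster split (the subnets below are placeholders, not recommendations), the two networks are declared in ``ceph.conf`` along these lines::

    [global]
        # front-side network used by clients and monitors
        public network = 10.0.0.0/24
        # back-side network used for replication and heartbeat traffic between OSDs
        cluster network = 10.0.1.0/24

Keeping the cluster network on non-routable address space is one straightforward way to achieve the isolation described above.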
diff -Nru ceph-12.1.1/doc/rados/troubleshooting/troubleshooting-pg.rst ceph-12.1.2/doc/rados/troubleshooting/troubleshooting-pg.rst --- ceph-12.1.1/doc/rados/troubleshooting/troubleshooting-pg.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rados/troubleshooting/troubleshooting-pg.rst 2017-08-01 17:55:40.000000000 +0000 @@ -49,7 +49,7 @@ Fewer OSDs than Replicas ------------------------ -If you've brought up two OSDs to an ``up`` and ``in`` state, but you still +If you have brought up two OSDs to an ``up`` and ``in`` state, but you still don't see ``active + clean`` placement groups, you may have an ``osd pool default size`` set to greater than ``2``. @@ -76,7 +76,7 @@ ``ceph pg dump``), you can force the first OSD to notice the placement groups it needs by running:: - ceph pg force_create_pg + ceph osd force-create-pg CRUSH Map Errors @@ -328,7 +328,7 @@ mapped to OSDs, a small number of placement groups will not distribute across your cluster. Try creating a pool with a placement group count that is a multiple of the number of OSDs. See `Placement Groups`_ for details. The default -placement group count for pools isn't useful, but you can change it `here`_. +placement group count for pools is not useful, but you can change it `here`_. Can't Write Data diff -Nru ceph-12.1.1/doc/radosgw/multitenancy.rst ceph-12.1.2/doc/radosgw/multitenancy.rst --- ceph-12.1.1/doc/radosgw/multitenancy.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/radosgw/multitenancy.rst 2017-08-01 17:55:40.000000000 +0000 @@ -38,7 +38,9 @@ # radosgw-admin --tenant testx --uid tester --display-name "Test User" --subuser tester:test --key-type swift --access full user create # radosgw-admin --subuser 'testx$tester:test' --key-type swift --secret test123 -Note that the subuser with explicit tenant had to be quoted in the shell. +.. note:: The subuser with explicit tenant has to be quoted in the shell. + + Tenant names may contain only alphanumeric characters and underscores. Accessing Buckets with Explicit Tenants ======================================= diff -Nru ceph-12.1.1/doc/radosgw/s3/bucketops.rst ceph-12.1.2/doc/radosgw/s3/bucketops.rst --- ceph-12.1.1/doc/radosgw/s3/bucketops.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/radosgw/s3/bucketops.rst 2017-08-01 17:55:40.000000000 +0000 @@ -292,7 +292,7 @@ +------------------------+-----------+--------------------------------------------------------------------------------------+ | ``max-uploads`` | Integer | The maximum number of multipart uploads. The range from 1-1000. The default is 1000. | +------------------------+-----------+--------------------------------------------------------------------------------------+ -| ``upload-id-marker`` | String | Ignored if ``key-marker`` isn't specified. Specifies the ``ID`` of first | +| ``upload-id-marker`` | String | Ignored if ``key-marker`` is not specified. Specifies the ``ID`` of first | | | | upload to list in lexicographical order at or following the ``ID``. | +------------------------+-----------+--------------------------------------------------------------------------------------+ @@ -374,4 +374,4 @@ | ``VersioningConfiguration`` | Container | A container for the request. | +-----------------------------+-----------+---------------------------------------------------------------------------+ | ``Status`` | String | Sets the versioning state of the bucket. 
Valid Values: Suspended/Enabled | -+-----------------------------+-----------+---------------------------------------------------------------------------+ \ No newline at end of file ++-----------------------------+-----------+---------------------------------------------------------------------------+ diff -Nru ceph-12.1.1/doc/radosgw/s3/perl.rst ceph-12.1.2/doc/radosgw/s3/perl.rst --- ceph-12.1.1/doc/radosgw/s3/perl.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/radosgw/s3/perl.rst 2017-08-01 17:55:40.000000000 +0000 @@ -155,9 +155,9 @@ .. note:: The `Amazon::S3`_ module does not have a way to generate download - URLs, so we're going to be using another module instead. Unfortunately, + URLs, so we are going to be using another module instead. Unfortunately, most modules for generating these URLs assume that you are using Amazon, - so we've had to go with using a more obscure module, `Muck::FS::S3`_. This + so we have had to go with using a more obscure module, `Muck::FS::S3`_. This should be the same as Amazon's sample S3 perl module, but this sample module is not in CPAN. So, you can either use CPAN to install `Muck::FS::S3`_, or install Amazon's sample S3 module manually. If you go diff -Nru ceph-12.1.1/doc/radosgw/swift/containerops.rst ceph-12.1.2/doc/radosgw/swift/containerops.rst --- ceph-12.1.1/doc/radosgw/swift/containerops.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/radosgw/swift/containerops.rst 2017-08-01 17:55:40.000000000 +0000 @@ -257,7 +257,7 @@ To delete a container, make a ``DELETE`` request with the API version, account, and the name of the container. The container must be empty. If you'd like to check if the container is empty, execute a ``HEAD`` request against the container. Once -you've successfully removed the container, you'll be able to reuse the container name. +you have successfully removed the container, you will be able to reuse the container name. Syntax ~~~~~~ diff -Nru ceph-12.1.1/doc/radosgw/swift/objectops.rst ceph-12.1.2/doc/radosgw/swift/objectops.rst --- ceph-12.1.1/doc/radosgw/swift/objectops.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/radosgw/swift/objectops.rst 2017-08-01 17:55:40.000000000 +0000 @@ -146,7 +146,7 @@ To delete an object, make a ``DELETE`` request with the API version, account, container and object name. You must have write permissions on the container to delete -an object within it. Once you've successfully deleted the object, you'll be able to +an object within it. Once you have successfully deleted the object, you will be able to reuse the object name. Syntax diff -Nru ceph-12.1.1/doc/rbd/librbdpy.rst ceph-12.1.2/doc/rbd/librbdpy.rst --- ceph-12.1.1/doc/rbd/librbdpy.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rbd/librbdpy.rst 2017-08-01 17:55:40.000000000 +0000 @@ -34,7 +34,7 @@ cannot be :type:unicode - `Librbd` does not know how to deal with characters wider than a :c:type:char. -In the end, you'll want to close the image, the IO context and the connection to RADOS:: +In the end, you will want to close the image, the IO context and the connection to RADOS:: image.close() ioctx.close() diff -Nru ceph-12.1.1/doc/rbd/libvirt.rst ceph-12.1.2/doc/rbd/libvirt.rst --- ceph-12.1.1/doc/rbd/libvirt.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rbd/libvirt.rst 2017-08-01 17:55:40.000000000 +0000 @@ -47,7 +47,7 @@ To create VMs that use Ceph block devices, use the procedures in the following -sections. 
In the exemplary embodiment, we've used ``libvirt-pool`` for the pool +sections. In the exemplary embodiment, we have used ``libvirt-pool`` for the pool name, ``client.libvirt`` for the user name, and ``new-libvirt-image`` for the image name. You may use any value you like, but ensure you replace those values when executing commands in the subsequent procedures. @@ -58,7 +58,7 @@ To configure Ceph for use with ``libvirt``, perform the following steps: -#. `Create a pool`_ (or use the default). The following example uses the +#. `Create a pool`_. The following example uses the pool name ``libvirt-pool`` with 128 placement groups. :: ceph osd pool create libvirt-pool 128 128 @@ -67,15 +67,19 @@ ceph osd lspools -#. `Create a Ceph User`_ (or use ``client.admin`` for version 0.9.7 and - earlier). The following example uses the Ceph user name ``client.libvirt`` +#. Use the ``rbd`` tool to initialize the pool for use by RBD:: + + rbd pool init + +#. `Create a Ceph User`_ (or use ``client.admin`` for version 0.9.7 and + earlier). The following example uses the Ceph user name ``client.libvirt`` and references ``libvirt-pool``. :: - ceph auth get-or-create client.libvirt mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool=libvirt-pool' + ceph auth get-or-create client.libvirt mon 'profile rbd' osd 'profile rbd pool=libvirt-pool' Verify the name exists. :: - ceph auth list + ceph auth ls **NOTE**: ``libvirt`` will access Ceph using the ID ``libvirt``, not the Ceph name ``client.libvirt``. See `User Management - User`_ and diff -Nru ceph-12.1.1/doc/rbd/rados-rbd-cmds.rst ceph-12.1.2/doc/rbd/rados-rbd-cmds.rst --- ceph-12.1.1/doc/rbd/rados-rbd-cmds.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rbd/rados-rbd-cmds.rst 2017-08-01 17:55:40.000000000 +0000 @@ -13,6 +13,41 @@ .. important:: To use Ceph Block Device commands, you must have access to a running Ceph cluster. +Create a Block Device Pool +========================== + +#. On the admin node, use the ``ceph`` tool to `create a pool`_. + +#. On the admin node, use the ``rbd`` tool to initialize the pool for use by RBD:: + + rbd pool init + +.. note:: The ``rbd`` tool assumes a default pool name of 'rbd' when not + provided. + +Create a Block Device User +========================== + +Unless specified, the ``rbd`` command will access the Ceph cluster using the ID +``admin``. This ID allows full administrative access to the cluster. It is +recommended that you utilize a more restricted user wherever possible. + +To `create a Ceph user`_, with ``ceph`` specify the ``auth get-or-create`` +command, user name, monitor caps, and OSD caps:: + + ceph auth get-or-create client.{ID} mon 'profile rbd' osd 'profile {profile name} [pool={pool-name}][, profile ...]' + +For example, to create a user ID named ``qemu`` with read-write access to the +pool ``vms`` and read-only access to the pool ``images``, execute the +following:: + + ceph auth get-or-create client.qemu mon 'profile rbd' osd 'profile rbd pool=vms, profile rbd-read-only pool=images' + +The output from the ``ceph auth get-or-create`` command will be the keyring for +the specified user, which can be written to ``/etc/ceph/ceph.client.{ID}.keyring``. + +.. note:: The user ID can be specified when using the ``rbd`` command by + providing the ``--id {id}`` optional argument. 
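
For example, a brief usage sketch with the ``qemu`` user created above (the ``vms`` and ``images`` pools are the ones from that example, and the keyring is assumed to sit in the default ``/etc/ceph/ceph.client.qemu.keyring`` location)::

    rbd --id qemu ls vms
    rbd --id qemu ls images

The first command lists images in the read-write pool, the second in the read-only pool; a write attempt against ``images`` would be denied by the OSD caps.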
Creating a Block Device Image ============================= @@ -22,7 +57,7 @@ the following:: rbd create --size {megabytes} {pool-name}/{image-name} - + For example, to create a 1GB image named ``bar`` that stores information in a pool named ``swimmingpool``, execute the following:: @@ -53,7 +88,21 @@ For example:: rbd ls swimmingpool - + +To list deferred delete block devices in the ``rbd`` pool, execute the +following:: + + rbd trash ls + +To list deferred delete block devices in a particular pool, execute the +following, but replace ``{poolname}`` with the name of the pool:: + + rbd trash ls {poolname} + +For example:: + + rbd trash ls swimmingpool + Retrieving Image Information ============================ @@ -96,22 +145,79 @@ with the name of the image you want to remove:: rbd rm {image-name} - + For example:: rbd rm foo - + To remove a block device from a pool, execute the following, but replace ``{image-name}`` with the name of the image to remove and replace ``{pool-name}`` with the name of the pool:: rbd rm {pool-name}/{image-name} - + For example:: rbd rm swimmingpool/bar +To defer delete a block device from a pool, execute the following, but +replace ``{image-name}`` with the name of the image to move and replace +``{pool-name}`` with the name of the pool:: + + rbd trash mv {pool-name}/{image-name} + +For example:: + + rbd trash mv swimmingpool/bar + +To remove a deferred block device from a pool, execute the following, but +replace ``{image-id}`` with the id of the image to remove and replace +``{pool-name}`` with the name of the pool:: + + rbd trash rm {pool-name}/{image-id} + +For example:: + + rbd trash rm swimmingpool/2bf4474b0dc51 + +.. note:: + + * You can move an image to the trash even it has shapshot(s) or actively + in-use by clones, but can not be removed from trash. + + * You can use *--delay* to set the defer time (default is 0), and if its + deferment time has not expired, it can not be removed unless you use + force. + +Restoring a Block Device Image +============================== + +To restore a deferred delete block device in the rbd pool, execute the +following, but replace ``{image-id}`` with the id of the image:: + + rbd trash restore {image-d} + +For example:: + + rbd trash restore 2bf4474b0dc51 + +To restore a deferred delete block device in a particular pool, execute +the following, but replace ``{image-id}`` with the id of the image and +replace ``{pool-name}`` with the name of the pool:: + + rbd trash restore {pool-name}/{image-id} + +For example:: + + rbd trash restore swimmingpool/2bf4474b0dc51 + +Also you can use *--image* to rename the iamge when restore it, for +example:: + + rbd trash restore swimmingpool/2bf4474b0dc51 --image new-name +.. _create a pool: ../../rados/operations/pools/#create-a-pool .. _Storage Pools: ../../rados/operations/pools .. _RBD – Manage RADOS Block Device (RBD) Images: ../../man/8/rbd/ +.. _create a Ceph user: ../../rados/operations/user-management#add-a-user diff -Nru ceph-12.1.1/doc/rbd/rbd-cloudstack.rst ceph-12.1.2/doc/rbd/rbd-cloudstack.rst --- ceph-12.1.1/doc/rbd/rbd-cloudstack.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rbd/rbd-cloudstack.rst 2017-08-01 17:55:40.000000000 +0000 @@ -68,6 +68,11 @@ for your pools, and `Placement Groups`_ for details on the number of placement groups you should set for your pools. +A newly created pool must initialized prior to use. 
Use the ``rbd`` tool +to initialize the pool:: + + rbd pool init cloudstack + Create a Ceph User ================== @@ -76,7 +81,7 @@ use ``client.admin`` for this, it's recommended to create a user with only access to the ``cloudstack`` pool. :: - ceph auth get-or-create client.cloudstack mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool=cloudstack' + ceph auth get-or-create client.cloudstack mon 'profile rbd' osd 'profile rbd pool=cloudstack' Use the information returned by the command in the next step when adding the Primary Storage. diff -Nru ceph-12.1.1/doc/rbd/rbd-openstack.rst ceph-12.1.2/doc/rbd/rbd-openstack.rst --- ceph-12.1.1/doc/rbd/rbd-openstack.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/rbd/rbd-openstack.rst 2017-08-01 17:55:40.000000000 +0000 @@ -90,6 +90,14 @@ your pools, and `Placement Groups`_ for details on the number of placement groups you should set for your pools. +Newly created pools must initialized prior to use. Use the ``rbd`` tool +to initialize the pools:: + + rbd pool init volumes + rbd pool init images + rbd pool init backups + rbd pool init vms + .. _Create a Pool: ../../rados/operations/pools#createpool .. _Placement Groups: ../../rados/operations/placement-groups @@ -106,7 +114,7 @@ Install Ceph client packages ---------------------------- -On the ``glance-api`` node, you'll need the Python bindings for ``librbd``:: +On the ``glance-api`` node, you will need the Python bindings for ``librbd``:: sudo apt-get install python-rbd sudo yum install python-rbd @@ -124,17 +132,9 @@ If you have `cephx authentication`_ enabled, create a new user for Nova/Cinder and Glance. Execute the following:: - ceph auth get-or-create client.glance mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool=images' - ceph auth get-or-create client.cinder-backup mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool=backups' - -If you run an OpenStack version before Mitaka, create the following ``client.cinder`` key:: - - ceph auth get-or-create client.cinder mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool=volumes, allow rwx pool=vms, allow rx pool=images' - -Since Mitaka introduced the support of RBD snapshots while doing a snapshot of a Nova instance, -we need to allow the ``client.cinder`` key write access to the ``images`` pool; therefore, create the following key:: - - ceph auth get-or-create client.cinder mon 'allow r' osd 'allow class-read object_prefix rbd_children, allow rwx pool=volumes, allow rwx pool=vms, allow rwx pool=images' + ceph auth get-or-create client.glance mon 'profile rbd' osd 'profile rbd pool=images' + ceph auth get-or-create client.cinder mon 'profile rbd' osd 'profile rbd pool=volumes, profile rbd pool=vms, profile rbd pool=images' + ceph auth get-or-create client.cinder-backup mon 'profile rbd' osd 'profile rbd pool=backups' Add the keyrings for ``client.cinder``, ``client.glance``, and ``client.cinder-backup`` to the appropriate nodes and change their ownership:: @@ -304,7 +304,7 @@ rados_connect_timeout = -1 glance_api_version = 2 -If you're using `cephx authentication`_, also configure the user and uuid of +If you are using `cephx authentication`_, also configure the user and uuid of the secret you added to ``libvirt`` as documented earlier:: [ceph] diff -Nru ceph-12.1.1/doc/release-notes.rst ceph-12.1.2/doc/release-notes.rst --- ceph-12.1.1/doc/release-notes.rst 2017-07-17 16:56:02.000000000 +0000 +++ 
ceph-12.1.2/doc/release-notes.rst 2017-08-01 17:55:40.000000000 +0000 @@ -19,7 +19,7 @@ - *General*: * Ceph now has a simple, built-in web-based dashboard for monitoring - cluster status. FIXME DOCS. + cluster status. See :doc:`/mgr/dashboard/`. - *RADOS*: @@ -37,7 +37,7 @@ BlueStore for performance reasons.) FIXME DOCS * *Erasure coded* pools now have full support for *overwrites*, - allowing them to be used with RBD and CephFS. `Read more about EC overwrites`_. + allowing them to be used with RBD and CephFS. See :doc:`/rados/operations/erasure-code/#erasure-coding-with-overwrites`. * *ceph-mgr*: @@ -48,32 +48,36 @@ *ceph-mgr* for reliability. See the notes on `Upgrading`_ below. - The *ceph-mgr* daemon includes a REST-based management API. The API is still experimental and somewhat limited but will form the basis - for API-based management of Ceph going forward. FIXME DOCS + for API-based management of Ceph going forward. See :doc:`/mgr/restful`. + - *ceph-mgr* also includes a Prometheus exporter plugin, which can + provide Ceph perfcounters to Prometheus. See :doc:`/mgr/prometheus`. * The overall *scalability* of the cluster has improved. We have successfully tested clusters with up to 10,000 OSDs. - * Each OSD can now have a *device class* associated with it (e.g., `hdd` or - `ssd`), allowing CRUSH rules to trivially map data to a subset of devices - in the system. Manually writing CRUSH rules or manual editing of the CRUSH - is normally not required. FIXME DOCS - * You can now *optimize CRUSH weights* can now be optimized to - maintain a *near-perfect distribution of data* across OSDs. FIXME DOCS + * Each OSD can now have a *device class* associated with it (e.g., + `hdd` or `ssd`), allowing CRUSH rules to trivially map data to a + subset of devices in the system. Manually writing CRUSH rules or + manual editing of the CRUSH is normally not required. See + :doc:`/rados/operations/crush-map/#crush-structure`. + * You can now *optimize CRUSH weights* to maintain a *near-perfect + distribution of data* across OSDs. FIXME DOCS * There is also a new `upmap` exception mechanism that allows individual PGs to be moved around to achieve a *perfect - distribution* (this requires luminous clients). FIXME DOCS + distribution* (this requires luminous clients). See + :doc:`/rados/operations/upmap`. * Each OSD now adjusts its default configuration based on whether the backing device is an HDD or SSD. Manual tuning generally not required. - * The prototype *mclock QoS queueing algorithm* is now available. FIXME DOCS + * The prototype `mClock QoS queueing algorithm ` is now available. * There is now a *backoff* mechanism that prevents OSDs from being overloaded by requests to objects or PGs that are not currently able to process IO. - * There is a *simplified OSD replacement process* that is more robust. FIXME DOCS + * There is a simplified OSD replacement process that is more robust (see :doc:`/rados/operations/add-or-rm-osds/#replacing-an-osd`). * You can query the supported features and (apparent) releases of - all connected daemons and clients with ``ceph features``. FIXME DOCS + all connected daemons and clients with `ceph features `_. * You can configure the oldest Ceph client version you wish to allow to connect to the cluster via ``ceph osd set-require-min-compat-client`` and Ceph will prevent you from enabling features that will break compatibility - with those clients. FIXME DOCS + with those clients. 
* Several `sleep` settings, include ``osd_recovery_sleep``, ``osd_snap_trim_sleep``, and ``osd_scrub_sleep`` have been reimplemented to work efficiently. (These are used in some cases @@ -127,7 +131,7 @@ * Improved discard handling when the object map feature is enabled. * rbd CLI ``import`` and ``copy`` commands now detect sparse and preserve sparse regions. - * Images and Snapshots will now include a creation timestamp + * Images and Snapshots will now include a creation timestamp. - *CephFS*: @@ -158,17 +162,20 @@ * *CLI changes*: - The ``ceph -s`` or ``ceph status`` command has a fresh look. - - ``ceph {osd,mds,mon} versions`` summarizes versions of running daemons. - - ``ceph {osd,mds,mon} count-metadata `` similarly + - ``ceph mgr metadata`` will dump metadata associated with each mgr + daemon. + - ``ceph versions`` or ``ceph {osd,mds,mon,mgr} versions`` + summarize versions of running daemons. + - ``ceph {osd,mds,mon,mgr} count-metadata `` similarly tabulates any other daemon metadata visible via the ``ceph - {osd,mds,mon} metadata`` commands. + {osd,mds,mon,mgr} metadata`` commands. - ``ceph features`` summarizes features and releases of connected clients and daemons. - ``ceph osd require-osd-release `` replaces the old ``require_RELEASE_osds`` flags. - ``ceph osd pg-upmap``, ``ceph osd rm-pg-upmap``, ``ceph osd pg-upmap-items``, ``ceph osd rm-pg-upmap-items`` can explicitly - manage `upmap` items (FIXME DOCS). + manage `upmap` items (see :doc:`/rados/operations/upmap`). - ``ceph osd getcrushmap`` returns a crush map version number on stderr, and ``ceph osd setcrushmap [version]`` will only inject an updated crush map if the version matches. This allows crush @@ -190,7 +197,7 @@ for applying changes to entire subtrees. For example, ``ceph osd down `ceph osd ls-tree rack1```. - ``ceph osd {add,rm}-{noout,noin,nodown,noup}`` allow the - `noout`, `nodown`, `noin`, and `noup` flags to be applied to + `noout`, `noin`, `nodown`, and `noup` flags to be applied to specific OSDs. - ``ceph log last [n]`` will output the last *n* lines of the cluster log. @@ -216,6 +223,9 @@ - ``ceph config-key dump`` dumps config-key entries and their contents. (The existing ``ceph config-key list`` only dumps the key names, not the values.) + - ``ceph config-key list`` is deprecated in favor of ``ceph config-key ls``. + - ``ceph auth list`` is deprecated in favor of ``ceph auth ls``. + - ``ceph osd crush rule list`` is deprecated in favor of ``ceph osd crush rule ls``. - ``ceph osd set-{full,nearfull,backfillfull}-ratio`` sets the cluster-wide ratio for various full thresholds (when the cluster refuses IO, when the cluster warns about being close to full, @@ -224,9 +234,14 @@ - ``ceph osd reweightn`` will specify the `reweight` values for multiple OSDs in a single command. This is equivalent to a series of ``ceph osd reweight`` commands. - - ``ceph osd crush class {create,rm,ls,rename}`` manage the new + - ``ceph osd crush class {rm,ls,ls-osd}`` manage the new CRUSH *device class* feature. ``ceph crush set-device-class [...]`` will set the class for particular devices. + Note that if you specify a non-existent class, it will be created + automatically. ``ceph crush rm-device-class [...]`` + will instead remove the class for particular devices. + And if a class contains no more devices, it will be automatically + destoryed. - ``ceph osd crush rule create-replicated`` replaces the old ``ceph osd crush rule create-simple`` command to create a CRUSH rule for a replicated pool. 
Notably it takes a `class` argument @@ -237,8 +252,6 @@ these exist yet). - ``ceph tell help`` will now return a usage summary. -.. _Read more about EC overwrites: ../rados/operations/erasure-code/#erasure-coding-with-overwrites - Major Changes from Jewel ------------------------ @@ -311,6 +324,10 @@ #. Do not create any new erasure-code pools while upgrading the monitors. +#. You can monitor the progress of your upgrade at each stage with the + ``ceph versions`` command, which will tell you what ceph version is + running for each type of daemon. + #. Set the ``noout`` flag for the duration of the upgrade. (Optional but recommended.):: @@ -318,7 +335,7 @@ #. Upgrade monitors by installing the new packages and restarting the monitor daemons. Note that, unlike prior releases, the ceph-mon - daemons *must* be upgraded first.:: + daemons *must* be upgraded first:: # systemctl restart ceph-mon.target @@ -342,7 +359,7 @@ If you are upgrading from kraken, you may already have ceph-mgr daemons deployed. If not, or if you are upgrading from jewel, you can deploy new daemons with tools like ceph-deploy or ceph-ansible. - For example,:: + For example:: # ceph-deploy mgr create HOST @@ -357,12 +374,12 @@ ... #. Upgrade all OSDs by installing the new packages and restarting the - ceph-osd daemons on all hosts.:: + ceph-osd daemons on all hosts:: # systemctl restart ceph-osd.target You can monitor the progress of the OSD upgrades with the new - ``ceph osd versions`` command.:: + ``ceph versions`` or ``ceph osd versions`` command:: # ceph osd versions { @@ -371,12 +388,12 @@ } #. Upgrade all CephFS daemons by upgrading packages and restarting - daemons on all hosts.:: + daemons on all hosts:: # systemctl restart ceph-mds.target #. Upgrade all radosgw daemons by upgrading packages and restarting - daemons on all hosts.:: + daemons on all hosts:: # systemctl restart radosgw.target @@ -420,6 +437,9 @@ (when the ``ceph osd require-osd-release luminous`` command is run) but any provisioning tools that create erasure coded pools may need to be updated. +* The structure of the XML output for ``osd crush tree`` has changed + slightly to better match the ``osd tree`` output. The top level + structure is now ``nodes`` instead of ``crush_map_roots``. * When assigning a network to the public network and not to the cluster network the network specification of the public network will be used for the cluster network as well. @@ -1011,7 +1031,7 @@ * bluestore: os/bluestore: fix typo(s/trasnaction/transaction/) (`pr#14890 `_, xie xingguo) * bluestore: os/bluestore: fix use after free race with aio_wait (`pr#14956 `_, Sage Weil) * bluestore: os/bluestore: pre-calculate number of ghost buffers to evict (`pr#15029 `_, xie xingguo) -* bluestore: os/bluestore: Record l_bluestore_state_kv_queued_lat for sync_submit_… (`pr#14448 `_, Jianpeng Ma) +* bluestore: os/bluestore: Record l_bluestore_state_kv_queued_lat for sync\_submit\_… (`pr#14448 `_, Jianpeng Ma) * bluestore: os/bluestore: Remove ExtentFreeListManager. 
(`pr#14772 `_, Jianpeng Ma) * bluestore: os/bluestore: remove unused condition variable (`pr#14973 `_, Igor Fedotov) * bluestore: os/bluestore: rename/fix throttle options (`pr#14717 `_, Sage Weil) @@ -1785,7 +1805,7 @@ * crush: add devices class that rules can use as a filter (`issue#18943 `_, `pr#13444 `_, Loic Dachary) * crush: add --dump to crushtool (`pr#13726 `_, Loic Dachary) * crush: allow uniform buckets with no items (`pr#13521 `_, Loic Dachary) -* crush: document tunables and rule step set_ (`pr#13722 `_, Loic Dachary) +* crush: document tunables and rule step set\_ (`pr#13722 `_, Loic Dachary) * crush: do is_out test only if we do not collide (`pr#13326 `_, xie xingguo) * crush: fix dprintk compilation (`pr#13424 `_, Loic Dachary) * debian: Add missing tp files in deb packaging (`pr#13526 `_, Ganesh Mahalingam) @@ -2310,7 +2330,7 @@ * cleanup: common/config: fix return type of string::find and use string::npos (`pr#9924 `_, Yan Jun) * cleanup: common/config_opts.h: remove obsolete configuration option (`pr#12659 `_, Li Wang) * cleanup,common: global: we need to handle the init_on_startup return value when global_init. (`pr#13018 `_, song baisen) -* cleanup,common: msg/async: assert if compiled code doesn't support the configured ms_… (`pr#12559 `_, Avner BenHanoch) +* cleanup,common: msg/async: assert if compiled code doesn't support the configured ms\_… (`pr#12559 `_, Avner BenHanoch) * cleanup,common: msg/async/rdma: clean line endings (`pr#12688 `_, Adir Lev) * cleanup,common: msg/async/rdma: Remove compilation warning (`pr#13142 `_, Sarit Zubakov) * cleanup,common: osd/OSDMap: get_previous_up_osd_before() may run into endless loop (`pr#12976 `_, Mingxin Liu) @@ -2839,7 +2859,7 @@ * The 'ceph osd perf' command will display 'commit_latency(ms)' and 'apply_latency(ms)'. Previously, the names of these two columns are 'fs_commit_latency(ms)' and 'fs_apply_latency(ms)'. We remove the - prefix 'fs_', because they are not filestore specific. + prefix 'fs\_', because they are not filestore specific. * Monitors will no longer allow pools to be removed by default. The setting mon_allow_pool_delete has to be set to true (defaults to @@ -3031,7 +3051,7 @@ * build/ops: rpm: Remove trailing whitespace in usermod command (SUSE) (`pr#10707 `_, Tim Serong) * build/ops: scripts/release-notes: allow title guesses from gh tags & description update (`pr#11399 `_, Abhishek Lekshmanan) * build/ops: systemd: Fix startup of ceph-mgr on Debian 8 (`pr#12555 `_, Mark Korenberg) -* build/ops: tracing/objectstore.tp: add missing move_ranges_... tp (`pr#11484 `_, Sage Weil) +* build/ops: tracing/objectstore.tp: add missing move_ranges\_... 
tp (`pr#11484 `_, Sage Weil) * build/ops: upstart: fix ceph-crush-location default (`issue#6698 `_, `pr#803 `_, Jason Dillaman) * build/ops: upstart: start ceph-all after static-network-up (`issue#17689 `_, `pr#11631 `_, Billy Olsen) * cephfs: add gid to asok status (`pr#11487 `_, Patrick Donnelly) @@ -3509,7 +3529,7 @@ * osd: print log when osd want to kill self (`pr#9288 `_, Haomai Wang) * osd: Remove extra call to reg_next_scrub() during splits (`issue#16474 `_, `pr#11206 `_, David Zafman) * osd: remove redudant call of heartbeat_check (`pr#12130 `_, Pan Liu) -* osd: remove the lock heartbeat_update_lock, and change heatbeat_need_… (`pr#12461 `_, Pan Liu) +* osd: remove the lock heartbeat_update_lock, and change heatbeat_need\_… (`pr#12461 `_, Pan Liu) * osd: remove the redundant clear method in consume_map function (`pr#10553 `_, song baisen) * osd: Remove unused '_lsb_release_' declarations (`pr#11364 `_, Brad Hubbard) * osd: replace hb_out and hb_in with a single hb_peers (`issue#18057 `_, `pr#12178 `_, Pan Liu) @@ -4577,7 +4597,7 @@ * rgw: merge setting flags operation together and cleanups (`pr#10203 `_, Yan Jun) * rgw: miscellaneous cleanups (`pr#10299 `_, Yan Jun) * rgw: multiple fixes for Swift's object expiration (`issue#16705 `_, `issue#16684 `_, `pr#10330 `_, Radoslaw Zarzynski) -* rgw: need to 'open_object_section' before dump stats in 'RGWGetUsage_… (`issue#17499 `_, `pr#11325 `_, weiqiaomiao) +* rgw: need to 'open_object_section' before dump stats in 'RGWGetUsage\_… (`issue#17499 `_, `pr#11325 `_, weiqiaomiao) * rgw: obsolete 'radosgw-admin period prepare' command (`issue#17387 `_, `pr#11278 `_, Gaurav Kumar Garg) * rgw: radosgw-admin: add "--orphan-stale-secs" to --help (`issue#17280 `_, `pr#11098 `_, Ken Dreyer) * rgw: radosgw-admin: zone[group] modify can change realm id (`issue#16839 `_, `pr#10477 `_, Casey Bodley) @@ -4806,7 +4826,7 @@ * osd: reindex properly on pg log split (`issue#18975 `_, `pr#14047 `_, Alexey Sheplyakov) * osd: restrict want_acting to up+acting on recovery completion (`issue#18929 `_, `pr#13541 `_, Sage Weil) * rbd-nbd: check /sys/block/nbdX/size to ensure kernel mapped correctly (`issue#18335 `_, `pr#13932 `_, Mykola Golub, Alexey Sheplyakov) -* rbd: [api] temporarily restrict (rbd_)mirror_peer_add from adding multiple peers (`issue#19256 `_, `pr#14664 `_, Jason Dillaman) +* rbd: [api] temporarily restrict (rbd\_)mirror_peer_add from adding multiple peers (`issue#19256 `_, `pr#14664 `_, Jason Dillaman) * rbd: qemu crash triggered by network issues (`issue#18436 `_, `pr#13244 `_, Jason Dillaman) * rbd: rbd --pool=x rename y z does not work (`issue#18326 `_, `pr#14148 `_, Gaurav Kumar Garg) * rbd: systemctl stop rbdmap unmaps all rbds and not just the ones in /etc/ceph/rbdmap (`issue#18884 `_, `issue#18262 `_, `pr#14083 `_, David Disseldorp, Nathan Cutler) @@ -5807,7 +5827,7 @@ * For all distributions that support systemd (CentOS 7, Fedora, Debian Jessie 8.x, OpenSUSE), ceph daemons are now managed using native systemd - files instead of the legacy sysvinit scripts. For example,:: + files instead of the legacy sysvinit scripts. For example:: systemctl start ceph.target # start all daemons systemctl status ceph-osd@12 # check status of osd.12 @@ -5848,7 +5868,7 @@ ceph-deploy install --stable jewel HOST - #. Stop the daemon(s).:: + #. Stop the daemon(s):: service ceph stop # fedora, centos, rhel, debian stop ceph-all # ubuntu @@ -5858,7 +5878,7 @@ chown -R ceph:ceph /var/lib/ceph chown -R ceph:ceph /var/log/ceph - #. Restart the daemon(s).:: + #. 
Restart the daemon(s):: start ceph-all # ubuntu systemctl start ceph.target # debian, centos, fedora, rhel @@ -9419,7 +9439,7 @@ * For all distributions that support systemd (CentOS 7, Fedora, Debian Jessie 8.x, OpenSUSE), ceph daemons are now managed using native systemd - files instead of the legacy sysvinit scripts. For example,:: + files instead of the legacy sysvinit scripts. For example:: systemctl start ceph.target # start all daemons systemctl status ceph-osd@12 # check status of osd.12 @@ -9459,7 +9479,7 @@ ceph-deploy install --stable infernalis HOST - #. Stop the daemon(s).:: + #. Stop the daemon(s):: service ceph stop # fedora, centos, rhel, debian stop ceph-all # ubuntu @@ -9469,7 +9489,7 @@ chown -R ceph:ceph /var/lib/ceph chown -R ceph:ceph /var/log/ceph - #. Restart the daemon(s).:: + #. Restart the daemon(s):: start ceph-all # ubuntu systemctl start ceph.target # debian, centos, fedora, rhel @@ -10110,7 +10130,7 @@ * For all distributions that support systemd (CentOS 7, Fedora, Debian Jessie 8.x, OpenSUSE), ceph daemons are now managed using native systemd - files instead of the legacy sysvinit scripts. For example,:: + files instead of the legacy sysvinit scripts. For example:: systemctl start ceph.target # start all daemons systemctl status ceph-osd@12 # check status of osd.12 @@ -10149,7 +10169,7 @@ ceph-deploy install --stable infernalis HOST - #. Stop the daemon(s).:: + #. Stop the daemon(s):: service ceph stop # fedora, centos, rhel, debian stop ceph-all # ubuntu @@ -10159,7 +10179,7 @@ chown -R ceph:ceph /var/lib/ceph chown -R ceph:ceph /var/log/ceph - #. Restart the daemon(s).:: + #. Restart the daemon(s):: start ceph-all # ubuntu systemctl start ceph.target # debian, centos, fedora, rhel @@ -18716,7 +18736,7 @@ Upgrading a cluster without adjusting the Ceph configuration will likely prevent the system from starting up on its own. 
We recommend first modifying the configuration to indicate that authentication is - disabled, and only then upgrading to the latest version.:: + disabled, and only then upgrading to the latest version:: auth client required = none auth service required = none diff -Nru ceph-12.1.1/doc/releases.rst ceph-12.1.2/doc/releases.rst --- ceph-12.1.1/doc/releases.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/releases.rst 2017-08-01 17:55:40.000000000 +0000 @@ -23,6 +23,10 @@ | |Development|`Dumpling`_|`Emperor`_ |`Firefly`_ |`Giant`_ |`Hammer`_ |`Infernalis`_ |`Jewel`_ |`Kraken`_ | | |Testing |LTS |Stable |LTS |Stable |LTS |Stable |LTS |Stable | +----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ +| July 2017 | 12.1.1 | | | | | | |`10.2.9`_ | | +| +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ +| | | | | | | | |`10.2.8`_ | | ++----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ | June 2017 |`12.1.0`_ | | | | | | | | | +----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ | May 2017 |`12.0.3`_ | | | | | | | | | @@ -36,17 +40,17 @@ | January 2017 | 11.1.1 | | | | | | | |`11.2.0`_ | +----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ | December 2016 | 11.1.0 | | | | | | |`10.2.5`_ | | -+ +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ +| +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ | | | | | | | | |`10.2.4`_ | | +----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ | October 2016 |`11.0.2`_ | | | | | | | | | -+ +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ +| +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ | | 11.0.1 | | | | | | | | | +----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ | September 2016 | | | | | | | |`10.2.3`_ | | +----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ | August 2016 | | | | | |`0.94.9`_ | | | | -+ +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ +| +-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ | | | | | | |`0.94.8`_ | | | | +----------------+-----------+-----------+-----------+-----------+-----------+-----------+--------------+-----------+-----------+ | June 2016 | 11.0.0 | | | | | | |`10.2.2`_ | | @@ -170,6 +174,8 @@ .. _11.0.2: ../release-notes#v11-0-2-kraken +.. _10.2.9: ../release-notes#v10-2-9-jewel +.. _10.2.8: ../release-notes#v10-2-8-jewel .. _10.2.7: ../release-notes#v10-2-7-jewel .. _10.2.6: ../release-notes#v10-2-6-jewel .. 
_10.2.5: ../release-notes#v10-2-5-jewel diff -Nru ceph-12.1.1/doc/start/documenting-ceph.rst ceph-12.1.2/doc/start/documenting-ceph.rst --- ceph-12.1.1/doc/start/documenting-ceph.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/start/documenting-ceph.rst 2017-08-01 17:55:40.000000000 +0000 @@ -284,7 +284,7 @@ -Install each dependency that isn't installed on your host. For Debian/Ubuntu +Install each dependency that is not installed on your host. For Debian/Ubuntu distributions, execute the following:: sudo apt-get install gcc python-dev python-pip python-virtualenv libxml2-dev libxslt-dev doxygen graphviz ant ditaa @@ -322,7 +322,7 @@ wget http://rpmfind.net/linux/centos/7/os/x86_64/Packages/python-sphinx-1.1.3-11.el7.noarch.rpm sudo yum install python-sphinx-1.1.3-11.el7.noarch.rpm -Ceph documentation makes extensive use of `ditaa`_, which isn't presently built +Ceph documentation makes extensive use of `ditaa`_, which is not presently built for CentOS/RHEL7. You must install ``ditaa`` if you are making changes to ``ditaa`` diagrams so that you can verify that they render properly before you commit new or modified ``ditaa`` diagrams. You may retrieve compatible required diff -Nru ceph-12.1.1/doc/start/hardware-recommendations.rst ceph-12.1.2/doc/start/hardware-recommendations.rst --- ceph-12.1.1/doc/start/hardware-recommendations.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/start/hardware-recommendations.rst 2017-08-01 17:55:40.000000000 +0000 @@ -111,7 +111,7 @@ drive, but SSDs often exhibit access times that are at least 100x faster than a hard disk drive. -SSDs do not have moving mechanical parts so they aren't necessarily subject to +SSDs do not have moving mechanical parts so they are not necessarily subject to the same types of limitations as hard disk drives. SSDs do have significant limitations though. When evaluating SSDs, it is important to consider the performance of sequential reads and writes. An SSD that has 400MB/s sequential diff -Nru ceph-12.1.1/doc/start/index.rst ceph-12.1.2/doc/start/index.rst --- ceph-12.1.1/doc/start/index.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/start/index.rst 2017-08-01 17:55:40.000000000 +0000 @@ -19,7 +19,7 @@

Step 2: Storage Cluster

-Once you've completed your preflight checklist, you should be able to begin +Once you have completed your preflight checklist, you should be able to begin deploying a Ceph Storage Cluster. .. toctree:: diff -Nru ceph-12.1.1/doc/start/quick-ceph-deploy.rst ceph-12.1.2/doc/start/quick-ceph-deploy.rst --- ceph-12.1.1/doc/start/quick-ceph-deploy.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/start/quick-ceph-deploy.rst 2017-08-01 17:55:40.000000000 +0000 @@ -118,9 +118,9 @@ ceph-deploy admin node1 node2 node3 -#. Deploy a manager daemon.:: +#. Deploy a manager daemon. (Required only for luminous+ builds):: - ceph-deploy mgr create node1 + ceph-deploy mgr create node1 *Required only for luminous+ builds, i.e >= 12.x builds* #. Add three OSDs. For the purposes of these instructions, we assume you have an unused disk in each node called ``/dev/vdb``. *Be sure that the device is not currently in use and does not contain any important data.* diff -Nru ceph-12.1.1/doc/start/quick-rbd.rst ceph-12.1.2/doc/start/quick-rbd.rst --- ceph-12.1.1/doc/start/quick-rbd.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/start/quick-rbd.rst 2017-08-01 17:55:40.000000000 +0000 @@ -47,11 +47,16 @@ directory. Ensure that the keyring file has appropriate read permissions (e.g., ``sudo chmod +r /etc/ceph/ceph.client.admin.keyring``). -Create an rbd pool -================== -#. On the admin node, use the ``ceph`` tool to `Create a Pool`_ +Create a Block Device Pool +========================== + +#. On the admin node, use the ``ceph`` tool to `create a pool`_ (we recommend the name 'rbd'). +#. On the admin node, use the ``rbd`` tool to initialize the pool for use by RBD:: + + rbd pool init + Configure a Block Device ======================== @@ -82,8 +87,8 @@ See `block devices`_ for additional details. -.. _Create a Pool: ../../rados/operations/pools#createpool .. _Storage Cluster Quick Start: ../quick-ceph-deploy +.. _create a pool: ../../rados/operations/pools/#create-a-pool .. _block devices: ../../rbd/rbd .. _FAQ: http://wiki.ceph.com/How_Can_I_Give_Ceph_a_Try .. _OS Recommendations: ../os-recommendations diff -Nru ceph-12.1.1/doc/start/quick-start-preflight.rst ceph-12.1.2/doc/start/quick-start-preflight.rst --- ceph-12.1.1/doc/start/quick-start-preflight.rst 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/doc/start/quick-start-preflight.rst 2017-08-01 17:55:40.000000000 +0000 @@ -65,7 +65,7 @@ #. 
Add the Ceph repository to your yum configuration file at ``/etc/yum.repos.d/ceph.repo`` with the following command:: - cat >/etc/yum.repos.d/ceph.repro + cat >/etc/yum.repos.d/ceph.repo [ceph-noarch] name=Ceph noarch packages baseurl=https://download.ceph.com/rpm/el7/noarch diff -Nru ceph-12.1.1/do_cmake.sh ceph-12.1.2/do_cmake.sh --- ceph-12.1.1/do_cmake.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/do_cmake.sh 2017-08-01 17:55:40.000000000 +0000 @@ -8,7 +8,7 @@ ARGS="" if which ccache ; then echo "enabling ccache" - ARGS+="-DWITH_CCACHE=ON" + ARGS="$ARGS -DWITH_CCACHE=ON" fi mkdir build diff -Nru ceph-12.1.1/install-deps.sh ceph-12.1.2/install-deps.sh --- ceph-12.1.1/install-deps.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/install-deps.sh 2017-08-01 17:55:40.000000000 +0000 @@ -19,8 +19,14 @@ fi export LC_ALL=C # the following is vulnerable to i18n +function munge_ceph_spec_in { + local OUTFILE=$1 + sed -e 's/@//g' -e 's/%bcond_with make_check/%bcond_without make_check/g' < ceph.spec.in > $OUTFILE +} + if [ x`uname`x = xFreeBSDx ]; then $SUDO pkg install -yq \ + devel/babeltrace \ devel/git \ devel/gperf \ devel/gmake \ @@ -36,7 +42,7 @@ lang/cython \ devel/py-virtualenv \ databases/leveldb \ - net/openldap24-client \ + net/openldap-client \ security/nss \ security/cryptopp \ archivers/snappy \ @@ -48,14 +54,16 @@ textproc/gsed \ textproc/libxml2 \ textproc/xmlstarlet \ - textproc/jq \ - textproc/sphinx \ + textproc/jq \ + textproc/py-sphinx \ emulators/fuse \ java/junit \ + lang/python \ lang/python27 \ - devel/py-pip \ + devel/py-pip \ devel/py-argparse \ devel/py-nose \ + devel/py-prettytable \ www/py-flask \ www/fcgi \ sysutils/flock \ @@ -129,14 +137,14 @@ fi ;; esac - sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec + munge_ceph_spec_in $DIR/ceph.spec $SUDO $builddepcmd $DIR/ceph.spec 2>&1 | tee $DIR/yum-builddep.out ! grep -q -i error: $DIR/yum-builddep.out || exit 1 ;; opensuse|suse|sles) echo "Using zypper to install dependencies" $SUDO zypper --gpg-auto-import-keys --non-interactive install lsb-release systemd-rpm-macros - sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec + munge_ceph_spec_in $DIR/ceph.spec $SUDO zypper --non-interactive install $(rpmspec -q --buildrequires $DIR/ceph.spec) || exit 1 ;; alpine) diff -Nru ceph-12.1.1/.mailmap ceph-12.1.2/.mailmap --- ceph-12.1.1/.mailmap 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/.mailmap 2017-08-01 17:55:40.000000000 +0000 @@ -483,6 +483,8 @@ Zack Cerza Zengran Zhang Zeqiang Zhuang +Zhang Lei +Zhang Lei <243290414@qq.com> Zhang Shaowen Zhang Zezhu Guo Zhandong @@ -500,4 +502,5 @@ Zhi Zhang Zhi Zhang Zhi Zhang +Zhu Shangzhong Zhuang Xiaochun diff -Nru ceph-12.1.1/.organizationmap ceph-12.1.2/.organizationmap --- ceph-12.1.1/.organizationmap 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/.organizationmap 2017-08-01 17:55:40.000000000 +0000 @@ -358,6 +358,7 @@ Red Hat Alex Elder Red Hat Alfredo Deza Red Hat Ali Maredia +Red Hat Amit Kumar Red Hat Andrew Schoen Red Hat Barbora Ančincová Red Hat Boris Ranto @@ -521,7 +522,7 @@ The University of Arizona James Ryan Cresawn Time Warner Cable Inc. Bryan Stillwell Trendy Tech shiqi -Trendy Tech Lei Zhang <243290414@qq.com> +Trendy Tech Zhang Lei Uber Technologies Inc. 
Henrik Korkuc Ubuntu Kylin Min Chen UMCloud Jiaying Ren @@ -783,16 +784,17 @@ ZTE Gong Chuang ZTE Lan De ZTE Luo Kexue -ZTE Ren Huanwen ZTE Luo Runbing -ZTE Xie Xingguo -ZTE Shun Song +ZTE Ren Huanwen ZTE Song Baisen +ZTE Song Shun ZTE Song Weibin ZTE Tang Wenjun ZTE Wei Qiaomiao +ZTE Xie Xingguo ZTE Yan Jun ZTE Zhang Zezhu +ZTE Zhu Shangzhong # # Local Variables: # compile-command: "git log --pretty='%aN <%aE>' | \ diff -Nru ceph-12.1.1/PendingReleaseNotes ceph-12.1.2/PendingReleaseNotes --- ceph-12.1.1/PendingReleaseNotes 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/PendingReleaseNotes 2017-08-01 17:55:40.000000000 +0000 @@ -5,6 +5,8 @@ * Added new configuration "public bind addr" to support dynamic environments like Kubernetes. When set the Ceph MON daemon could bind locally to an IP address and advertise a different IP address "public addr" on the network. +* RGW: bucket index resharding now uses the reshard namespace in upgrade scenarios as well + this is a changed behaviour from RC1 where a new pool for reshard was created 12.0.0 ------ @@ -210,3 +212,38 @@ * The "ceph -w" output no longer contains audit log entries by default. Add a "--watch-channel=audit" or "--watch-channel=*" to see them. +12.1.2 +------ + +* New "ceph -w" behavior - the "ceph -w" output no longer contains I/O rates, + available space, pg info, etc. because these are no longer logged to the + central log (which is what "ceph -w" shows). The same information can be + obtained by running "ceph pg stat"; alternatively, I/O rates per pool can + be determined using "ceph osd pool stats". Although these commands do not + self-update like "ceph -w" did, they do have the ability to return formatted + output by providing a "--format=" option. + +* Pools are now expected to be associated with the application using them. + Upon completing the upgrade to Luminous, the cluster will attempt to associate + existing pools to known applications (i.e. CephFS, RBD, and RGW). In-use pools + that are not associated to an application will generate a health warning. Any + unassociated pools can be manually associated using the new + "ceph osd pool application enable" command. For more details see + "Associate Pool to Application" in the documentation. + +* ceph-mgr now has a Zabbix plugin. Using zabbix_sender it sends trapper + events to a Zabbix server containing high-level information of the Ceph + cluster. This makes it easy to monitor a Ceph cluster's status and send + out notifications in case of a malfunction. + +* The 'mon_warn_osd_usage_min_max_delta' config option has been + removed and the associated health warning has been disabled because + it does not address clusters undergoing recovery or CRUSH rules that do + not target all devices in the cluster. + +* Specifying user authorization capabilities for RBD clients has been + simplified. The general syntax for using RBD capability profiles is + "mon 'profile rbd' osd 'profile rbd[-read-only][ pool={pool-name}[, ...]]'". + For more details see "User Management" in the documentation. + +* ``ceph config-key put`` has been deprecated in favor of ``ceph config-key set``. 
\ No newline at end of file diff -Nru ceph-12.1.1/qa/ceph-deploy-overrides/ceph_deploy_dmcrypt.yaml ceph-12.1.2/qa/ceph-deploy-overrides/ceph_deploy_dmcrypt.yaml --- ceph-12.1.1/qa/ceph-deploy-overrides/ceph_deploy_dmcrypt.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/ceph-deploy-overrides/ceph_deploy_dmcrypt.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,3 +0,0 @@ -overrides: - ceph-deploy: - dmcrypt: yes diff -Nru ceph-12.1.1/qa/ceph-deploy-overrides/disable_diff_journal_disk.yaml ceph-12.1.2/qa/ceph-deploy-overrides/disable_diff_journal_disk.yaml --- ceph-12.1.1/qa/ceph-deploy-overrides/disable_diff_journal_disk.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/ceph-deploy-overrides/disable_diff_journal_disk.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,3 +0,0 @@ -overrides: - ceph-deploy: - separate_journal_disk: diff -Nru ceph-12.1.1/qa/ceph-deploy-overrides/enable_diff_journal_disk.yaml ceph-12.1.2/qa/ceph-deploy-overrides/enable_diff_journal_disk.yaml --- ceph-12.1.1/qa/ceph-deploy-overrides/enable_diff_journal_disk.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/ceph-deploy-overrides/enable_diff_journal_disk.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,3 +0,0 @@ -overrides: - ceph-deploy: - separate_journal_disk: yes diff -Nru ceph-12.1.1/qa/ceph-deploy-overrides/enable_dmcrypt_diff_journal_disk.yaml ceph-12.1.2/qa/ceph-deploy-overrides/enable_dmcrypt_diff_journal_disk.yaml --- ceph-12.1.1/qa/ceph-deploy-overrides/enable_dmcrypt_diff_journal_disk.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/ceph-deploy-overrides/enable_dmcrypt_diff_journal_disk.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,4 +0,0 @@ -overrides: - ceph-deploy: - dmcrypt: yes - separate_journal_disk: yes diff -Nru ceph-12.1.1/qa/cephfs/overrides/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/cephfs/overrides/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/cephfs/overrides/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/cephfs/overrides/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ - - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/config_options/cephdeploy_conf.yaml ceph-12.1.2/qa/config_options/cephdeploy_conf.yaml --- ceph-12.1.1/qa/config_options/cephdeploy_conf.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/config_options/cephdeploy_conf.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,6 +0,0 @@ -overrides: - ceph-deploy: - conf: - global: - mon pg warn min per osd: 2 - osd pool default size: 2 diff -Nru ceph-12.1.1/qa/machine_types/schedule_subset.sh ceph-12.1.2/qa/machine_types/schedule_subset.sh --- ceph-12.1.1/qa/machine_types/schedule_subset.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/machine_types/schedule_subset.sh 2017-08-01 17:55:40.000000000 +0000 @@ -36,6 +36,9 @@ elif [ $2 = "kraken" ] ; then # run kraken branch with /40 jobs teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/40 -e $5 $6 +elif [ $2 = "luminous" ] ; then + # run luminous branch with /40 jobs + teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/40 -e $5 $6 else # run NON master branches without --newest teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | 
bc)/28 -e $5 $6 diff -Nru ceph-12.1.1/qa/objectstore/bluestore.yaml ceph-12.1.2/qa/objectstore/bluestore.yaml --- ceph-12.1.1/qa/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/objectstore_cephfs/bluestore.yaml ceph-12.1.2/qa/objectstore_cephfs/bluestore.yaml --- ceph-12.1.1/qa/objectstore_cephfs/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/objectstore_cephfs/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/objectstore_cephfs/filestore-xfs.yaml ceph-12.1.2/qa/objectstore_cephfs/filestore-xfs.yaml --- ceph-12.1.1/qa/objectstore_cephfs/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/objectstore_cephfs/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/overrides/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/overrides/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/overrides/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/overrides/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running conf: mds: debug mds: 20 diff -Nru ceph-12.1.1/qa/run-standalone.sh ceph-12.1.2/qa/run-standalone.sh --- ceph-12.1.1/qa/run-standalone.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/run-standalone.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,25 @@ +#!/bin/sh -ex + +if [ ! 
-e Makefile ]; then + echo 'run this from the build dir' + exit 1 +fi + +if [ `uname` = FreeBSD ]; then + # otherwise module prettytable will not be found + export PYTHONPATH=/usr/local/lib/python2.7/site-packages + exec_mode=+111 +else + exec_mode=/111 +fi + +for f in `find ../qa/standalone -perm $exec_mode -type f` +do + echo '--- $f ---' + PATH=$PATH:bin \ + CEPH_ROOT=.. \ + CEPH_LIB=lib \ + $f || exit 1 +done + +exit 0 diff -Nru ceph-12.1.1/qa/standalone/ceph-helpers.sh ceph-12.1.2/qa/standalone/ceph-helpers.sh --- ceph-12.1.1/qa/standalone/ceph-helpers.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/ceph-helpers.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,1920 @@ +#!/bin/bash +# +# Copyright (C) 2013,2014 Cloudwatt +# Copyright (C) 2014,2015 Red Hat +# Copyright (C) 2014 Federico Gimenez +# +# Author: Loic Dachary +# Author: Federico Gimenez +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +TIMEOUT=300 +PG_NUM=4 +: ${CEPH_BUILD_VIRTUALENV:=/tmp} + +if type xmlstarlet > /dev/null 2>&1; then + XMLSTARLET=xmlstarlet +elif type xml > /dev/null 2>&1; then + XMLSTARLET=xml +else + echo "Missing xmlstarlet binary!" + exit 1 +fi + +if [ `uname` = FreeBSD ]; then + SED=gsed + DIFFCOLOPTS="" +else + SED=sed + termwidth=$(stty -a | head -1 | sed -e 's/.*columns \([0-9]*\).*/\1/') + if [ -n "$termwidth" -a "$termwidth" != "0" ]; then + termwidth="-W ${termwidth}" + fi + DIFFCOLOPTS="-y $termwidth" +fi + +EXTRA_OPTS="" +if [ -n "$CEPH_LIB" ]; then + EXTRA_OPTS+=" --erasure-code-dir $CEPH_LIB" + EXTRA_OPTS+=" --plugin-dir $CEPH_LIB" + EXTRA_OPTS+=" --osd-class-dir $CEPH_LIB" +fi + +#! @file ceph-helpers.sh +# @brief Toolbox to manage Ceph cluster dedicated to testing +# +# Example use case: +# +# ~~~~~~~~~~~~~~~~{.sh} +# source ceph-helpers.sh +# +# function mytest() { +# # cleanup leftovers and reset mydir +# setup mydir +# # create a cluster with one monitor and three osds +# run_mon mydir a +# run_osd mydir 0 +# run_osd mydir 2 +# run_osd mydir 3 +# # put and get an object +# rados --pool rbd put GROUP /etc/group +# rados --pool rbd get GROUP /tmp/GROUP +# # stop the cluster and cleanup the directory +# teardown mydir +# } +# ~~~~~~~~~~~~~~~~ +# +# The focus is on simplicity and efficiency, in the context of +# functional tests. The output is intentionally very verbose +# and functions return as soon as an error is found. The caller +# is also expected to abort on the first error so that debugging +# can be done by looking at the end of the output. +# +# Each function is documented, implemented and tested independently. +# When modifying a helper, the test and the documentation are +# expected to be updated and it is easier of they are collocated. A +# test for a given function can be run with +# +# ~~~~~~~~~~~~~~~~{.sh} +# ceph-helpers.sh TESTS test_get_osds +# ~~~~~~~~~~~~~~~~ +# +# and all the tests (i.e. all functions matching test_*) are run +# with: +# +# ~~~~~~~~~~~~~~~~{.sh} +# ceph-helpers.sh TESTS +# ~~~~~~~~~~~~~~~~ +# +# A test function takes a single argument : the directory dedicated +# to the tests. 
It is expected to not create any file outside of this +# directory and remove it entirely when it completes successfully. +# + + +function get_asok_dir() { + if [ -n "$CEPH_ASOK_DIR" ]; then + echo "$CEPH_ASOK_DIR" + else + echo ${TMPDIR:-/tmp}/ceph-asok.$$ + fi +} + +function get_asok_path() { + local name=$1 + if [ -n "$name" ]; then + echo $(get_asok_dir)/ceph-$name.asok + else + echo $(get_asok_dir)/\$cluster-\$name.asok + fi +} +## +# Cleanup any leftovers found in **dir** via **teardown** +# and reset **dir** as an empty environment. +# +# @param dir path name of the environment +# @return 0 on success, 1 on error +# +function setup() { + local dir=$1 + teardown $dir || return 1 + mkdir -p $dir + mkdir -p $(get_asok_dir) +} + +function test_setup() { + local dir=$dir + setup $dir || return 1 + test -d $dir || return 1 + setup $dir || return 1 + test -d $dir || return 1 + teardown $dir +} + +####################################################################### + +## +# Kill all daemons for which a .pid file exists in **dir** and remove +# **dir**. If the file system in which **dir** is btrfs, delete all +# subvolumes that relate to it. +# +# @param dir path name of the environment +# @return 0 on success, 1 on error +# +function teardown() { + local dir=$1 + kill_daemons $dir KILL + if [ `uname` != FreeBSD ] \ + && [ $(stat -f -c '%T' .) == "btrfs" ]; then + __teardown_btrfs $dir + fi + rm -fr $dir + rm -rf $(get_asok_dir) +} + +function __teardown_btrfs() { + local btrfs_base_dir=$1 + local btrfs_root=$(df -P . | tail -1 | awk '{print $NF}') + local btrfs_dirs=$(cd $btrfs_base_dir; sudo btrfs subvolume list . -t | awk '/^[0-9]/ {print $4}' | grep "$btrfs_base_dir/$btrfs_dir") + for subvolume in $btrfs_dirs; do + sudo btrfs subvolume delete $btrfs_root/$subvolume + done +} + +function test_teardown() { + local dir=$dir + setup $dir || return 1 + teardown $dir || return 1 + ! test -d $dir || return 1 +} + +####################################################################### + +## +# Sends a signal to a single daemon. +# This is a helper function for kill_daemons +# +# After the daemon is sent **signal**, its actual termination +# will be verified by sending it signal 0. If the daemon is +# still alive, kill_daemon will pause for a few seconds and +# try again. This will repeat for a fixed number of times +# before kill_daemon returns on failure. The list of +# sleep intervals can be specified as **delays** and defaults +# to: +# +# 0.1 0.2 1 1 1 2 3 5 5 5 10 10 20 60 60 60 120 +# +# This sequence is designed to run first a very short sleep time (0.1) +# if the machine is fast enough and the daemon terminates in a fraction of a +# second. The increasing sleep numbers should give plenty of time for +# the daemon to die even on the slowest running machine. If a daemon +# takes more than a few minutes to stop (the sum of all sleep times), +# there probably is no point in waiting more and a number of things +# are likely to go wrong anyway: better give up and return on error. 
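#
# A minimal usage sketch (the pid-file path and the shortened delay list are
# illustrative only):
#
#   kill_daemon testdir/osd.0.pid TERM "0.1 1 5" || echo "osd still running"
#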
+# +# @param pid the process id to send a signal +# @param send_signal the signal to send +# @param delays sequence of sleep times before failure +# +function kill_daemon() { + local pid=$(cat $1) + local send_signal=$2 + local delays=${3:-0.1 0.2 1 1 1 2 3 5 5 5 10 10 20 60 60 60 120} + local exit_code=1 + for try in $delays ; do + if kill -$send_signal $pid 2> /dev/null ; then + exit_code=1 + else + exit_code=0 + break + fi + send_signal=0 + sleep $try + done; + return $exit_code +} + +function test_kill_daemon() { + local dir=$1 + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + + name_prefix=osd + for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do + # + # sending signal 0 won't kill the daemon + # waiting just for one second instead of the default schedule + # allows us to quickly verify what happens when kill fails + # to stop the daemon (i.e. it must return false) + # + ! kill_daemon $pidfile 0 1 || return 1 + # + # killing just the osd and verify the mon still is responsive + # + kill_daemon $pidfile TERM || return 1 + done + + ceph osd dump | grep "osd.0 down" || return 1 + + name_prefix=mgr + for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do + # + # kill the mgr + # + kill_daemon $pidfile TERM || return 1 + done + + name_prefix=mon + for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do + # + # kill the mon and verify it cannot be reached + # + kill_daemon $pidfile TERM || return 1 + ! timeout 5 ceph status || return 1 + done + + teardown $dir || return 1 +} + +## +# Kill all daemons for which a .pid file exists in **dir**. Each +# daemon is sent a **signal** and kill_daemons waits for it to exit +# during a few minutes. By default all daemons are killed. If a +# **name_prefix** is provided, only the daemons for which a pid +# file is found matching the prefix are killed. See run_osd and +# run_mon for more information about the name conventions for +# the pid files. +# +# Send TERM to all daemons : kill_daemons $dir +# Send KILL to all daemons : kill_daemons $dir KILL +# Send KILL to all osds : kill_daemons $dir KILL osd +# Send KILL to osd 1 : kill_daemons $dir KILL osd.1 +# +# If a daemon is sent the TERM signal and does not terminate +# within a few minutes, it will still be running even after +# kill_daemons returns. +# +# If all daemons are kill successfully the function returns 0 +# if at least one daemon remains, this is treated as an +# error and the function return 1. +# +# @param dir path name of the environment +# @param signal name of the first signal (defaults to TERM) +# @param name_prefix only kill match daemons (defaults to all) +# @param delays sequence of sleep times before failure +# @return 0 on success, 1 on error +# +function kill_daemons() { + local trace=$(shopt -q -o xtrace && echo true || echo false) + $trace && shopt -u -o xtrace + local dir=$1 + local signal=${2:-TERM} + local name_prefix=$3 # optional, osd, mon, osd.1 + local delays=$4 #optional timing + local status=0 + local pids="" + + for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do + run_in_background pids kill_daemon $pidfile $signal $delays + done + + wait_background pids + status=$? 
+ + $trace && shopt -s -o xtrace + return $status +} + +function test_kill_daemons() { + local dir=$1 + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + # + # sending signal 0 won't kill the daemon + # waiting just for one second instead of the default schedule + # allows us to quickly verify what happens when kill fails + # to stop the daemon (i.e. it must return false) + # + ! kill_daemons $dir 0 osd 1 || return 1 + # + # killing just the osd and verify the mon still is responsive + # + kill_daemons $dir TERM osd || return 1 + ceph osd dump | grep "osd.0 down" || return 1 + # + # kill the mgr + # + kill_daemons $dir TERM mgr || return 1 + # + # kill the mon and verify it cannot be reached + # + kill_daemons $dir TERM || return 1 + ! timeout 5 ceph status || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Run a monitor by the name mon.**id** with data in **dir**/**id**. +# The logs can be found in **dir**/mon.**id**.log and the pid file +# is **dir**/mon.**id**.pid and the admin socket is +# **dir**/**id**/ceph-mon.**id**.asok. +# +# The remaining arguments are passed verbatim to ceph-mon --mkfs +# and the ceph-mon daemon. +# +# Two mandatory arguments must be provided: --fsid and --mon-host +# Instead of adding them to every call to run_mon, they can be +# set in the CEPH_ARGS environment variable to be read implicitly +# by every ceph command. +# +# The CEPH_CONF variable is expected to be set to /dev/null to +# only rely on arguments for configuration. +# +# Examples: +# +# CEPH_ARGS="--fsid=$(uuidgen) " +# CEPH_ARGS+="--mon-host=127.0.0.1:7018 " +# run_mon $dir a # spawn a mon and bind port 7018 +# run_mon $dir a --debug-filestore=20 # spawn with filestore debugging +# +# If mon_initial_members is not set, the default rbd pool is deleted +# and replaced with a replicated pool with less placement groups to +# speed up initialization. If mon_initial_members is set, no attempt +# is made to recreate the rbd pool because it would hang forever, +# waiting for other mons to join. +# +# A **dir**/ceph.conf file is created but not meant to be used by any +# function. It is convenient for debugging a failure with: +# +# ceph --conf **dir**/ceph.conf -s +# +# @param dir path name of the environment +# @param id mon identifier +# @param ... 
can be any option valid for ceph-mon +# @return 0 on success, 1 on error +# +function run_mon() { + local dir=$1 + shift + local id=$1 + shift + local data=$dir/$id + + ceph-mon \ + --id $id \ + --mkfs \ + --mon-data=$data \ + --run-dir=$dir \ + "$@" || return 1 + + ceph-mon \ + --id $id \ + --mon-osd-full-ratio=.99 \ + --mon-data-avail-crit=1 \ + --paxos-propose-interval=0.1 \ + --osd-crush-chooseleaf-type=0 \ + $EXTRA_OPTS \ + --debug-mon 20 \ + --debug-ms 20 \ + --debug-paxos 20 \ + --chdir= \ + --mon-data=$data \ + --log-file=$dir/\$name.log \ + --admin-socket=$(get_asok_path) \ + --mon-cluster-log-file=$dir/log \ + --run-dir=$dir \ + --pid-file=$dir/\$name.pid \ + --mon-allow-pool-delete \ + --mon-osd-backfillfull-ratio .99 \ + "$@" || return 1 + + cat > $dir/ceph.conf </dev/null | \ + jq '.acting | .[]') + # get rid of the trailing space + echo $osds +} + +function test_get_osds() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=2 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + create_rbd_pool || return 1 + get_osds rbd GROUP | grep --quiet '^[0-1] [0-1]$' || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Wait for the monitor to form quorum (optionally, of size N) +# +# @param timeout duration (lower-bound) to wait for quorum to be formed +# @param quorumsize size of quorum to wait for +# @return 0 on success, 1 on error +# +function wait_for_quorum() { + local timeout=$1 + local quorumsize=$2 + + if [[ -z "$timeout" ]]; then + timeout=300 + fi + + if [[ -z "$quorumsize" ]]; then + timeout $timeout ceph mon_status --format=json >&/dev/null || return 1 + return 0 + fi + + no_quorum=1 + wait_until=$((`date +%s` + $timeout)) + while [[ $(date +%s) -lt $wait_until ]]; do + jqfilter='.quorum | length == '$quorumsize + jqinput="$(timeout $timeout ceph mon_status --format=json 2>/dev/null)" + res=$(echo $jqinput | jq "$jqfilter") + if [[ "$res" == "true" ]]; then + no_quorum=0 + break + fi + done + return $no_quorum +} + +####################################################################### + +## +# Return the PG of supporting the **objectname** stored in +# **poolname**, as reported by ceph osd map. +# +# @param poolname an existing pool +# @param objectname an objectname (may or may not exist) +# @param STDOUT a PG +# @return 0 on success, 1 on error +# +function get_pg() { + local poolname=$1 + local objectname=$2 + + ceph --format json osd map $poolname $objectname 2>/dev/null | jq -r '.pgid' +} + +function test_get_pg() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + get_pg rbd GROUP | grep --quiet '^[0-9]\.[0-9a-f][0-9a-f]*$' || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the value of the **config**, obtained via the config get command +# of the admin socket of **daemon**.**id**. 
+# +# @param daemon mon or osd +# @param id mon or osd ID +# @param config the configuration variable name as found in config_opts.h +# @param STDOUT the config value +# @return 0 on success, 1 on error +# +function get_config() { + local daemon=$1 + local id=$2 + local config=$3 + + CEPH_ARGS='' \ + ceph --format json daemon $(get_asok_path $daemon.$id) \ + config get $config 2> /dev/null | \ + jq -r ".$config" +} + +function test_get_config() { + local dir=$1 + + # override the default config using command line arg and check it + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + test $(get_config mon a osd_pool_default_size) = 1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 --osd_max_scrubs=3 || return 1 + test $(get_config osd 0 osd_max_scrubs) = 3 || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Set the **config** to specified **value**, via the config set command +# of the admin socket of **daemon**.**id** +# +# @param daemon mon or osd +# @param id mon or osd ID +# @param config the configuration variable name as found in config_opts.h +# @param value the config value +# @return 0 on success, 1 on error +# +function set_config() { + local daemon=$1 + local id=$2 + local config=$3 + local value=$4 + + test $(env CEPH_ARGS='' ceph --format json daemon $(get_asok_path $daemon.$id) \ + config set $config $value 2> /dev/null | \ + jq 'has("success")') == true +} + +function test_set_config() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + test $(get_config mon a ms_crc_header) = true || return 1 + set_config mon a ms_crc_header false || return 1 + test $(get_config mon a ms_crc_header) = false || return 1 + set_config mon a ms_crc_header true || return 1 + test $(get_config mon a ms_crc_header) = true || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the OSD id of the primary OSD supporting the **objectname** +# stored in **poolname**, as reported by ceph osd map. +# +# @param poolname an existing pool +# @param objectname an objectname (may or may not exist) +# @param STDOUT the primary OSD id +# @return 0 on success, 1 on error +# +function get_primary() { + local poolname=$1 + local objectname=$2 + + ceph --format json osd map $poolname $objectname 2>/dev/null | \ + jq '.acting_primary' +} + +function test_get_primary() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + local osd=0 + run_mgr $dir x || return 1 + run_osd $dir $osd || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + test $(get_primary rbd GROUP) = $osd || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the id of any OSD supporting the **objectname** stored in +# **poolname**, as reported by ceph osd map, except the primary. +# +# @param poolname an existing pool +# @param objectname an objectname (may or may not exist) +# @param STDOUT the OSD id +# @return 0 on success, 1 on error +# +function get_not_primary() { + local poolname=$1 + local objectname=$2 + + local primary=$(get_primary $poolname $objectname) + ceph --format json osd map $poolname $objectname 2>/dev/null | \ + jq ".acting | map(select (. 
!= $primary)) | .[0]" +} + +function test_get_not_primary() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=2 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + local primary=$(get_primary rbd GROUP) + local not_primary=$(get_not_primary rbd GROUP) + test $not_primary != $primary || return 1 + test $not_primary = 0 -o $not_primary = 1 || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Run ceph-objectstore-tool against the OSD **id** using the data path +# **dir**. The OSD is killed with TERM prior to running +# ceph-objectstore-tool because access to the data path is +# exclusive. The OSD is restarted after the command completes. The +# objectstore_tool returns after all PG are active+clean again. +# +# @param dir the data path of the OSD +# @param id the OSD id +# @param ... arguments to ceph-objectstore-tool +# @param STDIN the input of ceph-objectstore-tool +# @param STDOUT the output of ceph-objectstore-tool +# @return 0 on success, 1 on error +# +# The value of $ceph_osd_args will be passed to restarted osds +# +function objectstore_tool() { + local dir=$1 + shift + local id=$1 + shift + local osd_data=$dir/$id + + local osd_type=$(cat $osd_data/type) + + kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1 + + local journal_args + if [ "$objectstore_type" == "filestore" ]; then + journal_args=" --journal-path $osd_data/journal" + fi + ceph-objectstore-tool \ + --data-path $osd_data \ + $journal_args \ + "$@" || return 1 + activate_osd $dir $id $ceph_osd_args >&2 || return 1 + wait_for_clean >&2 +} + +function test_objectstore_tool() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + local osd=0 + run_mgr $dir x || return 1 + run_osd $dir $osd || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + rados --pool rbd put GROUP /etc/group || return 1 + objectstore_tool $dir $osd GROUP get-bytes | \ + diff - /etc/group + ! objectstore_tool $dir $osd NOTEXISTS get-bytes || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Predicate checking if there is an ongoing recovery in the +# cluster. If any of the recovering_{keys,bytes,objects}_per_sec +# counters are reported by ceph status, it means recovery is in +# progress. +# +# @return 0 if recovery in progress, 1 otherwise +# +function get_is_making_recovery_progress() { + local recovery_progress + recovery_progress+=".recovering_keys_per_sec + " + recovery_progress+=".recovering_bytes_per_sec + " + recovery_progress+=".recovering_objects_per_sec" + local progress=$(ceph --format json status 2>/dev/null | \ + jq -r ".pgmap | $recovery_progress") + test "$progress" != null +} + +function test_get_is_making_recovery_progress() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + ! get_is_making_recovery_progress || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the number of active PGs in the cluster. A PG is active if +# ceph pg dump pgs reports it both **active** and **clean** and that +# not **stale**. 
+# +# @param STDOUT the number of active PGs +# @return 0 on success, 1 on error +# +function get_num_active_clean() { + local expression + expression+="select(contains(\"active\") and contains(\"clean\")) | " + expression+="select(contains(\"stale\") | not)" + ceph --format json pg dump pgs 2>/dev/null | \ + jq "[.[] | .state | $expression] | length" +} + +function test_get_num_active_clean() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + local num_active_clean=$(get_num_active_clean) + test "$num_active_clean" = $PG_NUM || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the number of PGs in the cluster, according to +# ceph pg dump pgs. +# +# @param STDOUT the number of PGs +# @return 0 on success, 1 on error +# +function get_num_pgs() { + ceph --format json status 2>/dev/null | jq '.pgmap.num_pgs' +} + +function test_get_num_pgs() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + local num_pgs=$(get_num_pgs) + test "$num_pgs" -gt 0 || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the OSD ids in use by at least one PG in the cluster (either +# in the up or the acting set), according to ceph pg dump pgs. Every +# OSD id shows as many times as they are used in up and acting sets. +# If an OSD id is in both the up and acting set of a given PG, it will +# show twice. +# +# @param STDOUT a sorted list of OSD ids +# @return 0 on success, 1 on error +# +function get_osd_id_used_by_pgs() { + ceph --format json pg dump pgs 2>/dev/null | jq '.[] | .up[], .acting[]' | sort +} + +function test_get_osd_id_used_by_pgs() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + local osd_ids=$(get_osd_id_used_by_pgs | uniq) + test "$osd_ids" = "0" || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Wait until the OSD **id** shows **count** times in the +# PGs (see get_osd_id_used_by_pgs for more information about +# how OSD ids are counted). +# +# @param id the OSD id +# @param count the number of time it must show in the PGs +# @return 0 on success, 1 on error +# +function wait_osd_id_used_by_pgs() { + local id=$1 + local count=$2 + + status=1 + for ((i=0; i < $TIMEOUT / 5; i++)); do + echo $i + if ! test $(get_osd_id_used_by_pgs | grep -c $id) = $count ; then + sleep 5 + else + status=0 + break + fi + done + return $status +} + +function test_wait_osd_id_used_by_pgs() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + wait_osd_id_used_by_pgs 0 8 || return 1 + ! 
TIMEOUT=1 wait_osd_id_used_by_pgs 123 5 || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return the date and time of the last completed scrub for **pgid**, +# as reported by ceph pg dump pgs. Note that a repair also sets this +# date. +# +# @param pgid the id of the PG +# @param STDOUT the date and time of the last scrub +# @return 0 on success, 1 on error +# +function get_last_scrub_stamp() { + local pgid=$1 + local sname=${2:-last_scrub_stamp} + ceph --format json pg dump pgs 2>/dev/null | \ + jq -r ".[] | select(.pgid==\"$pgid\") | .$sname" +} + +function test_get_last_scrub_stamp() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + stamp=$(get_last_scrub_stamp 2.0) + test -n "$stamp" || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Predicate checking if the cluster is clean, i.e. all of its PGs are +# in a clean state (see get_num_active_clean for a definition). +# +# @return 0 if the cluster is clean, 1 otherwise +# +function is_clean() { + num_pgs=$(get_num_pgs) + test $num_pgs != 0 || return 1 + test $(get_num_active_clean) = $num_pgs || return 1 +} + +function test_is_clean() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + is_clean || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return a list of increasingly larger numbers whose total is +# **timeout** seconds. It can be used to sleep for a short delay between +# checks while waiting for an event on a fast machine, while the +# progressively larger delays avoid stressing a very slow machine even +# further or spamming the logs.
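+# +# Example (illustrative, not part of the original helpers; the expected +# output below is taken from test_get_timeout_delays): +# +#   get_timeout_delays 5 .1    # prints ".1 .2 .4 .8 1.6 1.9" +# +#   for delay in $(get_timeout_delays $TIMEOUT .1) ; do +#       some_predicate && break   # some_predicate is a placeholder +#       sleep $delay +#   done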
+# +# @param timeout sum of all delays, in seconds +# @return a list of sleep delays +# +function get_timeout_delays() { + local trace=$(shopt -q -o xtrace && echo true || echo false) + $trace && shopt -u -o xtrace + local timeout=$1 + local first_step=${2:-1} + + local i + local total="0" + i=$first_step + while test "$(echo $total + $i \<= $timeout | bc -l)" = "1"; do + echo -n "$i " + total=$(echo $total + $i | bc -l) + i=$(echo $i \* 2 | bc -l) + done + if test "$(echo $total \< $timeout | bc -l)" = "1"; then + echo -n $(echo $timeout - $total | bc -l) + fi + $trace && shopt -s -o xtrace +} + +function test_get_timeout_delays() { + test "$(get_timeout_delays 1)" = "1 " || return 1 + test "$(get_timeout_delays 5)" = "1 2 2" || return 1 + test "$(get_timeout_delays 6)" = "1 2 3" || return 1 + test "$(get_timeout_delays 7)" = "1 2 4 " || return 1 + test "$(get_timeout_delays 8)" = "1 2 4 1" || return 1 + test "$(get_timeout_delays 1 .1)" = ".1 .2 .4 .3" || return 1 + test "$(get_timeout_delays 1.5 .1)" = ".1 .2 .4 .8 " || return 1 + test "$(get_timeout_delays 5 .1)" = ".1 .2 .4 .8 1.6 1.9" || return 1 + test "$(get_timeout_delays 6 .1)" = ".1 .2 .4 .8 1.6 2.9" || return 1 + test "$(get_timeout_delays 6.3 .1)" = ".1 .2 .4 .8 1.6 3.2 " || return 1 + test "$(get_timeout_delays 20 .1)" = ".1 .2 .4 .8 1.6 3.2 6.4 7.3" || return 1 +} + +####################################################################### + +## +# Wait until the cluster becomes clean, or fail if it does not make progress +# for $TIMEOUT seconds. +# Progress is measured either via the **get_is_making_recovery_progress** +# predicate or by a change in the number of clean PGs (as returned by get_num_active_clean). +# +# @return 0 if the cluster is clean, 1 otherwise +# +function wait_for_clean() { + local num_active_clean=-1 + local cur_active_clean + local -a delays=($(get_timeout_delays $TIMEOUT .1)) + local -i loop=0 + + while test $(get_num_pgs) == 0 ; do + sleep 1 + done + + while true ; do + # Comparing get_num_active_clean & get_num_pgs is used to determine + # if the cluster is clean. That's almost an inline of is_clean() to + # get more performance by avoiding multiple calls of get_num_active_clean. + cur_active_clean=$(get_num_active_clean) + test $cur_active_clean = $(get_num_pgs) && break + if test $cur_active_clean != $num_active_clean ; then + loop=0 + num_active_clean=$cur_active_clean + elif get_is_making_recovery_progress ; then + loop=0 + elif (( $loop >= ${#delays[*]} )) ; then + ceph report + return 1 + fi + sleep ${delays[$loop]} + loop+=1 + done + return 0 +} + +function test_wait_for_clean() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + create_rbd_pool || return 1 + ! TIMEOUT=1 wait_for_clean || return 1 + run_osd $dir 0 || return 1 + wait_for_clean || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Wait until the cluster becomes HEALTH_OK again, or fail if that does not +# happen within $TIMEOUT seconds. +# +# @return 0 if the cluster is healthy, 1 otherwise +# +function wait_for_health() { + local grepstr=$1 + local -a delays=($(get_timeout_delays $TIMEOUT .1)) + local -i loop=0 + + while !
ceph health detail | grep "$grepstr" ; do + if (( $loop >= ${#delays[*]} )) ; then + ceph health detail + return 1 + fi + sleep ${delays[$loop]} + loop+=1 + done +} + +function wait_for_health_ok() { + wait_for_health "HEALTH_OK" || return 1 +} + +function test_wait_for_health_ok() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 --osd_failsafe_full_ratio=.99 --mon_pg_warn_min_per_osd=0 || return 1 + run_mgr $dir x --mon_pg_warn_min_per_osd=0 || return 1 + run_osd $dir 0 || return 1 + kill_daemons $dir TERM osd || return 1 + ! TIMEOUT=1 wait_for_health_ok || return 1 + activate_osd $dir 0 || return 1 + wait_for_health_ok || return 1 + teardown $dir || return 1 +} + + +####################################################################### + +## +# Run repair on **pgid** and wait until it completes. The repair +# function will fail if repair does not complete within $TIMEOUT +# seconds. +# +# @param pgid the id of the PG +# @return 0 on success, 1 on error +# +function repair() { + local pgid=$1 + local last_scrub=$(get_last_scrub_stamp $pgid) + ceph pg repair $pgid + wait_for_scrub $pgid "$last_scrub" +} + +function test_repair() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + repair 2.0 || return 1 + kill_daemons $dir KILL osd || return 1 + ! TIMEOUT=1 repair 2.0 || return 1 + teardown $dir || return 1 +} +####################################################################### + +## +# Run scrub on **pgid** and wait until it completes. The pg_scrub +# function will fail if the scrub does not complete within $TIMEOUT +# seconds. The scrub is complete whenever the +# **get_last_scrub_stamp** function reports a timestamp different from +# the one stored before starting the scrub. +# +# @param pgid the id of the PG +# @return 0 on success, 1 on error +# +function pg_scrub() { + local pgid=$1 + local last_scrub=$(get_last_scrub_stamp $pgid) + ceph pg scrub $pgid + wait_for_scrub $pgid "$last_scrub" +} + +function pg_deep_scrub() { + local pgid=$1 + local last_scrub=$(get_last_scrub_stamp $pgid last_deep_scrub_stamp) + ceph pg deep-scrub $pgid + wait_for_scrub $pgid "$last_scrub" last_deep_scrub_stamp +} + +function test_pg_scrub() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + pg_scrub 2.0 || return 1 + kill_daemons $dir KILL osd || return 1 + ! TIMEOUT=1 pg_scrub 2.0 || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Run the *command* and expect it to fail (i.e. return a non-zero status). +# The output (stderr and stdout) is stored in a temporary file in *dir* +# and is expected to contain the string *expected*. +# +# Return 0 if the command failed and the string was found. Otherwise +# return 1 and cat the full output of the command on stderr for debugging. +# +# @param dir temporary directory to store the output +# @param expected string to look for in the output +# @param command ... the command and its arguments +# @return 0 on success, 1 on error +# + +function expect_failure() { + local dir=$1 + shift + local expected="$1" + shift + local success + + if "$@" > $dir/out 2>&1 ; then + success=true + else + success=false + fi + + if $success || !
grep --quiet "$expected" $dir/out ; then + cat $dir/out >&2 + return 1 + else + return 0 + fi +} + +function test_expect_failure() { + local dir=$1 + + setup $dir || return 1 + expect_failure $dir FAIL bash -c 'echo FAIL ; exit 1' || return 1 + # the command did not fail + ! expect_failure $dir FAIL bash -c 'echo FAIL ; exit 0' > $dir/out || return 1 + grep --quiet FAIL $dir/out || return 1 + # the command failed but the output does not contain the expected string + ! expect_failure $dir FAIL bash -c 'echo UNEXPECTED ; exit 1' > $dir/out || return 1 + ! grep --quiet FAIL $dir/out || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Given the *last_scrub*, wait for scrub to happen on **pgid**. It +# will fail if scrub does not complete within $TIMEOUT seconds. The +# repair is complete whenever the **get_last_scrub_stamp** function +# reports a timestamp different from the one given in argument. +# +# @param pgid the id of the PG +# @param last_scrub timestamp of the last scrub for *pgid* +# @return 0 on success, 1 on error +# +function wait_for_scrub() { + local pgid=$1 + local last_scrub="$2" + local sname=${3:-last_scrub_stamp} + + for ((i=0; i < $TIMEOUT; i++)); do + if test "$last_scrub" != "$(get_last_scrub_stamp $pgid $sname)" ; then + return 0 + fi + sleep 1 + done + return 1 +} + +function test_wait_for_scrub() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + local pgid=2.0 + ceph pg repair $pgid + local last_scrub=$(get_last_scrub_stamp $pgid) + wait_for_scrub $pgid "$last_scrub" || return 1 + kill_daemons $dir KILL osd || return 1 + last_scrub=$(get_last_scrub_stamp $pgid) + ! TIMEOUT=1 wait_for_scrub $pgid "$last_scrub" || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Return 0 if the erasure code *plugin* is available, 1 otherwise. +# +# @param plugin erasure code plugin +# @return 0 on success, 1 on error +# + +function erasure_code_plugin_exists() { + local plugin=$1 + local status + local grepstr + local s + case `uname` in + FreeBSD) grepstr="Cannot open.*$plugin" ;; + *) grepstr="$plugin.*No such file" ;; + esac + + s=$(ceph osd erasure-code-profile set TESTPROFILE plugin=$plugin 2>&1) + local status=$? + if [ $status -eq 0 ]; then + ceph osd erasure-code-profile rm TESTPROFILE + elif ! echo $s | grep --quiet "$grepstr" ; then + status=1 + # display why the string was rejected. + echo $s + fi + return $status +} + +function test_erasure_code_plugin_exists() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + erasure_code_plugin_exists jerasure || return 1 + ! erasure_code_plugin_exists FAKE || return 1 + teardown $dir || return 1 +} + +####################################################################### + +## +# Display all log files from **dir** on stdout. 
+# +# @param dir directory in which all data is stored +# + +function display_logs() { + local dir=$1 + + find $dir -maxdepth 1 -name '*.log' | \ + while read file ; do + echo "======================= $file" + cat $file + done +} + +function test_display_logs() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a || return 1 + kill_daemons $dir || return 1 + display_logs $dir > $dir/log.out + grep --quiet mon.a.log $dir/log.out || return 1 + teardown $dir || return 1 +} + +####################################################################### +## +# Spawn a command in the background and append its pid to the variable +# whose name is passed as argument. To make the output easier to read, +# each output line is prepended with the process id. +# +# Example: +# pids1="" +# run_in_background pids1 bash -c 'sleep 1; exit 1' +# +# @param pid_variable the variable name (not value) where the pids will be stored +# @param ... the command to execute +# @return nothing; the pids accumulated in **pid_variable** are meant to be passed to **wait_background** +# +function run_in_background() { + local pid_variable=$1 + shift; + # Execute the command and prepend the output with its pid + # We make sure to return the exit status of the command and not the awk one. + ("$@" |& awk '{ a[i++] = $0 }END{for (i = 0; i in a; ++i) { print "'$$': " a[i]} }'; return ${PIPESTATUS[0]}) >&2 & + eval "$pid_variable+=\" $!\"" +} + +function test_run_in_background() { + local pids + run_in_background pids sleep 1 + run_in_background pids sleep 1 + test $(echo $pids | wc -w) = 2 || return 1 + wait $pids || return 1 +} + +####################################################################### +## +# Wait for pids running in background to complete. +# This function is usually used after a **run_in_background** call. +# Example: +# pids1="" +# run_in_background pids1 bash -c 'sleep 1; exit 1' +# wait_background pids1 +# +# @param pids The variable name that contains the active PIDS. Set to empty at the end of the function. +# @return 1 if at least one process exited in error, 0 otherwise +# +function wait_background() { + # We extract the PIDS from the variable name + pids=${!1} + + return_code=0 + for pid in $pids; do + if ! wait $pid; then + # If one process failed then return 1 + return_code=1 + fi + done + + # We empty the variable reporting that all processes ended + eval "$1=''" + + return $return_code +} + + +function test_wait_background() { + local pids="" + run_in_background pids bash -c "sleep 1; exit 1" + run_in_background pids bash -c "sleep 2; exit 0" + wait_background pids + if [ $? -ne 1 ]; then return 1; fi + + run_in_background pids bash -c "sleep 1; exit 0" + run_in_background pids bash -c "sleep 2; exit 0" + wait_background pids + if [ $? -ne 0 ]; then return 1; fi + + if [ !
-z "$pids" ]; then return 1; fi +} + +function flush_pg_stats() +{ + local timeout=${1:-$TIMEOUT} + + ids=`ceph osd ls` + seqs='' + for osd in $ids; do + seq=`ceph tell osd.$osd flush_pg_stats` + seqs="$seqs $osd-$seq" + done + + for s in $seqs; do + osd=`echo $s | cut -d - -f 1` + seq=`echo $s | cut -d - -f 2` + echo "waiting osd.$osd seq $seq" + while test $(ceph osd last-stat-seq $osd) -lt $seq; do + sleep 1 + if [ $((timeout--)) -eq 0 ]; then + return 1 + fi + done + done +} + +function test_flush_pg_stats() +{ + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + create_rbd_pool || return 1 + rados -p rbd put obj /etc/group + flush_pg_stats + local jq_filter='.pools | .[] | select(.name == "rbd") | .stats' + raw_bytes_used=`ceph df detail --format=json | jq "$jq_filter.raw_bytes_used"` + bytes_used=`ceph df detail --format=json | jq "$jq_filter.bytes_used"` + test $raw_bytes_used > 0 || return 1 + test $raw_bytes_used == $bytes_used || return 1 +} + +####################################################################### + +## +# Call the **run** function (which must be defined by the caller) with +# the **dir** argument followed by the caller argument list. +# +# If the **run** function returns on error, all logs found in **dir** +# are displayed for diagnostic purposes. +# +# **teardown** function is called when the **run** function returns +# (on success or on error), to cleanup leftovers. The CEPH_CONF is set +# to /dev/null and CEPH_ARGS is unset so that the tests are protected from +# external interferences. +# +# It is the responsibility of the **run** function to call the +# **setup** function to prepare the test environment (create a temporary +# directory etc.). +# +# The shell is required (via PS4) to display the function and line +# number whenever a statement is executed to help debugging. +# +# @param dir directory in which all data is stored +# @param ... 
arguments passed transparently to **run** +# @return 0 on success, 1 on error +# +function main() { + local dir=td/$1 + shift + + shopt -s -o xtrace + PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: ' + + export PATH=${CEPH_BUILD_VIRTUALENV}/ceph-disk-virtualenv/bin:${CEPH_BUILD_VIRTUALENV}/ceph-detect-init-virtualenv/bin:.:$PATH # make sure programs from sources are preferred + #export PATH=$CEPH_ROOT/src/ceph-disk/virtualenv/bin:$CEPH_ROOT/src/ceph-detect-init/virtualenv/bin:.:$PATH # make sure programs from sources are preferred + + export CEPH_CONF=/dev/null + unset CEPH_ARGS + + local code + if run $dir "$@" ; then + code=0 + else + display_logs $dir + code=1 + fi + teardown $dir || return 1 + return $code +} + +####################################################################### + +function run_tests() { + shopt -s -o xtrace + PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: ' + + export PATH=${CEPH_BUILD_VIRTUALENV}/ceph-disk-virtualenv/bin:${CEPH_BUILD_VIRTUALENV}/ceph-detect-init-virtualenv/bin:.:$PATH # make sure programs from sources are preferred + #export PATH=$CEPH_ROOT/src/ceph-disk/virtualenv/bin:$CEPH_ROOT/src/ceph-detect-init/virtualenv/bin:.:$PATH # make sure programs from sources are preferred + + export CEPH_MON="127.0.0.1:7109" # git grep '\<7109\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + export CEPH_CONF=/dev/null + + local funcs=${@:-$(set | sed -n -e 's/^\(test_[0-9a-z_]*\) .*/\1/p')} + local dir=td/ceph-helpers + + for func in $funcs ; do + $func $dir || return 1 + done +} + +if test "$1" = TESTS ; then + shift + run_tests "$@" +fi + +# NOTE: +# jq only supports --exit-status|-e from version 1.4 onwards, which makes +# returning on error waaaay prettier and straightforward. +# However, the current automated upstream build is running with v1.3, +# which has no idea what -e is. Hence the convoluted error checking we +# need. Sad. +# The next time someone changes this code, please check if v1.4 is now +# a thing, and, if so, please change these to use -e. Thanks. + +# jq '.all.supported | select([.[] == "foo"] | any)' +function jq_success() { + input="$1" + filter="$2" + expects="\"$3\"" + + in_escaped=$(printf %s "$input" | sed "s/'/'\\\\''/g") + filter_escaped=$(printf %s "$filter" | sed "s/'/'\\\\''/g") + + ret=$(echo "$in_escaped" | jq "$filter_escaped") + if [[ "$ret" == "true" ]]; then + return 0 + elif [[ -n "$expects" ]]; then + if [[ "$ret" == "$expects" ]]; then + return 0 + fi + fi + return 1 +} + +# Local Variables: +# compile-command: "cd ../../src ; make -j4 && ../qa/standalone/ceph-helpers.sh TESTS # test_get_config" +# End: diff -Nru ceph-12.1.1/qa/standalone/crush/crush-choose-args.sh ceph-12.1.2/qa/standalone/crush/crush-choose-args.sh --- ceph-12.1.1/qa/standalone/crush/crush-choose-args.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/crush/crush-choose-args.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,161 @@ +#!/bin/bash +# +# Copyright (C) 2017 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version.
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7131" # git grep '\<7131\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + CEPH_ARGS+="--crush-location=root=default,host=HOST " + CEPH_ARGS+="--osd-crush-initial-weight=3 " + # + # Disable device auto class feature for now. + # The device class is non-deterministic and will + # crash the crushmap comparison below. + # + CEPH_ARGS+="--osd-class-update-on-start=false " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function TEST_choose_args_update() { + # + # adding a weighted OSD updates the weight up to the top + # + local dir=$1 + + run_mon $dir a || return 1 + run_osd $dir 0 || return 1 + + ceph osd set-require-min-compat-client luminous + ceph osd getcrushmap > $dir/map || return 1 + crushtool -d $dir/map -o $dir/map.txt || return 1 + sed -i -e '/end crush map/d' $dir/map.txt + cat >> $dir/map.txt < $dir/map-one-more || return 1 + crushtool -d $dir/map-one-more -o $dir/map-one-more.txt || return 1 + cat $dir/map-one-more.txt + diff -u $dir/map-one-more.txt $CEPH_ROOT/src/test/crush/crush-choose-args-expected-one-more-3.txt || return 1 + + destroy_osd $dir 1 || return 1 + ceph osd getcrushmap > $dir/map-one-less || return 1 + crushtool -d $dir/map-one-less -o $dir/map-one-less.txt || return 1 + diff -u $dir/map-one-less.txt $dir/map.txt || return 1 +} + +function TEST_no_update_weight_set() { + # + # adding a zero weight OSD does not update the weight set at all + # + local dir=$1 + + ORIG_CEPH_ARGS="$CEPH_ARGS" + CEPH_ARGS+="--osd-crush-update-weight-set=false " + + run_mon $dir a || return 1 + run_osd $dir 0 || return 1 + + ceph osd set-require-min-compat-client luminous + ceph osd crush tree + ceph osd getcrushmap > $dir/map || return 1 + crushtool -d $dir/map -o $dir/map.txt || return 1 + sed -i -e '/end crush map/d' $dir/map.txt + cat >> $dir/map.txt < $dir/map-one-more || return 1 + crushtool -d $dir/map-one-more -o $dir/map-one-more.txt || return 1 + cat $dir/map-one-more.txt + diff -u $dir/map-one-more.txt $CEPH_ROOT/src/test/crush/crush-choose-args-expected-one-more-0.txt || return 1 + + destroy_osd $dir 1 || return 1 + ceph osd crush tree + ceph osd getcrushmap > $dir/map-one-less || return 1 + crushtool -d $dir/map-one-less -o $dir/map-one-less.txt || return 1 + diff -u $dir/map-one-less.txt $dir/map.txt || return 1 + + CEPH_ARGS="$ORIG_CEPH_ARGS" +} + +main crush-choose-args "$@" + +# Local Variables: +# compile-command: "cd ../../../build ; ln -sf ../src/ceph-disk/ceph_disk/main.py bin/ceph-disk && make -j4 && ../src/test/crush/crush-choose-args.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/crush/crush-classes.sh ceph-12.1.2/qa/standalone/crush/crush-classes.sh --- ceph-12.1.1/qa/standalone/crush/crush-classes.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/crush/crush-classes.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,252 @@ +#!/bin/bash +# +# Copyright (C) 2017 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can 
redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7130" # git grep '\<7130\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + # + # Disable auto-class, so we can inject device class manually below + # + CEPH_ARGS+="--osd-class-update-on-start=false " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function add_something() { + local dir=$1 + local obj=${2:-SOMETHING} + + local payload=ABCDEF + echo $payload > $dir/ORIGINAL + rados --pool rbd put $obj $dir/ORIGINAL || return 1 +} + +function get_osds_up() { + local poolname=$1 + local objectname=$2 + + local osds=$(ceph --format xml osd map $poolname $objectname 2>/dev/null | \ + $XMLSTARLET sel -t -m "//up/osd" -v . -o ' ') + # get rid of the trailing space + echo $osds +} + +function TEST_classes() { + local dir=$1 + + run_mon $dir a || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + run_osd $dir 2 || return 1 + create_rbd_pool || return 1 + + test "$(get_osds_up rbd SOMETHING)" == "1 2 0" || return 1 + add_something $dir SOMETHING || return 1 + + # + # osd.0 has class ssd and the rule is modified + # to only take ssd devices. + # + ceph osd getcrushmap > $dir/map || return 1 + crushtool -d $dir/map -o $dir/map.txt || return 1 + ${SED} -i \ + -e '/device 0 osd.0/s/$/ class ssd/' \ + -e '/step take default/s/$/ class ssd/' \ + $dir/map.txt || return 1 + crushtool -c $dir/map.txt -o $dir/map-new || return 1 + ceph osd setcrushmap -i $dir/map-new || return 1 + + # + # There can only be one mapping since there is only + # one device with the ssd class. + # + ok=false + for delay in 2 4 8 16 32 64 128 256 ; do + if test "$(get_osds_up rbd SOMETHING_ELSE)" == "0" ; then + ok=true + break + fi + sleep $delay + ceph osd dump # for debugging purposes + ceph pg dump # for debugging purposes + done + $ok || return 1 + # + # Writing keeps working because the pool is min_size 1 by + # default. + # + add_something $dir SOMETHING_ELSE || return 1 + + # + # Sanity check that the ssd device class indeed + # generated a shadow bucket with a name including ~ssd.
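+ # For example (illustrative; the exact name depends on the root bucket), the + # output of ceph osd crush dump would contain a shadow bucket name such as + # "default~ssd".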
+ # + ceph osd crush dump | grep -q '~ssd' || return 1 +} + +function TEST_set_device_class() { + local dir=$1 + + TEST_classes $dir || return 1 + + ceph osd crush set-device-class ssd osd.0 || return 1 + ceph osd crush class ls-osd ssd | grep 0 || return 1 + ceph osd crush set-device-class ssd osd.1 || return 1 + ceph osd crush class ls-osd ssd | grep 1 || return 1 + ceph osd crush set-device-class ssd 0 1 || return 1 # should be idempotent + + ok=false + for delay in 2 4 8 16 32 64 128 256 ; do + if test "$(get_osds_up rbd SOMETHING_ELSE)" == "0 1" ; then + ok=true + break + fi + sleep $delay + ceph osd crush dump + ceph osd dump # for debugging purposes + ceph pg dump # for debugging purposes + done + $ok || return 1 +} + +function TEST_mon_classes() { + local dir=$1 + + run_mon $dir a || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + run_osd $dir 2 || return 1 + create_rbd_pool || return 1 + + test "$(get_osds_up rbd SOMETHING)" == "1 2 0" || return 1 + add_something $dir SOMETHING || return 1 + + # test rm-device-class + ceph osd crush set-device-class aaa osd.0 || return 1 + ceph osd tree | grep -q 'aaa' || return 1 + ceph osd crush dump | grep -q '~aaa' || return 1 + ceph osd crush tree --show-shadow | grep -q '~aaa' || return 1 + ceph osd crush set-device-class bbb osd.1 || return 1 + ceph osd tree | grep -q 'bbb' || return 1 + ceph osd crush dump | grep -q '~bbb' || return 1 + ceph osd crush tree --show-shadow | grep -q '~bbb' || return 1 + ceph osd crush set-device-class ccc osd.2 || return 1 + ceph osd tree | grep -q 'ccc' || return 1 + ceph osd crush dump | grep -q '~ccc' || return 1 + ceph osd crush tree --show-shadow | grep -q '~ccc' || return 1 + ceph osd crush rm-device-class 0 || return 1 + ceph osd tree | grep -q 'aaa' && return 1 + ceph osd crush dump | grep -q '~aaa' && return 1 + ceph osd crush tree --show-shadow | grep -q '~aaa' && return 1 + ceph osd crush class ls | grep -q 'aaa' && return 1 + ceph osd crush rm-device-class 1 || return 1 + ceph osd tree | grep -q 'bbb' && return 1 + ceph osd crush dump | grep -q '~bbb' && return 1 + ceph osd crush tree --show-shadow | grep -q '~bbb' && return 1 + ceph osd crush class ls | grep -q 'bbb' && return 1 + ceph osd crush rm-device-class 2 || return 1 + ceph osd tree | grep -q 'ccc' && return 1 + ceph osd crush dump | grep -q '~ccc' && return 1 + ceph osd crush tree --show-shadow | grep -q '~ccc' && return 1 + ceph osd crush class ls | grep -q 'ccc' && return 1 + ceph osd crush set-device-class asdf all || return 1 + ceph osd tree | grep -q 'asdf' || return 1 + ceph osd crush dump | grep -q '~asdf' || return 1 + ceph osd crush tree --show-shadow | grep -q '~asdf' || return 1 + ceph osd crush rm-device-class all || return 1 + ceph osd tree | grep -q 'asdf' && return 1 + ceph osd crush dump | grep -q '~asdf' && return 1 + ceph osd crush tree --show-shadow | grep -q '~asdf' && return 1 + + # test 'class rm' automatically recycles shadow trees + ceph osd crush set-device-class asdf 0 1 2 || return 1 + ceph osd tree | grep -q 'asdf' || return 1 + ceph osd crush dump | grep -q '~asdf' || return 1 + ceph osd crush tree --show-shadow | grep -q '~asdf' || return 1 + ceph osd crush class ls | grep -q 'asdf' || return 1 + ceph osd crush class rm asdf || return 1 + ceph osd tree | grep -q 'asdf' && return 1 + ceph osd crush dump | grep -q '~asdf' && return 1 + ceph osd crush tree --show-shadow | grep -q '~asdf' && return 1 + ceph osd crush class ls | grep -q 'asdf' && return 1 + + ceph osd crush set-device-class 
abc osd.2 || return 1 + ceph osd crush move osd.2 root=foo rack=foo-rack host=foo-host || return 1 + out=`ceph osd tree |awk '$1 == 2 && $2 == "abc" {print $0}'` + if [ "$out" == "" ]; then + return 1 + fi + + # verify 'crush move' too + ceph osd crush dump | grep -q 'foo~abc' || return 1 + ceph osd crush tree --show-shadow | grep -q 'foo~abc' || return 1 + ceph osd crush dump | grep -q 'foo-rack~abc' || return 1 + ceph osd crush tree --show-shadow | grep -q 'foo-rack~abc' || return 1 + ceph osd crush dump | grep -q 'foo-host~abc' || return 1 + ceph osd crush tree --show-shadow | grep -q 'foo-host~abc' || return 1 + ceph osd crush rm-device-class osd.2 || return 1 + ceph osd crush dump | grep -q 'foo~abc' && return 1 + ceph osd crush tree --show-shadow | grep -q 'foo~abc' && return 1 + ceph osd crush dump | grep -q 'foo-rack~abc' && return 1 + ceph osd crush tree --show-shadow | grep -q 'foo-rack~abc' && return 1 + ceph osd crush dump | grep -q 'foo-host~abc' && return 1 + ceph osd crush tree --show-shadow | grep -q 'foo-host~abc' && return 1 + # restore class, so we can continue to test create-replicated + ceph osd crush set-device-class abc osd.2 || return 1 + + ceph osd crush rule create-replicated foo-rule foo host abc || return 1 + + # test class_is_in_use + ceph osd crush set-device-class hdd osd.0 || return 1 + ceph osd crush set-device-class ssd osd.1 || return 1 + ceph osd crush rule create-replicated foo-hdd1 default host hdd || return 1 + ceph osd crush rule create-replicated foo-hdd2 default host hdd || return 1 + ceph osd crush rule create-replicated foo-ssd default host ssd || return 1 + expect_failure $dir EBUSY ceph osd crush class rm hdd || return 1 + expect_failure $dir EBUSY ceph osd crush class rm ssd || return 1 + ceph osd crush rule rm foo-hdd1 || return 1 + expect_failure $dir EBUSY ceph osd crush class rm hdd || return 1 # still referenced by foo-hdd2 + ceph osd crush rule rm foo-hdd2 || return 1 + ceph osd crush rule rm foo-ssd || return 1 + ceph osd crush class rm hdd || return 1 + ceph osd crush class rm ssd || return 1 + expect_failure $dir EBUSY ceph osd crush class rm abc || return 1 # still referenced by foo-rule + ceph osd crush rule rm foo-rule || return 1 + ceph osd crush class rm abc || return 1 + + # test set-device-class implicitly change class + ceph osd crush set-device-class hdd osd.0 || return 1 + expect_failure $dir EBUSY ceph osd crush set-device-class nvme osd.0 || return 1 +} + +main crush-classes "$@" + +# Local Variables: +# compile-command: "cd ../../../build ; ln -sf ../src/ceph-disk/ceph_disk/main.py bin/ceph-disk && make -j4 && ../src/test/crush/crush-classes.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/erasure-code/test-erasure-code-plugins.sh ceph-12.1.2/qa/standalone/erasure-code/test-erasure-code-plugins.sh --- ceph-12.1.1/qa/standalone/erasure-code/test-erasure-code-plugins.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/erasure-code/test-erasure-code-plugins.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,117 @@ +#!/bin/bash -x + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +arch=$(uname -m) + +case $arch in + i[[3456]]86*|x86_64*|amd64*) + legacy_jerasure_plugins=(jerasure_generic jerasure_sse3 jerasure_sse4) + legacy_shec_plugins=(shec_generic shec_sse3 shec_sse4) + plugins=(jerasure shec lrc isa) + ;; + aarch64*|arm*) + legacy_jerasure_plugins=(jerasure_generic jerasure_neon) + legacy_shec_plugins=(shec_generic shec_neon) + plugins=(jerasure shec lrc) + ;; + *) + echo "unsupported platform ${arch}." 
+ return 1 + ;; +esac + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:17110" # git grep '\<17110\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + $func $dir || return 1 + done +} + +function TEST_preload_warning() { + local dir=$1 + + for plugin in ${legacy_jerasure_plugins[*]} ${legacy_shec_plugins[*]}; do + setup $dir || return 1 + run_mon $dir a --osd_erasure_code_plugins="${plugin}" || return 1 + run_mgr $dir x || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1 + run_osd $dir 0 --osd_erasure_code_plugins="${plugin}" || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1 + grep "WARNING: osd_erasure_code_plugins contains plugin ${plugin}" $dir/mon.a.log || return 1 + grep "WARNING: osd_erasure_code_plugins contains plugin ${plugin}" $dir/osd.0.log || return 1 + teardown $dir || return 1 + done + return 0 +} + +function TEST_preload_no_warning() { + local dir=$1 + + for plugin in ${plugins[*]}; do + setup $dir || return 1 + run_mon $dir a --osd_erasure_code_plugins="${plugin}" || return 1 + run_mgr $dir x || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1 + run_osd $dir 0 --osd_erasure_code_plugins="${plugin}" || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1 + ! grep "WARNING: osd_erasure_code_plugins contains plugin" $dir/mon.a.log || return 1 + ! grep "WARNING: osd_erasure_code_plugins contains plugin" $dir/osd.0.log || return 1 + teardown $dir || return 1 + done + + return 0 +} + +function TEST_preload_no_warning_default() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1 + ! grep "WARNING: osd_erasure_code_plugins" $dir/mon.a.log || return 1 + ! 
grep "WARNING: osd_erasure_code_plugins" $dir/osd.0.log || return 1 + teardown $dir || return 1 + + return 0 +} + +function TEST_ec_profile_warning() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + for id in $(seq 0 2) ; do + run_osd $dir $id || return 1 + done + create_rbd_pool || return 1 + wait_for_clean || return 1 + + for plugin in ${legacy_jerasure_plugins[*]}; do + ceph osd erasure-code-profile set prof-${plugin} crush-failure-domain=osd technique=reed_sol_van plugin=${plugin} || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1 + grep "WARNING: erasure coding profile prof-${plugin} uses plugin ${plugin}" $dir/mon.a.log || return 1 + done + + for plugin in ${legacy_shec_plugins[*]}; do + ceph osd erasure-code-profile set prof-${plugin} crush-failure-domain=osd plugin=${plugin} || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1 + grep "WARNING: erasure coding profile prof-${plugin} uses plugin ${plugin}" $dir/mon.a.log || return 1 + done + + teardown $dir || return 1 +} + +main test-erasure-code-plugins "$@" diff -Nru ceph-12.1.1/qa/standalone/erasure-code/test-erasure-code.sh ceph-12.1.2/qa/standalone/erasure-code/test-erasure-code.sh --- ceph-12.1.1/qa/standalone/erasure-code/test-erasure-code.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/erasure-code/test-erasure-code.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,339 @@ +#!/bin/bash +# +# Copyright (C) 2014 Cloudwatt +# Copyright (C) 2014, 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
+# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7101" # git grep '\<7101\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON --mon-osd-prime-pg-temp=false" + + setup $dir || return 1 + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + # check that erasure code plugins are preloaded + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1 + grep 'load: jerasure.*lrc' $dir/mon.a.log || return 1 + for id in $(seq 0 10) ; do + run_osd $dir $id || return 1 + done + create_rbd_pool || return 1 + wait_for_clean || return 1 + # check that erasure code plugins are preloaded + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1 + grep 'load: jerasure.*lrc' $dir/osd.0.log || return 1 + create_erasure_coded_pool ecpool || return 1 + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + $func $dir || return 1 + done + + delete_pool ecpool || return 1 + teardown $dir || return 1 +} + +function create_erasure_coded_pool() { + local poolname=$1 + + ceph osd erasure-code-profile set myprofile \ + crush-failure-domain=osd || return 1 + ceph osd pool create $poolname 12 12 erasure myprofile \ + || return 1 + wait_for_clean || return 1 +} + +function delete_pool() { + local poolname=$1 + + ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it +} + +function rados_put_get() { + local dir=$1 + local poolname=$2 + local objname=${3:-SOMETHING} + + + for marker in AAA BBB CCCC DDDD ; do + printf "%*s" 1024 $marker + done > $dir/ORIGINAL + + # + # get and put an object, compare they are equal + # + rados --pool $poolname put $objname $dir/ORIGINAL || return 1 + rados --pool $poolname get $objname $dir/COPY || return 1 + diff $dir/ORIGINAL $dir/COPY || return 1 + rm $dir/COPY + + # + # take out an OSD used to store the object and + # check the object can still be retrieved, which implies + # recovery + # + local -a initial_osds=($(get_osds $poolname $objname)) + local last=$((${#initial_osds[@]} - 1)) + ceph osd out ${initial_osds[$last]} || return 1 + ! get_osds $poolname $objname | grep '\<'${initial_osds[$last]}'\>' || return 1 + rados --pool $poolname get $objname $dir/COPY || return 1 + diff $dir/ORIGINAL $dir/COPY || return 1 + ceph osd in ${initial_osds[$last]} || return 1 + + rm $dir/ORIGINAL +} + +function rados_osds_out_in() { + local dir=$1 + local poolname=$2 + local objname=${3:-SOMETHING} + + + for marker in FFFF GGGG HHHH IIII ; do + printf "%*s" 1024 $marker + done > $dir/ORIGINAL + + # + # get and put an object, compare they are equal + # + rados --pool $poolname put $objname $dir/ORIGINAL || return 1 + rados --pool $poolname get $objname $dir/COPY || return 1 + diff $dir/ORIGINAL $dir/COPY || return 1 + rm $dir/COPY + + # + # take out two OSDs used to store the object, wait for the cluster + # to be clean (i.e. all PG are clean and active) again which + # implies the PG have been moved to use the remaining OSDs. Check + # the object can still be retrieved. + # + wait_for_clean || return 1 + local osds_list=$(get_osds $poolname $objname) + local -a osds=($osds_list) + for osd in 0 1 ; do + ceph osd out ${osds[$osd]} || return 1 + done + wait_for_clean || return 1 + # + # verify the object is no longer mapped to the osds that are out + # + for osd in 0 1 ; do + ! 
get_osds $poolname $objname | grep '\<'${osds[$osd]}'\>' || return 1 + done + rados --pool $poolname get $objname $dir/COPY || return 1 + diff $dir/ORIGINAL $dir/COPY || return 1 + # + # bring the osds back in, , wait for the cluster + # to be clean (i.e. all PG are clean and active) again which + # implies the PG go back to using the same osds as before + # + for osd in 0 1 ; do + ceph osd in ${osds[$osd]} || return 1 + done + wait_for_clean || return 1 + test "$osds_list" = "$(get_osds $poolname $objname)" || return 1 + rm $dir/ORIGINAL +} + +function TEST_rados_put_get_lrc_advanced() { + local dir=$1 + local poolname=pool-lrc-a + local profile=profile-lrc-a + + ceph osd erasure-code-profile set $profile \ + plugin=lrc \ + mapping=DD_ \ + crush-steps='[ [ "chooseleaf", "osd", 0 ] ]' \ + layers='[ [ "DDc", "" ] ]' || return 1 + ceph osd pool create $poolname 12 12 erasure $profile \ + || return 1 + + rados_put_get $dir $poolname || return 1 + + delete_pool $poolname + ceph osd erasure-code-profile rm $profile +} + +function TEST_rados_put_get_lrc_kml() { + local dir=$1 + local poolname=pool-lrc + local profile=profile-lrc + + ceph osd erasure-code-profile set $profile \ + plugin=lrc \ + k=4 m=2 l=3 \ + crush-failure-domain=osd || return 1 + ceph osd pool create $poolname 12 12 erasure $profile \ + || return 1 + + rados_put_get $dir $poolname || return 1 + + delete_pool $poolname + ceph osd erasure-code-profile rm $profile +} + +function TEST_rados_put_get_isa() { + if ! erasure_code_plugin_exists isa ; then + echo "SKIP because plugin isa has not been built" + return 0 + fi + local dir=$1 + local poolname=pool-isa + + ceph osd erasure-code-profile set profile-isa \ + plugin=isa \ + crush-failure-domain=osd || return 1 + ceph osd pool create $poolname 1 1 erasure profile-isa \ + || return 1 + + rados_put_get $dir $poolname || return 1 + + delete_pool $poolname +} + +function TEST_rados_put_get_jerasure() { + local dir=$1 + + rados_put_get $dir ecpool || return 1 + + local poolname=pool-jerasure + local profile=profile-jerasure + + ceph osd erasure-code-profile set $profile \ + plugin=jerasure \ + k=4 m=2 \ + crush-failure-domain=osd || return 1 + ceph osd pool create $poolname 12 12 erasure $profile \ + || return 1 + + rados_put_get $dir $poolname || return 1 + rados_osds_out_in $dir $poolname || return 1 + + delete_pool $poolname + ceph osd erasure-code-profile rm $profile +} + +function TEST_rados_put_get_shec() { + local dir=$1 + + local poolname=pool-shec + local profile=profile-shec + + ceph osd erasure-code-profile set $profile \ + plugin=shec \ + k=2 m=1 c=1 \ + crush-failure-domain=osd || return 1 + ceph osd pool create $poolname 12 12 erasure $profile \ + || return 1 + + rados_put_get $dir $poolname || return 1 + + delete_pool $poolname + ceph osd erasure-code-profile rm $profile +} + +function TEST_alignment_constraints() { + local payload=ABC + echo "$payload" > $dir/ORIGINAL + # + # Verify that the rados command enforces alignment constraints + # imposed by the stripe width + # See http://tracker.ceph.com/issues/8622 + # + local stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit) + eval local $(ceph osd erasure-code-profile get myprofile | grep k=) + local block_size=$((stripe_unit * k - 1)) + dd if=/dev/zero of=$dir/ORIGINAL bs=$block_size count=2 + rados --block-size=$block_size \ + --pool ecpool put UNALIGNED $dir/ORIGINAL || return 1 + rm $dir/ORIGINAL +} + +function chunk_size() { + echo $(ceph-conf --show-config-value 
osd_pool_erasure_code_stripe_unit) +} + +# +# By default an object will be split in two (k=2) with the first part +# of the object in the first OSD of the up set and the second part in +# the next OSD in the up set. This layout is defined by the mapping +# parameter and this function helps verify that the first and second +# part of the object are located in the OSD where they should be. +# +function verify_chunk_mapping() { + local dir=$1 + local poolname=$2 + local first=$3 + local second=$4 + + local payload=$(printf '%*s' $(chunk_size) FIRST$poolname ; printf '%*s' $(chunk_size) SECOND$poolname) + echo -n "$payload" > $dir/ORIGINAL + + rados --pool $poolname put SOMETHING$poolname $dir/ORIGINAL || return 1 + rados --pool $poolname get SOMETHING$poolname $dir/COPY || return 1 + local -a osds=($(get_osds $poolname SOMETHING$poolname)) + for (( i = 0; i < ${#osds[@]}; i++ )) ; do + ceph daemon osd.${osds[$i]} flush_journal + done + diff $dir/ORIGINAL $dir/COPY || return 1 + rm $dir/COPY + + local -a osds=($(get_osds $poolname SOMETHING$poolname)) + grep --quiet --recursive --text FIRST$poolname $dir/${osds[$first]} || return 1 + grep --quiet --recursive --text SECOND$poolname $dir/${osds[$second]} || return 1 +} + +function TEST_chunk_mapping() { + local dir=$1 + + # + # mapping=DD_ is the default: + # first OSD (i.e. 0) in the up set has the first part of the object + # second OSD (i.e. 1) in the up set has the second part of the object + # + verify_chunk_mapping $dir ecpool 0 1 || return 1 + + ceph osd erasure-code-profile set remap-profile \ + plugin=lrc \ + layers='[ [ "_DD", "" ] ]' \ + mapping='_DD' \ + crush-steps='[ [ "choose", "osd", 0 ] ]' || return 1 + ceph osd erasure-code-profile get remap-profile + ceph osd pool create remap-pool 12 12 erasure remap-profile \ + || return 1 + + # + # mapping=_DD + # second OSD (i.e. 1) in the up set has the first part of the object + # third OSD (i.e. 2) in the up set has the second part of the object + # + verify_chunk_mapping $dir remap-pool 1 2 || return 1 + + delete_pool remap-pool + ceph osd erasure-code-profile rm remap-profile +} + +main test-erasure-code "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/erasure-code/test-erasure-code.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/erasure-code/test-erasure-eio.sh ceph-12.1.2/qa/standalone/erasure-code/test-erasure-eio.sh --- ceph-12.1.1/qa/standalone/erasure-code/test-erasure-eio.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/erasure-code/test-erasure-eio.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,339 @@ +#!/bin/bash +# +# Copyright (C) 2015 Red Hat +# +# +# Author: Kefu Chai +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
+# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7112" # git grep '\<7112\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + create_rbd_pool || return 1 + + # check that erasure code plugins are preloaded + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1 + grep 'load: jerasure.*lrc' $dir/mon.a.log || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function setup_osds() { + for id in $(seq 0 3) ; do + run_osd $dir $id || return 1 + done + wait_for_clean || return 1 + + # check that erasure code plugins are preloaded + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1 + grep 'load: jerasure.*lrc' $dir/osd.0.log || return 1 +} + +function create_erasure_coded_pool() { + local poolname=$1 + + ceph osd erasure-code-profile set myprofile \ + plugin=jerasure \ + k=2 m=1 \ + crush-failure-domain=osd || return 1 + ceph osd pool create $poolname 1 1 erasure myprofile \ + || return 1 + wait_for_clean || return 1 +} + +function delete_pool() { + local poolname=$1 + + ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it + ceph osd erasure-code-profile rm myprofile +} + +function rados_put() { + local dir=$1 + local poolname=$2 + local objname=${3:-SOMETHING} + + for marker in AAA BBB CCCC DDDD ; do + printf "%*s" 1024 $marker + done > $dir/ORIGINAL + # + # get and put an object, compare they are equal + # + rados --pool $poolname put $objname $dir/ORIGINAL || return 1 +} + +function rados_get() { + local dir=$1 + local poolname=$2 + local objname=${3:-SOMETHING} + local expect=${4:-ok} + + # + # Expect a failure to get object + # + if [ $expect = "fail" ]; + then + ! rados --pool $poolname get $objname $dir/COPY + return + fi + # + # get an object, compare with $dir/ORIGINAL + # + rados --pool $poolname get $objname $dir/COPY || return 1 + diff $dir/ORIGINAL $dir/COPY || return 1 + rm $dir/COPY +} + +function rados_put_get() { + local dir=$1 + local poolname=$2 + local objname=${3:-SOMETHING} + local recovery=$4 + + # + # get and put an object, compare they are equal + # + rados_put $dir $poolname $objname || return 1 + # We can read even though caller injected read error on one of the shards + rados_get $dir $poolname $objname || return 1 + + if [ -n "$recovery" ]; + then + # + # take out the last OSD used to store the object, + # bring it back, and check for clean PGs which means + # recovery didn't crash the primary. + # + local -a initial_osds=($(get_osds $poolname $objname)) + local last=$((${#initial_osds[@]} - 1)) + # Kill OSD + kill_daemons $dir TERM osd.${initial_osds[$last]} >&2 < /dev/null || return 1 + ceph osd out ${initial_osds[$last]} || return 1 + ! 
get_osds $poolname $objname | grep '\<'${initial_osds[$last]}'\>' || return 1 + ceph osd in ${initial_osds[$last]} || return 1 + run_osd $dir ${initial_osds[$last]} || return 1 + wait_for_clean || return 1 + fi + + rm $dir/ORIGINAL +} + +function inject_eio() { + local objname=$1 + shift + local dir=$1 + shift + local shard_id=$1 + shift + + local poolname=pool-jerasure + local -a initial_osds=($(get_osds $poolname $objname)) + local osd_id=${initial_osds[$shard_id]} + set_config osd $osd_id filestore_debug_inject_read_err true || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.$osd_id) \ + injectdataerr $poolname $objname $shard_id || return 1 +} + +function rados_get_data_eio() { + local dir=$1 + shift + local shard_id=$1 + shift + local recovery=$1 + shift + + # inject eio to the specified shard + # + local poolname=pool-jerasure + local objname=obj-eio-$$-$shard_id + inject_eio $objname $dir $shard_id || return 1 + rados_put_get $dir $poolname $objname $recovery || return 1 + + shard_id=$(expr $shard_id + 1) + inject_eio $objname $dir $shard_id || return 1 + # Now 2 out of 3 shards get EIO, so should fail + rados_get $dir $poolname $objname fail || return 1 +} + +# Change the size of the specified shard +# +function set_size() { + local objname=$1 + shift + local dir=$1 + shift + local shard_id=$1 + shift + local bytes=$1 + shift + local mode=${1} + + local poolname=pool-jerasure + local -a initial_osds=($(get_osds $poolname $objname)) + local osd_id=${initial_osds[$shard_id]} + ceph osd set noout + if [ "$mode" = "add" ]; + then + objectstore_tool $dir $osd_id $objname get-bytes $dir/CORRUPT || return 1 + dd if=/dev/urandom bs=$bytes count=1 >> $dir/CORRUPT + elif [ "$bytes" = "0" ]; + then + touch $dir/CORRUPT + else + dd if=/dev/urandom bs=$bytes count=1 of=$dir/CORRUPT + fi + objectstore_tool $dir $osd_id $objname set-bytes $dir/CORRUPT || return 1 + rm -f $dir/CORRUPT + ceph osd unset noout +} + +function rados_get_data_bad_size() { + local dir=$1 + shift + local shard_id=$1 + shift + local bytes=$1 + shift + local mode=${1:-set} + + local poolname=pool-jerasure + local objname=obj-size-$$-$shard_id-$bytes + rados_put $dir $poolname $objname || return 1 + + # Change the size of the specified shard + # + set_size $objname $dir $shard_id $bytes $mode || return 1 + + rados_get $dir $poolname $objname || return 1 + + # Leave objname and modify another shard + shard_id=$(expr $shard_id + 1) + set_size $objname $dir $shard_id $bytes $mode || return 1 + rados_get $dir $poolname $objname fail || return 1 +} + +# +# These two test cases try to validate the following behavior: +# For an object on an EC pool, if one shard has a read error (either on +# the primary or on a replica), the client can still read the object. +# +# If 2 shards have read errors the client will get an error.
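+# (The pool created by create_erasure_coded_pool uses k=2 m=1, so any two of +# the three shards are enough to serve a read: losing one shard is tolerated, +# losing two is not.)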
+# +function TEST_rados_get_subread_eio_shard_0() { + local dir=$1 + setup_osds || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname || return 1 + # inject eio on primary OSD (0) and replica OSD (1) + local shard_id=0 + rados_get_data_eio $dir $shard_id || return 1 + delete_pool $poolname +} + +function TEST_rados_get_subread_eio_shard_1() { + local dir=$1 + setup_osds || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname || return 1 + # inject eio into replicas OSD (1) and OSD (2) + local shard_id=1 + rados_get_data_eio $dir $shard_id || return 1 + delete_pool $poolname +} + +# +# These two test cases try to validate the following behavior: +# For an object on an EC pool, if one shard has an incorrect size, +# which causes an internal read error, the client can still read the object. +# +# If 2 shards have an incorrect size the client will get an error. +# +function TEST_rados_get_bad_size_shard_0() { + local dir=$1 + setup_osds || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname || return 1 + # Set incorrect size into primary OSD (0) and replica OSD (1) + local shard_id=0 + rados_get_data_bad_size $dir $shard_id 10 || return 1 + rados_get_data_bad_size $dir $shard_id 0 || return 1 + rados_get_data_bad_size $dir $shard_id 256 add || return 1 + delete_pool $poolname +} + +function TEST_rados_get_bad_size_shard_1() { + local dir=$1 + setup_osds || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname || return 1 + # Set incorrect size into replicas OSD (1) and OSD (2) + local shard_id=1 + rados_get_data_bad_size $dir $shard_id 10 || return 1 + rados_get_data_bad_size $dir $shard_id 0 || return 1 + rados_get_data_bad_size $dir $shard_id 256 add || return 1 + delete_pool $poolname +} + +function TEST_rados_get_with_subreadall_eio_shard_0() { + local dir=$1 + local shard_id=0 + + setup_osds || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname || return 1 + # inject eio on primary OSD (0) + local shard_id=0 + rados_get_data_eio $dir $shard_id recovery || return 1 + + delete_pool $poolname +} + +function TEST_rados_get_with_subreadall_eio_shard_1() { + local dir=$1 + local shard_id=0 + + setup_osds || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname || return 1 + # inject eio on replica OSD (1) + local shard_id=1 + rados_get_data_eio $dir $shard_id recovery || return 1 + + delete_pool $poolname +} + +main test-erasure-eio "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/erasure-code/test-erasure-eio.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/misc/rados-striper.sh ceph-12.1.2/qa/standalone/misc/rados-striper.sh --- ceph-12.1.1/qa/standalone/misc/rados-striper.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/misc/rados-striper.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,101 @@ +#!/bin/bash +# +# Copyright (C) 2014 Red Hat +# +# Author: Sebastien Ponce +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details.
+# +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7116" # git grep '\<7116\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + # setup + setup $dir || return 1 + + # create a cluster with one monitor and three osds + run_mon $dir a || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + run_osd $dir 2 || return 1 + create_rbd_pool || return 1 + + # create toyfile + dd if=/dev/urandom of=$dir/toyfile bs=1234 count=1 + + # put a striped object + rados --pool rbd --striper put toyfile $dir/toyfile || return 1 + + # stat it, with and without striping + rados --pool rbd --striper stat toyfile | cut -d ',' -f 2 > $dir/stripedStat || return 1 + rados --pool rbd stat toyfile.0000000000000000 | cut -d ',' -f 2 > $dir/stat || return 1 + echo ' size 1234' > $dir/refstat + diff -w $dir/stripedStat $dir/refstat || return 1 + diff -w $dir/stat $dir/refstat || return 1 + rados --pool rbd stat toyfile >& $dir/staterror + grep -q 'No such file or directory' $dir/staterror || return 1 + + # get the file back with and without striping + rados --pool rbd --striper get toyfile $dir/stripedGroup || return 1 + diff -w $dir/toyfile $dir/stripedGroup || return 1 + rados --pool rbd get toyfile.0000000000000000 $dir/nonSTripedGroup || return 1 + diff -w $dir/toyfile $dir/nonSTripedGroup || return 1 + + # test truncate + rados --pool rbd --striper truncate toyfile 12 + rados --pool rbd --striper stat toyfile | cut -d ',' -f 2 > $dir/stripedStat || return 1 + rados --pool rbd stat toyfile.0000000000000000 | cut -d ',' -f 2 > $dir/stat || return 1 + echo ' size 12' > $dir/reftrunc + diff -w $dir/stripedStat $dir/reftrunc || return 1 + diff -w $dir/stat $dir/reftrunc || return 1 + + # test xattrs + + rados --pool rbd --striper setxattr toyfile somexattr somevalue || return 1 + rados --pool rbd --striper getxattr toyfile somexattr > $dir/xattrvalue || return 1 + rados --pool rbd getxattr toyfile.0000000000000000 somexattr > $dir/xattrvalue2 || return 1 + echo 'somevalue' > $dir/refvalue + diff -w $dir/xattrvalue $dir/refvalue || return 1 + diff -w $dir/xattrvalue2 $dir/refvalue || return 1 + rados --pool rbd --striper listxattr toyfile > $dir/xattrlist || return 1 + echo 'somexattr' > $dir/reflist + diff -w $dir/xattrlist $dir/reflist || return 1 + rados --pool rbd listxattr toyfile.0000000000000000 | grep -v striper > $dir/xattrlist2 || return 1 + diff -w $dir/xattrlist2 $dir/reflist || return 1 + rados --pool rbd --striper rmxattr toyfile somexattr || return 1 + + local attr_not_found_str="No data available" + [ `uname` = FreeBSD ] && \ + attr_not_found_str="Attribute not found" + expect_failure $dir "$attr_not_found_str" \ + rados --pool rbd --striper getxattr toyfile somexattr || return 1 + expect_failure $dir "$attr_not_found_str" \ + rados --pool rbd getxattr toyfile.0000000000000000 somexattr || return 1 + + # test rm + rados --pool rbd --striper rm toyfile || return 1 + expect_failure $dir 'No such file or directory' \ + rados --pool rbd --striper stat toyfile || return 1 + expect_failure $dir 'No such file or directory' \ + rados --pool rbd stat toyfile.0000000000000000 || return 1 + + # cleanup + teardown $dir || return 1 +} + +main rados-striper "$@" diff -Nru ceph-12.1.1/qa/standalone/misc/test-ceph-helpers.sh ceph-12.1.2/qa/standalone/misc/test-ceph-helpers.sh --- ceph-12.1.1/qa/standalone/misc/test-ceph-helpers.sh 1970-01-01 
00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/misc/test-ceph-helpers.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,21 @@ +#!/bin/bash +# +# Copyright (C) 2013,2014 Cloudwatt +# Copyright (C) 2014 Red Hat +# Copyright (C) 2014 Federico Gimenez +# +# Author: Loic Dachary +# Author: Federico Gimenez +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +$CEPH_ROOT/qa/standalone/ceph-helpers.sh TESTS diff -Nru ceph-12.1.1/qa/standalone/mon/misc.sh ceph-12.1.2/qa/standalone/mon/misc.sh --- ceph-12.1.1/qa/standalone/mon/misc.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/mon/misc.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,238 @@ +#!/bin/bash +# +# Copyright (C) 2014 Cloudwatt +# Copyright (C) 2014, 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7102" # git grep '\<7102\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + $func $dir || return 1 + done +} + +TEST_POOL=rbd + +function TEST_osd_pool_get_set() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a || return 1 + create_rbd_pool || return 1 + ceph osd pool create $TEST_POOL 8 + + local flag + for flag in nodelete nopgchange nosizechange write_fadvise_dontneed noscrub nodeep-scrub; do + ceph osd pool set $TEST_POOL $flag 0 || return 1 + ! ceph osd dump | grep 'pool ' | grep $flag || return 1 + ceph osd pool set $TEST_POOL $flag 1 || return 1 + ceph osd dump | grep 'pool ' | grep $flag || return 1 + ceph osd pool set $TEST_POOL $flag false || return 1 + ! ceph osd dump | grep 'pool ' | grep $flag || return 1 + ceph osd pool set $TEST_POOL $flag false || return 1 + # check that setting false twice does not toggle to true (bug) + ! 
ceph osd dump | grep 'pool ' | grep $flag || return 1 + ceph osd pool set $TEST_POOL $flag true || return 1 + ceph osd dump | grep 'pool ' | grep $flag || return 1 + # cleanup + ceph osd pool set $TEST_POOL $flag 0 || return 1 + done + + local size=$(ceph osd pool get $TEST_POOL size|awk '{print $2}') + local min_size=$(ceph osd pool get $TEST_POOL min_size|awk '{print $2}') + + ceph osd pool set $TEST_POOL scrub_min_interval 123456 || return 1 + ceph osd dump | grep 'pool ' | grep 'scrub_min_interval 123456' || return 1 + ceph osd pool set $TEST_POOL scrub_min_interval 0 || return 1 + ceph osd dump | grep 'pool ' | grep 'scrub_min_interval' && return 1 + ceph osd pool set $TEST_POOL scrub_max_interval 123456 || return 1 + ceph osd dump | grep 'pool ' | grep 'scrub_max_interval 123456' || return 1 + ceph osd pool set $TEST_POOL scrub_max_interval 0 || return 1 + ceph osd dump | grep 'pool ' | grep 'scrub_max_interval' && return 1 + ceph osd pool set $TEST_POOL deep_scrub_interval 123456 || return 1 + ceph osd dump | grep 'pool ' | grep 'deep_scrub_interval 123456' || return 1 + ceph osd pool set $TEST_POOL deep_scrub_interval 0 || return 1 + ceph osd dump | grep 'pool ' | grep 'deep_scrub_interval' && return 1 + + # replicated pool size is restricted to the range 1..10 + ! ceph osd pool set $TEST_POOL 11 || return 1 + # replicated pool min_size must be between 1 and size + ! ceph osd pool set $TEST_POOL min_size $(expr $size + 1) || return 1 + ! ceph osd pool set $TEST_POOL min_size 0 || return 1 + + local ecpool=erasepool + ceph osd pool create $ecpool 12 12 erasure default || return 1 + # erasure pool size=k+m, min_size=k + local size=$(ceph osd pool get $ecpool size|awk '{print $2}') + local min_size=$(ceph osd pool get $ecpool min_size|awk '{print $2}') + local k=$(expr $min_size - 1) # default min_size=k+1 + # erasure pool size can't change + ! ceph osd pool set $ecpool size $(expr $size + 1) || return 1 + # erasure pool min_size must be between k and size + ceph osd pool set $ecpool min_size $(expr $k + 1) || return 1 + ! ceph osd pool set $ecpool min_size $(expr $k - 1) || return 1 + !
ceph osd pool set $ecpool min_size $(expr $size + 1) || return 1 + + teardown $dir || return 1 +} + +function TEST_mon_add_to_single_mon() { + local dir=$1 + + fsid=$(uuidgen) + MONA=127.0.0.1:7117 # git grep '\<7117\>' : there must be only one + MONB=127.0.0.1:7118 # git grep '\<7118\>' : there must be only one + CEPH_ARGS_orig=$CEPH_ARGS + CEPH_ARGS="--fsid=$fsid --auth-supported=none " + CEPH_ARGS+="--mon-initial-members=a " + CEPH_ARGS+="--mon-host=$MONA " + + setup $dir || return 1 + run_mon $dir a --public-addr $MONA || return 1 + # wait for the quorum + timeout 120 ceph -s > /dev/null || return 1 + run_mon $dir b --public-addr $MONB || return 1 + teardown $dir || return 1 + + setup $dir || return 1 + run_mon $dir a --public-addr $MONA || return 1 + # without the fix of #5454, mon.a will assert failure at seeing the MMonJoin + # from mon.b + run_mon $dir b --public-addr $MONB || return 1 + # wait for the quorum + timeout 120 ceph -s > /dev/null || return 1 + local num_mons + num_mons=$(ceph mon dump --format=json 2>/dev/null | jq ".mons | length") || return 1 + [ $num_mons == 2 ] || return 1 + # no reason to take more than 120 secs to get this submitted + timeout 120 ceph mon add b $MONB || return 1 + teardown $dir || return 1 +} + +function TEST_no_segfault_for_bad_keyring() { + local dir=$1 + setup $dir || return 1 + # create a client.admin key and add it to ceph.mon.keyring + ceph-authtool --create-keyring $dir/ceph.mon.keyring --gen-key -n mon. --cap mon 'allow *' + ceph-authtool --create-keyring $dir/ceph.client.admin.keyring --gen-key -n client.admin --cap mon 'allow *' + ceph-authtool $dir/ceph.mon.keyring --import-keyring $dir/ceph.client.admin.keyring + CEPH_ARGS_TMP="--fsid=$(uuidgen) --mon-host=127.0.0.1:7102 --auth-supported=cephx " + CEPH_ARGS_orig=$CEPH_ARGS + CEPH_ARGS="$CEPH_ARGS_TMP --keyring=$dir/ceph.mon.keyring " + run_mon $dir a + # create a bad keyring and make sure no segfault occurs when using the bad keyring + echo -e "[client.admin]\nkey = BQAUlgtWoFePIxAAQ9YLzJSVgJX5V1lh5gyctg==" > $dir/bad.keyring + CEPH_ARGS="$CEPH_ARGS_TMP --keyring=$dir/bad.keyring" + ceph osd dump 2> /dev/null + # 139(11|128) means segfault and core dumped + [ $? -eq 139 ] && return 1 + CEPH_ARGS=$CEPH_ARGS_orig + teardown $dir || return 1 +} + +function TEST_mon_features() { + local dir=$1 + setup $dir || return 1 + + fsid=$(uuidgen) + MONA=127.0.0.1:7127 # git grep '\<7127\>' ; there must be only one + MONB=127.0.0.1:7128 # git grep '\<7128\>' ; there must be only one + MONC=127.0.0.1:7129 # git grep '\<7129\>' ; there must be only one + CEPH_ARGS_orig=$CEPH_ARGS + CEPH_ARGS="--fsid=$fsid --auth-supported=none " + CEPH_ARGS+="--mon-initial-members=a,b,c " + CEPH_ARGS+="--mon-host=$MONA,$MONB,$MONC " + CEPH_ARGS+="--mon-debug-no-initial-persistent-features " + CEPH_ARGS+="--mon-debug-no-require-luminous " + + run_mon $dir a --public-addr $MONA || return 1 + run_mon $dir b --public-addr $MONB || return 1 + timeout 120 ceph -s > /dev/null || return 1 + + # expect monmap to contain 3 monitors (a, b, and c) + jqinput="$(ceph mon_status --format=json 2>/dev/null)" + jq_success "$jqinput" '.monmap.mons | length == 3' || return 1 + # quorum contains two monitors + jq_success "$jqinput" '.quorum | length == 2' || return 1 + # quorum's monitor features contain kraken and luminous + jqfilter='.features.quorum_mon[]|select(. == "kraken")' + jq_success "$jqinput" "$jqfilter" "kraken" || return 1 + jqfilter='.features.quorum_mon[]|select(. 
== "luminous")' + jq_success "$jqinput" "$jqfilter" "luminous" || return 1 + + # monmap must have no persistent features set, because we + # don't currently have a quorum made out of all the monitors + # in the monmap. + jqfilter='.monmap.features.persistent | length == 0' + jq_success "$jqinput" "$jqfilter" || return 1 + + # nor do we have any optional features, for that matter. + jqfilter='.monmap.features.optional | length == 0' + jq_success "$jqinput" "$jqfilter" || return 1 + + # validate 'mon feature ls' + + jqinput="$(ceph mon feature ls --format=json 2>/dev/null)" + # 'kraken' and 'luminous' are supported + jqfilter='.all.supported[] | select(. == "kraken")' + jq_success "$jqinput" "$jqfilter" "kraken" || return 1 + jqfilter='.all.supported[] | select(. == "luminous")' + jq_success "$jqinput" "$jqfilter" "luminous" || return 1 + + # start third monitor + run_mon $dir c --public-addr $MONC || return 1 + + wait_for_quorum 300 3 || return 1 + + timeout 300 ceph -s > /dev/null || return 1 + + jqinput="$(ceph mon_status --format=json 2>/dev/null)" + # expect quorum to have all three monitors + jqfilter='.quorum | length == 3' + jq_success "$jqinput" "$jqfilter" || return 1 + # quorum's monitor features contain kraken and luminous + jqfilter='.features.quorum_mon[]|select(. == "kraken")' + jq_success "$jqinput" "$jqfilter" "kraken" || return 1 + jqfilter='.features.quorum_mon[]|select(. == "luminous")' + jq_success "$jqinput" "$jqfilter" "luminous" || return 1 + + # monmap must have no both 'kraken' and 'luminous' persistent + # features set. + jqfilter='.monmap.features.persistent | length == 2' + jq_success "$jqinput" "$jqfilter" || return 1 + jqfilter='.monmap.features.persistent[]|select(. == "kraken")' + jq_success "$jqinput" "$jqfilter" "kraken" || return 1 + jqfilter='.monmap.features.persistent[]|select(. == "luminous")' + jq_success "$jqinput" "$jqfilter" "luminous" || return 1 + + CEPH_ARGS=$CEPH_ARGS_orig + # that's all folks. thank you for tuning in. + teardown $dir || return 1 +} + +main misc "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/mon/misc.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/mon/mkfs.sh ceph-12.1.2/qa/standalone/mon/mkfs.sh --- ceph-12.1.1/qa/standalone/mon/mkfs.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/mon/mkfs.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,198 @@ +#!/bin/bash +# +# Copyright (C) 2013 Cloudwatt +# Copyright (C) 2014 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
+# +set -xe +PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: ' + + +DIR=mkfs +export CEPH_CONF=/dev/null +unset CEPH_ARGS +MON_ID=a +MON_DIR=$DIR/$MON_ID +CEPH_MON=127.0.0.1:7110 # git grep '\<7110\>' : there must be only one +TIMEOUT=360 + +EXTRAOPTS="" +if [ -n "$CEPH_LIB" ]; then + EXTRAOPTS+=" --erasure-code-dir $CEPH_LIB" + EXTRAOPTS+=" --plugin-dir $CEPH_LIB" + EXTRAOPTS+=" --osd-class-dir $CEPH_LIB" +fi + +function setup() { + teardown + mkdir $DIR +} + +function teardown() { + kill_daemons + rm -fr $DIR +} + +function mon_mkfs() { + local fsid=$(uuidgen) + + ceph-mon \ + --id $MON_ID \ + --fsid $fsid \ + $EXTRAOPTS \ + --mkfs \ + --mon-data=$MON_DIR \ + --mon-initial-members=$MON_ID \ + --mon-host=$CEPH_MON \ + "$@" +} + +function mon_run() { + ceph-mon \ + --id $MON_ID \ + --chdir= \ + --mon-osd-full-ratio=.99 \ + --mon-data-avail-crit=1 \ + $EXTRAOPTS \ + --mon-data=$MON_DIR \ + --log-file=$MON_DIR/log \ + --mon-cluster-log-file=$MON_DIR/log \ + --run-dir=$MON_DIR \ + --pid-file=$MON_DIR/pidfile \ + --public-addr $CEPH_MON \ + "$@" +} + +function kill_daemons() { + for pidfile in $(find $DIR -name pidfile) ; do + pid=$(cat $pidfile) + for try in 0 1 1 1 2 3 ; do + kill $pid || break + sleep $try + done + done +} + +function auth_none() { + mon_mkfs --auth-supported=none + + ceph-mon \ + --id $MON_ID \ + --mon-osd-full-ratio=.99 \ + --mon-data-avail-crit=1 \ + $EXTRAOPTS \ + --mon-data=$MON_DIR \ + --extract-monmap $MON_DIR/monmap + + [ -f $MON_DIR/monmap ] || return 1 + + [ ! -f $MON_DIR/keyring ] || return 1 + + mon_run --auth-supported=none + + timeout $TIMEOUT ceph --mon-host $CEPH_MON mon stat || return 1 +} + +function auth_cephx_keyring() { + cat > $DIR/keyring <&1 | tee $DIR/makedir.log + grep 'toodeep.*No such file' $DIR/makedir.log > /dev/null + rm $DIR/makedir.log + + # an empty directory does not mean the mon exists + mkdir $MON_DIR + mon_mkfs --auth-supported=none 2>&1 | tee $DIR/makedir.log + ! grep "$MON_DIR already exists" $DIR/makedir.log || return 1 +} + +function idempotent() { + mon_mkfs --auth-supported=none + mon_mkfs --auth-supported=none 2>&1 | tee $DIR/makedir.log + grep "'$MON_DIR' already exists" $DIR/makedir.log > /dev/null || return 1 +} + +function run() { + local actions + actions+="makedir " + actions+="idempotent " + actions+="auth_cephx_key " + actions+="auth_cephx_keyring " + actions+="auth_none " + for action in $actions ; do + setup + $action || return 1 + teardown + done +} + +run + +# Local Variables: +# compile-command: "cd ../.. ; make TESTS=test/mon/mkfs.sh check" +# End: diff -Nru ceph-12.1.1/qa/standalone/mon/mon-bind.sh ceph-12.1.2/qa/standalone/mon/mon-bind.sh --- ceph-12.1.1/qa/standalone/mon/mon-bind.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/mon/mon-bind.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,147 @@ +#!/bin/bash +# +# Copyright (C) 2017 Quantum Corp. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
+# +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +SOCAT_PIDS=() + +function port_forward() { + local source_port=$1 + local target_port=$2 + + socat TCP-LISTEN:${source_port},fork,reuseaddr TCP:localhost:${target_port} & + SOCAT_PIDS+=( $! ) +} + +function cleanup() { + for p in "${SOCAT_PIDS[@]}"; do + kill $p + done + SOCAT_PIDS=() +} + +trap cleanup SIGTERM SIGKILL SIGQUIT SIGINT + +function run() { + local dir=$1 + shift + + export MON_IP=127.0.0.1 + export MONA_PUBLIC=7132 # git grep '\<7132\>' ; there must be only one + export MONB_PUBLIC=7133 # git grep '\<7133\>' ; there must be only one + export MONC_PUBLIC=7134 # git grep '\<7134\>' ; there must be only one + export MONA_BIND=7135 # git grep '\<7135\>' ; there must be only one + export MONB_BIND=7136 # git grep '\<7136\>' ; there must be only one + export MONC_BIND=7137 # git grep '\<7137\>' ; there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir && cleanup || { cleanup; return 1; } + teardown $dir + done +} + +function TEST_mon_client_connect_fails() { + local dir=$1 + + # start the mon with a public-bind-addr that is different + # from the public-addr. + CEPH_ARGS+="--mon-initial-members=a " + CEPH_ARGS+="--mon-host=${MON_IP}:${MONA_PUBLIC} " + run_mon $dir a --mon-host=${MON_IP}:${MONA_PUBLIC} --public-bind-addr=${MON_IP}:${MONA_BIND} || return 1 + + # now attempt to ping it that should fail. + timeout 3 ceph ping mon.a || return 0 + return 1 +} + +function TEST_mon_client_connect() { + local dir=$1 + + # start the mon with a public-bind-addr that is different + # from the public-addr. + CEPH_ARGS+="--mon-initial-members=a " + CEPH_ARGS+="--mon-host=${MON_IP}:${MONA_PUBLIC} " + run_mon $dir a --mon-host=${MON_IP}:${MONA_PUBLIC} --public-bind-addr=${MON_IP}:${MONA_BIND} || return 1 + + # now forward the public port to the bind port. + port_forward ${MONA_PUBLIC} ${MONA_BIND} + + # attempt to connect. we expect that to work + ceph ping mon.a || return 1 +} + +function TEST_mon_quorum() { + local dir=$1 + + # start the mon with a public-bind-addr that is different + # from the public-addr. + CEPH_ARGS+="--mon-initial-members=a,b,c " + CEPH_ARGS+="--mon-host=${MON_IP}:${MONA_PUBLIC},${MON_IP}:${MONB_PUBLIC},${MON_IP}:${MONC_PUBLIC} " + run_mon $dir a --public-addr=${MON_IP}:${MONA_PUBLIC} --public-bind-addr=${MON_IP}:${MONA_BIND} || return 1 + run_mon $dir b --public-addr=${MON_IP}:${MONB_PUBLIC} --public-bind-addr=${MON_IP}:${MONB_BIND} || return 1 + run_mon $dir c --public-addr=${MON_IP}:${MONC_PUBLIC} --public-bind-addr=${MON_IP}:${MONC_BIND} || return 1 + + # now forward the public port to the bind port. + port_forward ${MONA_PUBLIC} ${MONA_BIND} + port_forward ${MONB_PUBLIC} ${MONB_BIND} + port_forward ${MONC_PUBLIC} ${MONC_BIND} + + # expect monmap to contain 3 monitors (a, b, and c) + jqinput="$(ceph mon_status --format=json 2>/dev/null)" + jq_success "$jqinput" '.monmap.mons | length == 3' || return 1 + + # quorum should form + wait_for_quorum 300 3 || return 1 + # expect quorum to have all three monitors + jqfilter='.quorum | length == 3' + jq_success "$jqinput" "$jqfilter" || return 1 +} + +function TEST_put_get() { + local dir=$1 + + # start the mon with a public-bind-addr that is different + # from the public-addr. 
+ CEPH_ARGS+="--mon-initial-members=a,b,c " + CEPH_ARGS+="--mon-host=${MON_IP}:${MONA_PUBLIC},${MON_IP}:${MONB_PUBLIC},${MON_IP}:${MONC_PUBLIC} " + run_mon $dir a --public-addr=${MON_IP}:${MONA_PUBLIC} --public-bind-addr=${MON_IP}:${MONA_BIND} || return 1 + run_mon $dir b --public-addr=${MON_IP}:${MONB_PUBLIC} --public-bind-addr=${MON_IP}:${MONB_BIND} || return 1 + run_mon $dir c --public-addr=${MON_IP}:${MONC_PUBLIC} --public-bind-addr=${MON_IP}:${MONC_BIND} || return 1 + + # now forward the public port to the bind port. + port_forward ${MONA_PUBLIC} ${MONA_BIND} + port_forward ${MONB_PUBLIC} ${MONB_BIND} + port_forward ${MONC_PUBLIC} ${MONC_BIND} + + # quorum should form + wait_for_quorum 300 3 || return 1 + + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + run_osd $dir 2 || return 1 + + ceph osd pool create hello 8 || return 1 + + echo "hello world" > $dir/hello + rados --pool hello put foo $dir/hello || return 1 + rados --pool hello get foo $dir/hello2 || return 1 + diff $dir/hello $dir/hello2 || return 1 +} + +main mon-bind "$@" diff -Nru ceph-12.1.1/qa/standalone/mon/mon-created-time.sh ceph-12.1.2/qa/standalone/mon/mon-created-time.sh --- ceph-12.1.1/qa/standalone/mon/mon-created-time.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/mon/mon-created-time.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,54 @@ +#!/bin/bash +# +# Copyright (C) 2015 SUSE LINUX GmbH +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7125" # git grep '\<7125\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function TEST_mon_created_time() { + local dir=$1 + + run_mon $dir a || return 1 + + ceph mon dump || return 1 + + if test "$(ceph mon dump 2>/dev/null | sed -n '/created/p' | awk '{print $NF}')"x = ""x ; then + return 1 + fi + + if test "$(ceph mon dump 2>/dev/null | sed -n '/created/p' | awk '{print $NF}')"x = "0.000000"x ; then + return 1 + fi +} + +main mon-created-time "$@" + +# Local Variables: +# compile-command: "cd ../.. 
; make -j4 && test/mon/mon-created-time.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/mon/mon-handle-forward.sh ceph-12.1.2/qa/standalone/mon/mon-handle-forward.sh --- ceph-12.1.1/qa/standalone/mon/mon-handle-forward.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/mon/mon-handle-forward.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,64 @@ +#!/bin/bash +# +# Copyright (C) 2013 Cloudwatt +# Copyright (C) 2014,2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + + setup $dir || return 1 + + MONA=127.0.0.1:7300 + MONB=127.0.0.1:7301 + ( + FSID=$(uuidgen) + export CEPH_ARGS + CEPH_ARGS+="--fsid=$FSID --auth-supported=none " + CEPH_ARGS+="--mon-initial-members=a,b --mon-host=$MONA,$MONB " + run_mon $dir a --public-addr $MONA || return 1 + run_mon $dir b --public-addr $MONB || return 1 + ) + + timeout 360 ceph --mon-host $MONA mon stat || return 1 + # check that MONB is indeed a peon + ceph --admin-daemon $(get_asok_path mon.b) mon_status | + grep '"peon"' || return 1 + # when the leader ( MONA ) is used, there is no message forwarding + ceph --mon-host $MONA osd pool create POOL1 12 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1 + grep 'mon_command(.*"POOL1"' $dir/a/mon.a.log + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.b) log flush || return 1 + grep 'mon_command(.*"POOL1"' $dir/mon.b.log && return 1 + # when the peon ( MONB ) is used, the message is forwarded to the leader + ceph --mon-host $MONB osd pool create POOL2 12 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.b) log flush || return 1 + grep 'forward_request.*mon_command(.*"POOL2"' $dir/mon.b.log + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1 + grep ' forward(mon_command(.*"POOL2"' $dir/mon.a.log + # forwarded messages must retain features from the original connection + features=$(sed -n -e 's|.*127.0.0.1:0.*accept features \([0-9][0-9]*\)|\1|p' < \ + $dir/mon.b.log) + grep ' forward(mon_command(.*"POOL2".*con_features '$features $dir/mon.a.log + + teardown $dir || return 1 +} + +main mon-handle-forward "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 TESTS=test/mon/mon-handle-forward.sh check" +# End: diff -Nru ceph-12.1.1/qa/standalone/mon/mon-ping.sh ceph-12.1.2/qa/standalone/mon/mon-ping.sh --- ceph-12.1.1/qa/standalone/mon/mon-ping.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/mon/mon-ping.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,46 @@ +#!/bin/bash +# +# Copyright (C) 2015 SUSE LINUX GmbH +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU Library Public License for more details. +# +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7119" # git grep '\<7119\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function TEST_mon_ping() { + local dir=$1 + + run_mon $dir a || return 1 + + ceph ping mon.a || return 1 +} + +main mon-ping "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/mon/mon-ping.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/mon/mon-scrub.sh ceph-12.1.2/qa/standalone/mon/mon-scrub.sh --- ceph-12.1.1/qa/standalone/mon/mon-scrub.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/mon/mon-scrub.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,49 @@ +#!/bin/bash +# +# Copyright (C) 2014 Cloudwatt +# Copyright (C) 2014, 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7120" # git grep '\<7120\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function TEST_mon_scrub() { + local dir=$1 + + run_mon $dir a || return 1 + + ceph mon scrub || return 1 +} + +main mon-scrub "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/mon/mon-scrub.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/mon/osd-crush.sh ceph-12.1.2/qa/standalone/mon/osd-crush.sh --- ceph-12.1.1/qa/standalone/mon/osd-crush.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/mon/osd-crush.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,229 @@ +#!/bin/bash +# +# Copyright (C) 2014 Cloudwatt +# Copyright (C) 2014, 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
+# +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7104" # git grep '\<7104\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | ${SED} -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function TEST_crush_rule_create_simple() { + local dir=$1 + + run_mon $dir a || return 1 + + ceph --format xml osd crush rule dump replicated_rule | \ + egrep 'take[^<]+default' | \ + grep 'choose_firstn0osd' || return 1 + local ruleset=ruleset0 + local root=host1 + ceph osd crush add-bucket $root host + local failure_domain=osd + ceph osd crush rule create-simple $ruleset $root $failure_domain || return 1 + ceph osd crush rule create-simple $ruleset $root $failure_domain 2>&1 | \ + grep "$ruleset already exists" || return 1 + ceph --format xml osd crush rule dump $ruleset | \ + egrep 'take[^<]+'$root'' | \ + grep 'choose_firstn0'$failure_domain'' || return 1 + ceph osd crush rule rm $ruleset || return 1 +} + +function TEST_crush_rule_dump() { + local dir=$1 + + run_mon $dir a || return 1 + + local ruleset=ruleset1 + ceph osd crush rule create-erasure $ruleset || return 1 + test $(ceph --format json osd crush rule dump $ruleset | \ + jq ".rule_name == \"$ruleset\"") == true || return 1 + test $(ceph --format json osd crush rule dump | \ + jq "map(select(.rule_name == \"$ruleset\")) | length == 1") == true || return 1 + ! ceph osd crush rule dump non_existent_ruleset || return 1 + ceph osd crush rule rm $ruleset || return 1 +} + +function TEST_crush_rule_rm() { + local ruleset=erasure2 + + run_mon $dir a || return 1 + + ceph osd crush rule create-erasure $ruleset default || return 1 + ceph osd crush rule ls | grep $ruleset || return 1 + ceph osd crush rule rm $ruleset || return 1 + ! ceph osd crush rule ls | grep $ruleset || return 1 +} + +function TEST_crush_rule_create_erasure() { + local dir=$1 + + run_mon $dir a || return 1 + # should have at least one OSD + run_osd $dir 0 || return 1 + + local ruleset=ruleset3 + # + # create a new ruleset with the default profile, implicitly + # + ceph osd crush rule create-erasure $ruleset || return 1 + ceph osd crush rule create-erasure $ruleset 2>&1 | \ + grep "$ruleset already exists" || return 1 + ceph --format xml osd crush rule dump $ruleset | \ + egrep 'take[^<]+default' | \ + grep 'chooseleaf_indep0host' || return 1 + ceph osd crush rule rm $ruleset || return 1 + ! ceph osd crush rule ls | grep $ruleset || return 1 + # + # create a new ruleset with the default profile, explicitly + # + ceph osd crush rule create-erasure $ruleset default || return 1 + ceph osd crush rule ls | grep $ruleset || return 1 + ceph osd crush rule rm $ruleset || return 1 + ! ceph osd crush rule ls | grep $ruleset || return 1 + # + # create a new ruleset and the default profile, implicitly + # + ceph osd erasure-code-profile rm default || return 1 + ! ceph osd erasure-code-profile ls | grep default || return 1 + ceph osd crush rule create-erasure $ruleset || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1 + grep 'profile set default' $dir/mon.a.log || return 1 + ceph osd erasure-code-profile ls | grep default || return 1 + ceph osd crush rule rm $ruleset || return 1 + ! 
ceph osd crush rule ls | grep $ruleset || return 1 +} + +function check_ruleset_id_match_rule_id() { + local rule_name=$1 + rule_id=`ceph osd crush rule dump $rule_name | grep "\"rule_id\":" | awk -F ":|," '{print int($2)}'` + ruleset_id=`ceph osd crush rule dump $rule_name | grep "\"ruleset\":"| awk -F ":|," '{print int($2)}'` + test $ruleset_id = $rule_id || return 1 +} + +function generate_manipulated_rules() { + local dir=$1 + ceph osd crush add-bucket $root host + ceph osd crush rule create-simple test_rule1 $root osd firstn || return 1 + ceph osd crush rule create-simple test_rule2 $root osd firstn || return 1 + ceph osd getcrushmap -o $dir/original_map + crushtool -d $dir/original_map -o $dir/decoded_original_map + #manipulate the rulesets , to make the rule_id != ruleset_id + ${SED} -i 's/ruleset 0/ruleset 3/' $dir/decoded_original_map + ${SED} -i 's/ruleset 2/ruleset 0/' $dir/decoded_original_map + ${SED} -i 's/ruleset 1/ruleset 2/' $dir/decoded_original_map + + crushtool -c $dir/decoded_original_map -o $dir/new_map + ceph osd setcrushmap -i $dir/new_map + + ceph osd crush rule dump +} + +function TEST_crush_ruleset_match_rule_when_creating() { + local dir=$1 + + run_mon $dir a || return 1 + + local root=host1 + + generate_manipulated_rules $dir + + ceph osd crush rule create-simple special_rule_simple $root osd firstn || return 1 + + ceph osd crush rule dump + #show special_rule_simple has same rule_id and ruleset_id + check_ruleset_id_match_rule_id special_rule_simple || return 1 +} + +function TEST_add_ruleset_failed() { + local dir=$1 + + run_mon $dir a || return 1 + + local root=host1 + + ceph osd crush add-bucket $root host + ceph osd crush rule create-simple test_rule1 $root osd firstn || return 1 + ceph osd crush rule create-simple test_rule2 $root osd firstn || return 1 + ceph osd getcrushmap > $dir/crushmap || return 1 + crushtool --decompile $dir/crushmap > $dir/crushmap.txt || return 1 + for i in $(seq 3 255) + do + cat <> $dir/crushmap.txt + crushtool --compile $dir/crushmap.txt -o $dir/crushmap || return 1 + ceph osd setcrushmap -i $dir/crushmap || return 1 + ceph osd crush rule create-simple test_rule_nospace $root osd firstn 2>&1 | grep "Error ENOSPC" || return 1 + +} + +function TEST_crush_rename_bucket() { + local dir=$1 + + run_mon $dir a || return 1 + + ceph osd crush add-bucket host1 host + ceph osd tree + ! ceph osd tree | grep host2 || return 1 + ceph osd crush rename-bucket host1 host2 || return 1 + ceph osd tree + ceph osd tree | grep host2 || return 1 + ceph osd crush rename-bucket host1 host2 || return 1 # idempotency + ceph osd crush rename-bucket nonexistent something 2>&1 | grep "Error ENOENT" || return 1 +} + +function TEST_crush_reject_empty() { + local dir=$1 + run_mon $dir a || return 1 + # should have at least one OSD + run_osd $dir 0 || return 1 + create_rbd_pool || return 1 + + local empty_map=$dir/empty_map + :> $empty_map.txt + crushtool -c $empty_map.txt -o $empty_map.map || return 1 + expect_failure $dir "Error EINVAL" \ + ceph osd setcrushmap -i $empty_map.map || return 1 +} + +main osd-crush "$@" + +# Local Variables: +# compile-command: "cd ../.. 
; make -j4 && test/mon/osd-crush.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/mon/osd-erasure-code-profile.sh ceph-12.1.2/qa/standalone/mon/osd-erasure-code-profile.sh --- ceph-12.1.1/qa/standalone/mon/osd-erasure-code-profile.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/mon/osd-erasure-code-profile.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,229 @@ +#!/bin/bash +# +# Copyright (C) 2014 Cloudwatt +# Copyright (C) 2014, 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7220" # git grep '\<7220\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function TEST_set() { + local dir=$1 + local id=$2 + + run_mon $dir a || return 1 + + local profile=myprofile + # + # no key=value pairs : use the default configuration + # + ceph osd erasure-code-profile set $profile 2>&1 || return 1 + ceph osd erasure-code-profile get $profile | \ + grep plugin=jerasure || return 1 + ceph osd erasure-code-profile rm $profile + # + # key=value pairs override the default + # + ceph osd erasure-code-profile set $profile \ + key=value plugin=isa || return 1 + ceph osd erasure-code-profile get $profile | \ + grep -e key=value -e plugin=isa || return 1 + # + # --force is required to override an existing profile + # + ! ceph osd erasure-code-profile set $profile > $dir/out 2>&1 || return 1 + grep 'will not override' $dir/out || return 1 + ceph osd erasure-code-profile set $profile key=other --force || return 1 + ceph osd erasure-code-profile get $profile | \ + grep key=other || return 1 + + ceph osd erasure-code-profile rm $profile # cleanup +} + +function TEST_ls() { + local dir=$1 + local id=$2 + + run_mon $dir a || return 1 + + local profile=myprofile + ! ceph osd erasure-code-profile ls | grep $profile || return 1 + ceph osd erasure-code-profile set $profile 2>&1 || return 1 + ceph osd erasure-code-profile ls | grep $profile || return 1 + ceph --format xml osd erasure-code-profile ls | \ + grep "$profile" || return 1 + + ceph osd erasure-code-profile rm $profile # cleanup +} + +function TEST_rm() { + local dir=$1 + local id=$2 + + run_mon $dir a || return 1 + + local profile=myprofile + ceph osd erasure-code-profile set $profile 2>&1 || return 1 + ceph osd erasure-code-profile ls | grep $profile || return 1 + ceph osd erasure-code-profile rm $profile || return 1 + ! ceph osd erasure-code-profile ls | grep $profile || return 1 + ceph osd erasure-code-profile rm WRONG 2>&1 | \ + grep "WRONG does not exist" || return 1 + + ceph osd erasure-code-profile set $profile || return 1 + ceph osd pool create poolname 12 12 erasure $profile || return 1 + ! 
ceph osd erasure-code-profile rm $profile > $dir/out 2>&1 || return 1 + grep "poolname.*using.*$profile" $dir/out || return 1 + ceph osd pool delete poolname poolname --yes-i-really-really-mean-it || return 1 + ceph osd erasure-code-profile rm $profile || return 1 + + ceph osd erasure-code-profile rm $profile # cleanup +} + +function TEST_get() { + local dir=$1 + local id=$2 + + run_mon $dir a || return 1 + + local default_profile=default + ceph osd erasure-code-profile get $default_profile | \ + grep plugin=jerasure || return 1 + ceph --format xml osd erasure-code-profile get $default_profile | \ + grep 'jerasure' || return 1 + ! ceph osd erasure-code-profile get WRONG > $dir/out 2>&1 || return 1 + grep -q "unknown erasure code profile 'WRONG'" $dir/out || return 1 +} + +function TEST_set_idempotent() { + local dir=$1 + local id=$2 + + run_mon $dir a || return 1 + # + # The default profile is set using a code path different from + # ceph osd erasure-code-profile set: verify that it is idempotent, + # as if it was using the same code path. + # + ceph osd erasure-code-profile set default k=2 m=1 2>&1 || return 1 + local profile + # + # Because plugin=jerasure is the default, it uses a slightly + # different code path where defaults (m=1 for instance) are added + # implicitly. + # + profile=profileidempotent1 + ! ceph osd erasure-code-profile ls | grep $profile || return 1 + ceph osd erasure-code-profile set $profile k=2 crush-failure-domain=osd 2>&1 || return 1 + ceph osd erasure-code-profile ls | grep $profile || return 1 + ceph osd erasure-code-profile set $profile k=2 crush-failure-domain=osd 2>&1 || return 1 + ceph osd erasure-code-profile rm $profile # cleanup + + # + # In the general case the profile is exactly what is on + # + profile=profileidempotent2 + ! ceph osd erasure-code-profile ls | grep $profile || return 1 + ceph osd erasure-code-profile set $profile plugin=lrc k=4 m=2 l=3 crush-failure-domain=osd 2>&1 || return 1 + ceph osd erasure-code-profile ls | grep $profile || return 1 + ceph osd erasure-code-profile set $profile plugin=lrc k=4 m=2 l=3 crush-failure-domain=osd 2>&1 || return 1 + ceph osd erasure-code-profile rm $profile # cleanup +} + +function TEST_format_invalid() { + local dir=$1 + + local profile=profile + # osd_pool_default_erasure-code-profile is + # valid JSON but not of the expected type + run_mon $dir a \ + --osd_pool_default_erasure-code-profile 1 || return 1 + ! 
ceph osd erasure-code-profile set $profile > $dir/out 2>&1 || return 1 + cat $dir/out + grep 'must be a JSON object' $dir/out || return 1 +} + +function TEST_format_json() { + local dir=$1 + + # osd_pool_default_erasure-code-profile is JSON + expected='"plugin":"isa"' + run_mon $dir a \ + --osd_pool_default_erasure-code-profile "{$expected}" || return 1 + ceph --format json osd erasure-code-profile get default | \ + grep "$expected" || return 1 +} + +function TEST_format_plain() { + local dir=$1 + + # osd_pool_default_erasure-code-profile is plain text + expected='"plugin":"isa"' + run_mon $dir a \ + --osd_pool_default_erasure-code-profile "plugin=isa" || return 1 + ceph --format json osd erasure-code-profile get default | \ + grep "$expected" || return 1 +} + +function TEST_profile_k_sanity() { + local dir=$1 + local profile=profile-sanity + + run_mon $dir a || return 1 + + expect_failure $dir 'k must be a multiple of (k + m) / l' \ + ceph osd erasure-code-profile set $profile \ + plugin=lrc \ + l=1 \ + k=1 \ + m=1 || return 1 + + if erasure_code_plugin_exists isa ; then + expect_failure $dir 'k=1 must be >= 2' \ + ceph osd erasure-code-profile set $profile \ + plugin=isa \ + k=1 \ + m=1 || return 1 + else + echo "SKIP because plugin isa has not been built" + fi + + expect_failure $dir 'k=1 must be >= 2' \ + ceph osd erasure-code-profile set $profile \ + plugin=jerasure \ + k=1 \ + m=1 || return 1 +} + +main osd-erasure-code-profile "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/mon/osd-erasure-code-profile.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/mon/osd-pool-create.sh ceph-12.1.2/qa/standalone/mon/osd-pool-create.sh --- ceph-12.1.1/qa/standalone/mon/osd-pool-create.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/mon/osd-pool-create.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,215 @@ +#!/bin/bash +# +# Copyright (C) 2013, 2014 Cloudwatt +# Copyright (C) 2014, 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7105" # git grep '\<7105\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +# Before http://tracker.ceph.com/issues/8307 the invalid profile was created +function TEST_erasure_invalid_profile() { + local dir=$1 + run_mon $dir a || return 1 + local poolname=pool_erasure + local notaprofile=not-a-valid-erasure-code-profile + ! ceph osd pool create $poolname 12 12 erasure $notaprofile || return 1 + ! ceph osd erasure-code-profile ls | grep $notaprofile || return 1 +} + +function TEST_erasure_crush_rule() { + local dir=$1 + run_mon $dir a || return 1 + # + # choose the crush ruleset used with an erasure coded pool + # + local crush_ruleset=myruleset + ! 
ceph osd crush rule ls | grep $crush_ruleset || return 1 + ceph osd crush rule create-erasure $crush_ruleset + ceph osd crush rule ls | grep $crush_ruleset + local poolname + poolname=pool_erasure1 + ! ceph --format json osd dump | grep '"crush_rule":1' || return 1 + ceph osd pool create $poolname 12 12 erasure default $crush_ruleset + ceph --format json osd dump | grep '"crush_rule":1' || return 1 + # + # a crush ruleset by the name of the pool is implicitly created + # + poolname=pool_erasure2 + ceph osd erasure-code-profile set myprofile + ceph osd pool create $poolname 12 12 erasure myprofile + ceph osd crush rule ls | grep $poolname || return 1 + # + # a non existent crush ruleset given in argument is an error + # http://tracker.ceph.com/issues/9304 + # + poolname=pool_erasure3 + ! ceph osd pool create $poolname 12 12 erasure myprofile INVALIDRULESET || return 1 +} + +function TEST_erasure_code_profile_default() { + local dir=$1 + run_mon $dir a || return 1 + ceph osd erasure-code-profile rm default || return 1 + ! ceph osd erasure-code-profile ls | grep default || return 1 + ceph osd pool create $poolname 12 12 erasure default + ceph osd erasure-code-profile ls | grep default || return 1 +} + +function TEST_erasure_crush_stripe_unit() { + local dir=$1 + # the default stripe unit is used to initialize the pool + run_mon $dir a --public-addr $CEPH_MON + stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit) + eval local $(ceph osd erasure-code-profile get myprofile | grep k=) + stripe_width = $((stripe_unit * k)) + ceph osd pool create pool_erasure 12 12 erasure + ceph --format json osd dump | tee $dir/osd.json + grep '"stripe_width":'$stripe_width $dir/osd.json > /dev/null || return 1 +} + +function TEST_erasure_crush_stripe_unit_padded() { + local dir=$1 + # setting osd_pool_erasure_code_stripe_unit modifies the stripe_width + # and it is padded as required by the default plugin + profile+=" plugin=jerasure" + profile+=" technique=reed_sol_van" + k=4 + profile+=" k=$k" + profile+=" m=2" + actual_stripe_unit=2048 + desired_stripe_unit=$((actual_stripe_unit - 1)) + actual_stripe_width=$((actual_stripe_unit * k)) + run_mon $dir a \ + --osd_pool_erasure_code_stripe_unit $desired_stripe_unit \ + --osd_pool_default_erasure_code_profile "$profile" || return 1 + ceph osd pool create pool_erasure 12 12 erasure + ceph osd dump | tee $dir/osd.json + grep "stripe_width $actual_stripe_width" $dir/osd.json > /dev/null || return 1 +} + +function TEST_erasure_code_pool() { + local dir=$1 + run_mon $dir a || return 1 + ceph --format json osd dump > $dir/osd.json + local expected='"erasure_code_profile":"default"' + ! 
grep "$expected" $dir/osd.json || return 1 + ceph osd pool create erasurecodes 12 12 erasure + ceph --format json osd dump | tee $dir/osd.json + grep "$expected" $dir/osd.json > /dev/null || return 1 + + ceph osd pool create erasurecodes 12 12 erasure 2>&1 | \ + grep 'already exists' || return 1 + ceph osd pool create erasurecodes 12 12 2>&1 | \ + grep 'cannot change to type replicated' || return 1 +} + +function TEST_replicated_pool_with_ruleset() { + local dir=$1 + run_mon $dir a + local ruleset=ruleset0 + local root=host1 + ceph osd crush add-bucket $root host + local failure_domain=osd + local poolname=mypool + ceph osd crush rule create-simple $ruleset $root $failure_domain || return 1 + ceph osd crush rule ls | grep $ruleset + ceph osd pool create $poolname 12 12 replicated $ruleset || return 1 + rule_id=`ceph osd crush rule dump $ruleset | grep "rule_id" | awk -F[' ':,] '{print $4}'` + ceph osd pool get $poolname crush_rule 2>&1 | \ + grep "crush_rule: $rule_id" || return 1 + #non-existent crush ruleset + ceph osd pool create newpool 12 12 replicated non-existent 2>&1 | \ + grep "doesn't exist" || return 1 +} + +function TEST_erasure_code_pool_lrc() { + local dir=$1 + run_mon $dir a || return 1 + + ceph osd erasure-code-profile set LRCprofile \ + plugin=lrc \ + mapping=DD_ \ + layers='[ [ "DDc", "" ] ]' || return 1 + + ceph --format json osd dump > $dir/osd.json + local expected='"erasure_code_profile":"LRCprofile"' + local poolname=erasurecodes + ! grep "$expected" $dir/osd.json || return 1 + ceph osd pool create $poolname 12 12 erasure LRCprofile + ceph --format json osd dump | tee $dir/osd.json + grep "$expected" $dir/osd.json > /dev/null || return 1 + ceph osd crush rule ls | grep $poolname || return 1 +} + +function TEST_replicated_pool() { + local dir=$1 + run_mon $dir a || return 1 + ceph osd pool create replicated 12 12 replicated replicated_rule || return 1 + ceph osd pool create replicated 12 12 replicated replicated_rule 2>&1 | \ + grep 'already exists' || return 1 + # default is replicated + ceph osd pool create replicated1 12 12 || return 1 + # default is replicated, pgp_num = pg_num + ceph osd pool create replicated2 12 || return 1 + ceph osd pool create replicated 12 12 erasure 2>&1 | \ + grep 'cannot change to type erasure' || return 1 +} + +function TEST_no_pool_delete() { + local dir=$1 + run_mon $dir a || return 1 + ceph osd pool create foo 1 || return 1 + ceph tell mon.a injectargs -- --no-mon-allow-pool-delete || return 1 + ! ceph osd pool delete foo foo --yes-i-really-really-mean-it || return 1 + ceph tell mon.a injectargs -- --mon-allow-pool-delete || return 1 + ceph osd pool delete foo foo --yes-i-really-really-mean-it || return 1 +} + +function TEST_utf8_cli() { + local dir=$1 + run_mon $dir a || return 1 + # Hopefully it's safe to include literal UTF-8 characters to test + # the fix for http://tracker.ceph.com/issues/7387. If it turns out + # to not be OK (when is the default encoding *not* UTF-8?), maybe + # the character '黄' can be replaced with the escape $'\xe9\xbb\x84' + ceph osd pool create 黄 1024 || return 1 + ceph osd lspools 2>&1 | \ + grep "黄" || return 1 + ceph -f json-pretty osd dump | \ + python -c "import json; import sys; json.load(sys.stdin)" || return 1 + ceph osd pool delete 黄 黄 --yes-i-really-really-mean-it +} + +main osd-pool-create "$@" + +# Local Variables: +# compile-command: "cd ../.. 
; make -j4 && test/mon/osd-pool-create.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/mon/test_pool_quota.sh ceph-12.1.2/qa/standalone/mon/test_pool_quota.sh --- ceph-12.1.1/qa/standalone/mon/test_pool_quota.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/mon/test_pool_quota.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,63 @@ +#!/bin/bash + +# +# Generic pool quota test +# + +# Includes + + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:17108" # git grep '\<17108\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + $func $dir || return 1 + done +} + +function TEST_pool_quota() { + local dir=$1 + setup $dir || return 1 + + run_mon $dir a || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + run_osd $dir 2 || return 1 + + local poolname=testquoa + ceph osd pool create $poolname 20 + local objects=`ceph df detail | grep -w $poolname|awk '{print $3}'` + local bytes=`ceph df detail | grep -w $poolname|awk '{print $4}'` + + echo $objects + echo $bytes + if [ $objects != 'N/A' ] || [ $bytes != 'N/A' ] ; + then + return 1 + fi + + ceph osd pool set-quota $poolname max_objects 1000 + ceph osd pool set-quota $poolname max_bytes 1024 + + objects=`ceph df detail | grep -w $poolname|awk '{print $3}'` + bytes=`ceph df detail | grep -w $poolname|awk '{print $4}'` + + if [ $objects != '1000' ] || [ $bytes != '1024' ] ; + then + return 1 + fi + + ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it + teardown $dir || return 1 +} + +main testpoolquota diff -Nru ceph-12.1.1/qa/standalone/osd/osd-bench.sh ceph-12.1.2/qa/standalone/osd/osd-bench.sh --- ceph-12.1.1/qa/standalone/osd/osd-bench.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/osd/osd-bench.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,96 @@ +#!/bin/bash +# +# Copyright (C) 2014 Cloudwatt +# Copyright (C) 2014, 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
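Aside (not part of the patch): test_pool_quota.sh above drives the quota machinery with just two set-quota commands; a condensed sketch follows, using a hypothetical pool name (the test creates its own throwaway pool). The test then re-reads fields 3 and 4 of the pool's row in 'ceph df detail' and expects them to change from 'N/A'/'N/A' to '1000'/'1024'.

    ceph osd pool create quota-demo 20
    ceph osd pool set-quota quota-demo max_objects 1000
    ceph osd pool set-quota quota-demo max_bytes 1024
    ceph df detail | grep -w quota-demo        # the test parses columns 3 and 4 of this line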
+# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7106" # git grep '\<7106\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function TEST_bench() { + local dir=$1 + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + + local osd_bench_small_size_max_iops=$(CEPH_ARGS='' ceph-conf \ + --show-config-value osd_bench_small_size_max_iops) + local osd_bench_large_size_max_throughput=$(CEPH_ARGS='' ceph-conf \ + --show-config-value osd_bench_large_size_max_throughput) + local osd_bench_max_block_size=$(CEPH_ARGS='' ceph-conf \ + --show-config-value osd_bench_max_block_size) + local osd_bench_duration=$(CEPH_ARGS='' ceph-conf \ + --show-config-value osd_bench_duration) + + # + # block size too high + # + expect_failure $dir osd_bench_max_block_size \ + ceph tell osd.0 bench 1024 $((osd_bench_max_block_size + 1)) || return 1 + + # + # count too high for small (< 1MB) block sizes + # + local bsize=1024 + local max_count=$(($bsize * $osd_bench_duration * $osd_bench_small_size_max_iops)) + expect_failure $dir bench_small_size_max_iops \ + ceph tell osd.0 bench $(($max_count + 1)) $bsize || return 1 + + # + # count too high for large (>= 1MB) block sizes + # + local bsize=$((1024 * 1024 + 1)) + local max_count=$(($osd_bench_large_size_max_throughput * $osd_bench_duration)) + expect_failure $dir osd_bench_large_size_max_throughput \ + ceph tell osd.0 bench $(($max_count + 1)) $bsize || return 1 + + # + # default values should work + # + ceph tell osd.0 bench || return 1 + + # + # test object_size < block_size + ceph tell osd.0 bench 10 14456 4444 3 + # + + # + # test object_size < block_size & object_size = 0(default value) + # + ceph tell osd.0 bench 1 14456 +} + +main osd-bench "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/osd/osd-bench.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/osd/osd-config.sh ceph-12.1.2/qa/standalone/osd/osd-config.sh --- ceph-12.1.1/qa/standalone/osd/osd-config.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/osd/osd-config.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,118 @@ +#!/bin/bash +# +# Copyright (C) 2014 Cloudwatt +# Copyright (C) 2014, 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
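Aside (not part of the patch): the count caps that TEST_bench in osd-bench.sh above exercises follow directly from the two formulas in that script; the values below are made up only to show the arithmetic (the test itself reads the real settings with ceph-conf).

    # illustrative values, not the shipped defaults
    osd_bench_duration=30
    osd_bench_small_size_max_iops=100
    osd_bench_large_size_max_throughput=$((100 * 1024 * 1024))   # bytes/sec
    bsize=1024                                                   # < 1MB, so the small-size (IOPS) rule applies
    echo $((bsize * osd_bench_duration * osd_bench_small_size_max_iops))      # largest accepted count
    bsize=$((1024 * 1024 + 1))                                   # >= 1MB, so the throughput rule applies
    echo $((osd_bench_large_size_max_throughput * osd_bench_duration))        # largest accepted count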
+# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7100" # git grep '\<7100\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function TEST_config_init() { + local dir=$1 + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + local advance=1000 + local stale=1000 + local cache=500 + run_osd $dir 0 \ + --osd-map-max-advance $advance \ + --osd-map-cache-size $cache \ + --osd-pg-epoch-persisted-max-stale $stale \ + || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1 + grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1 + grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd.0.log || return 1 +} + +function TEST_config_track() { + local dir=$1 + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + + local osd_map_cache_size=$(CEPH_ARGS='' ceph-conf \ + --show-config-value osd_map_cache_size) + local osd_map_max_advance=$(CEPH_ARGS='' ceph-conf \ + --show-config-value osd_map_max_advance) + local osd_pg_epoch_persisted_max_stale=$(CEPH_ARGS='' ceph-conf \ + --show-config-value osd_pg_epoch_persisted_max_stale) + # + # lower cache_size under max_advance to trigger the warning + # + ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1 + local cache=$(($osd_map_max_advance / 2)) + ceph tell osd.0 injectargs "--osd-map-cache-size $cache" || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1 + grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1 + rm $dir/osd.0.log + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log reopen || return 1 + + # + # reset cache_size to the default and assert that it does not trigger the warning + # + ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1 + local cache=$osd_map_cache_size + ceph tell osd.0 injectargs "--osd-map-cache-size $cache" || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1 + ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1 + rm $dir/osd.0.log + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log reopen || return 1 + + # + # increase the osd_map_max_advance above the default cache_size + # + ! grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1 + local advance=$(($osd_map_cache_size * 2)) + ceph tell osd.0 injectargs "--osd-map-max-advance $advance" || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1 + grep 'is not > osd_map_max_advance' $dir/osd.0.log || return 1 + rm $dir/osd.0.log + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log reopen || return 1 + + # + # increase the osd_pg_epoch_persisted_max_stale above the default cache_size + # + ! 
grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd.0.log || return 1 + local stale=$(($osd_map_cache_size * 2)) + ceph tell osd.0 injectargs "--osd-pg-epoch-persisted-max-stale $stale" || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log flush || return 1 + grep 'is not > osd_pg_epoch_persisted_max_stale' $dir/osd.0.log || return 1 + rm $dir/osd.0.log + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) log reopen || return 1 +} + +main osd-config "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/osd/osd-config.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/osd/osd-copy-from.sh ceph-12.1.2/qa/standalone/osd/osd-copy-from.sh --- ceph-12.1.1/qa/standalone/osd/osd-copy-from.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/osd/osd-copy-from.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,68 @@ +#!/bin/bash +# +# Copyright (C) 2014 Cloudwatt +# Copyright (C) 2014, 2015 Red Hat +# +# Author: Loic Dachary +# Author: Sage Weil +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7111" # git grep '\<7111\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function TEST_copy_from() { + local dir=$1 + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + create_rbd_pool || return 1 + + # success + rados -p rbd put foo $(which rados) + rados -p rbd cp foo foo2 + rados -p rbd stat foo2 + + # failure + ceph tell osd.\* injectargs -- --osd-debug-inject-copyfrom-error + ! rados -p rbd cp foo foo3 + ! rados -p rbd stat foo3 + + # success again + ceph tell osd.\* injectargs -- --no-osd-debug-inject-copyfrom-error + ! rados -p rbd cp foo foo3 + rados -p rbd stat foo3 +} + +main osd-copy-from "$@" + +# Local Variables: +# compile-command: "cd ../.. 
; make -j4 && test/osd/osd-bench.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/osd/osd-dup.sh ceph-12.1.2/qa/standalone/osd/osd-dup.sh --- ceph-12.1.1/qa/standalone/osd/osd-dup.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/osd/osd-dup.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,81 @@ +#!/bin/bash + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7146" # git grep '\<7146\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + # avoid running out of fds in rados bench + CEPH_ARGS+="--filestore_wbthrottle_xfs_ios_hard_limit=900 " + CEPH_ARGS+="--filestore_wbthrottle_btrfs_ios_hard_limit=900 " + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function TEST_filestore_to_bluestore() { + local dir=$1 + + local flimit=$(ulimit -n) + if [ $flimit -lt 1536 ]; then + echo "Low open file limit ($flimit), test may fail. Increase to 1536 or higher and retry if that happens." + fi + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + osd_pid=$(cat $dir/osd.0.pid) + run_osd $dir 1 || return 1 + run_osd $dir 2 || return 1 + + sleep 5 + + ceph osd pool create foo 16 + + # write some objects + rados bench -p foo 10 write -b 4096 --no-cleanup || return 1 + + # kill + while kill $osd_pid; do sleep 1 ; done + ceph osd down 0 + + mv $dir/0 $dir/0.old || return 1 + mkdir $dir/0 || return 1 + ofsid=$(cat $dir/0.old/fsid) + echo "osd fsid $ofsid" + O=$CEPH_ARGS + CEPH_ARGS+="--log-file $dir/cot.log --log-max-recent 0 " + ceph-objectstore-tool --type bluestore --data-path $dir/0 --fsid $ofsid \ + --op mkfs || return 1 + ceph-objectstore-tool --data-path $dir/0.old --target-data-path $dir/0 \ + --op dup || return 1 + CEPH_ARGS=$O + + run_osd_bluestore $dir 0 || return 1 + + while ! ceph osd stat | grep '3 up' ; do sleep 1 ; done + ceph osd metadata 0 | grep bluestore || return 1 + + ceph osd scrub 0 + + # give it some time + sleep 15 + # and make sure mon is sync'ed + flush_pg_stats + + wait_for_clean || return 1 +} + +main osd-dup "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/osd/osd-dup.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/osd/osd-fast-mark-down.sh ceph-12.1.2/qa/standalone/osd/osd-fast-mark-down.sh --- ceph-12.1.1/qa/standalone/osd/osd-fast-mark-down.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/osd/osd-fast-mark-down.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,135 @@ +#!/bin/bash +# +# Copyright (C) 2016 Piotr Dałek +# Copyright (C) 2014, 2015 Red Hat +# +# Author: Piotr Dałek +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
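The heart of the filestore-to-bluestore conversion exercised by TEST_filestore_to_bluestore above, with the cluster plumbing stripped away; $dir, $osd_pid and $ofsid are the test's own values, not production paths:

    while kill $osd_pid; do sleep 1; done          # stop the filestore OSD
    mv $dir/0 $dir/0.old && mkdir $dir/0
    ofsid=$(cat $dir/0.old/fsid)
    ceph-objectstore-tool --type bluestore --data-path $dir/0 --fsid $ofsid --op mkfs
    ceph-objectstore-tool --data-path $dir/0.old --target-data-path $dir/0 --op dup
    run_osd_bluestore $dir 0                       # ceph-helpers.sh wrapper
    ceph osd metadata 0 | grep bluestore           # confirm the new backend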
+# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh +MAX_PROPAGATION_TIME=30 + +function run() { + local dir=$1 + shift + rm -f $dir/*.pid + export CEPH_MON="127.0.0.1:7126" # git grep '\<7126\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + # + # Disable device auto class feature for this testing, + # as it will automatically make root clones based on new class types + # and hence affect the down osd counting. + # E.g., + # + # ID WEIGHT TYPE NAME UP/DOWN REWEIGHT PRIMARY-AFFINITY + # -4 3.00000 root default~hdd + # -3 3.00000 host gitbuilder-ceph-rpm-centos7-amd64-basic~hdd + # 0 1.00000 osd.0 down 1.00000 1.00000 + # 1 1.00000 osd.1 up 1.00000 1.00000 + # 2 1.00000 osd.2 up 1.00000 1.00000 + # -1 3.00000 root default + # -2 3.00000 host gitbuilder-ceph-rpm-centos7-amd64-basic + # 0 1.00000 osd.0 down 1.00000 1.00000 + # 1 1.00000 osd.1 up 1.00000 1.00000 + # 2 1.00000 osd.2 up 1.00000 1.00000 + # + CEPH_ARGS+="--osd-class-update-on-start=false " + + OLD_ARGS=$CEPH_ARGS + CEPH_ARGS+="--osd-fast-fail-on-connection-refused=false " + echo "Ensuring old behavior is there..." + test_fast_kill $dir && (echo "OSDs died too early! Old behavior doesn't work." ; return 1) + + CEPH_ARGS=$OLD_ARGS"--osd-fast-fail-on-connection-refused=true " + OLD_ARGS=$CEPH_ARGS + + CEPH_ARGS+="--ms_type=simple" + echo "Testing simple msgr..." + test_fast_kill $dir || return 1 + + CEPH_ARGS=$OLD_ARGS"--ms_type=async" + echo "Testing async msgr..." + test_fast_kill $dir || return 1 + + return 0 + +} + +function test_fast_kill() { + # create cluster with 3 osds + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=3 || return 1 + run_mgr $dir x || return 1 + for oi in {0..2}; do + run_osd $dir $oi || return 1 + pids[$oi]=$(cat $dir/osd.$oi.pid) + done + + create_rbd_pool || return 1 + + # make some objects so osds to ensure connectivity between osds + rados -p rbd bench 10 write -b 4096 --max-objects 128 --no-cleanup + sleep 1 + + killid=0 + previd=0 + + # kill random osd and see if after max MAX_PROPAGATION_TIME, the osd count decreased. + for i in {1..2}; do + while [ $killid -eq $previd ]; do + killid=${pids[$RANDOM%${#pids[@]}]} + done + previd=$killid + + kill -9 $killid + time_left=$MAX_PROPAGATION_TIME + down_osds=0 + + while [ $time_left -gt 0 ]; do + sleep 1 + time_left=$[$time_left - 1]; + + grep -m 1 -c -F "ms_handle_refused" $dir/osd.*.log > /dev/null + if [ $? -ne 0 ]; then + continue + fi + + down_osds=$(ceph osd tree | grep -c down) + if [ $down_osds -lt $i ]; then + # osds not marked down yet, try again in a second + continue + elif [ $down_osds -gt $i ]; then + echo Too many \($down_osds\) osds died! + return 1 + else + break + fi + done + + if [ $down_osds -lt $i ]; then + echo Killed the OSD, yet it is not marked down + ceph osd tree + return 1 + fi + done + pkill -SIGTERM rados + teardown $dir || return 1 +} + +main osd-fast-mark-down "$@" + +# Local Variables: +# compile-command: "cd ../.. 
; make -j4 && test/osd/osd-fast-mark-down.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/osd/osd-markdown.sh ceph-12.1.2/qa/standalone/osd/osd-markdown.sh --- ceph-12.1.1/qa/standalone/osd/osd-markdown.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/osd/osd-markdown.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,122 @@ +#!/bin/bash +# +# Copyright (C) 2015 Intel +# Copyright (C) 2014, 2015 Red Hat +# +# Author: Xiaoxi Chen +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7108" # git grep '\<7108\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function markdown_N_impl() { + markdown_times=$1 + total_time=$2 + sleeptime=$3 + for i in `seq 1 $markdown_times` + do + # check the OSD is UP + ceph osd tree + ceph osd tree | grep osd.0 |grep up || return 1 + # mark the OSD down. + ceph osd down 0 + sleep $sleeptime + done +} + + +function TEST_markdown_exceed_maxdown_count() { + local dir=$1 + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + run_osd $dir 2 || return 1 + # 3+1 times within 300s, osd should stay dead on the 4th time + local count=3 + local sleeptime=10 + local period=300 + ceph tell osd.0 injectargs '--osd_max_markdown_count '$count'' || return 1 + ceph tell osd.0 injectargs '--osd_max_markdown_period '$period'' || return 1 + + markdown_N_impl $(($count+1)) $period $sleeptime + # down N+1 times ,the osd.0 shoud die + ceph osd tree | grep down | grep osd.0 || return 1 +} + +function TEST_markdown_boot() { + local dir=$1 + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + run_osd $dir 2 || return 1 + + # 3 times within 120s, should stay up + local count=3 + local sleeptime=10 + local period=120 + ceph tell osd.0 injectargs '--osd_max_markdown_count '$count'' || return 1 + ceph tell osd.0 injectargs '--osd_max_markdown_period '$period'' || return 1 + + markdown_N_impl $count $period $sleeptime + #down N times, osd.0 should be up + sleep 15 # give osd plenty of time to notice and come back up + ceph osd tree | grep up | grep osd.0 || return 1 +} + +function TEST_markdown_boot_exceed_time() { + local dir=$1 + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + run_osd $dir 2 || return 1 + + + # 3+1 times, but over 40s, > 20s, so should stay up + local count=3 + local period=20 + local sleeptime=10 + ceph tell osd.0 injectargs '--osd_max_markdown_count '$count'' || return 1 + ceph tell osd.0 injectargs '--osd_max_markdown_period '$period'' || return 1 + + markdown_N_impl $(($count+1)) $period $sleeptime + sleep 15 # give osd plenty 
of time to notice and come back up + ceph osd tree | grep up | grep osd.0 || return 1 +} + +main osd-markdown "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/osd/osd-bench.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/osd/osd-reactivate.sh ceph-12.1.2/qa/standalone/osd/osd-reactivate.sh --- ceph-12.1.1/qa/standalone/osd/osd-reactivate.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/osd/osd-reactivate.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,56 @@ +#!/bin/bash +# +# Author: Vicente Cheng +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7122" # git grep '\<7122\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function TEST_reactivate() { + local dir=$1 + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + + kill_daemons $dir TERM osd || return 1 + + ready_path=$dir"/0/ready" + activate_path=$dir"/0/active" + # trigger mkfs again + rm -rf $ready_path $activate_path + activate_osd $dir 0 || return 1 + +} + +main osd-reactivate "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/osd/osd-reactivate.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/osd/osd-reuse-id.sh ceph-12.1.2/qa/standalone/osd/osd-reuse-id.sh --- ceph-12.1.1/qa/standalone/osd/osd-reuse-id.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/osd/osd-reuse-id.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,52 @@ +#! /bin/bash +# +# Copyright (C) 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
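The knobs the osd-markdown.sh tests above exercise, in isolation (numbers illustrative): an OSD marked down more than osd_max_markdown_count times within osd_max_markdown_period seconds stays down, while fewer markdowns, or the same number spread over a longer window, let it boot itself back up:

    ceph tell osd.0 injectargs '--osd_max_markdown_count 3'
    ceph tell osd.0 injectargs '--osd_max_markdown_period 300'
    for i in 1 2 3 4; do ceph osd down 0; sleep 10; done
    ceph osd tree | grep osd.0      # expected to stay down after the 4th markdown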
+# +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7123" # git grep '\<7123\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + $func $dir || return 1 + done +} + +function TEST_reuse_id() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + destroy_osd $dir 1 || return 1 + run_osd $dir 1 || return 1 +} + +main osd-reuse-id "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/osd/osd-reuse-id.sh" +# End: diff -Nru ceph-12.1.1/qa/standalone/README ceph-12.1.2/qa/standalone/README --- ceph-12.1.1/qa/standalone/README 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/README 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,18 @@ +qa/standalone +============= + +These scripts run standalone clusters, but not in a normal way. They make +use of functions ceph-helpers.sh to quickly start/stop daemons against +toy clusters in a single directory. + +They are normally run via teuthology based on qa/suites/rados/standalone/*.yaml. + +You can run them in a git checkout + build directory as well: + + * The qa/run-standalone.sh will run all of them in sequence. This is slow + since there is no parallelism. + + * You can run an individual script by passing these environment args. For + example, if you are in the build/ directory, + +PATH=$PATH:bin CEPH_ROOT=.. CEPH_LIB=lib ../qa/standalone/mon/misc.sh diff -Nru ceph-12.1.1/qa/standalone/scrub/osd-scrub-repair.sh ceph-12.1.2/qa/standalone/scrub/osd-scrub-repair.sh --- ceph-12.1.1/qa/standalone/scrub/osd-scrub-repair.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/scrub/osd-scrub-repair.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,2630 @@ +#!/bin/bash -x +# +# Copyright (C) 2014 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +if [ `uname` = FreeBSD ]; then + # erasure coding overwrites are only tested on Bluestore + # erasure coding on filestore is unsafe + # http://docs.ceph.com/docs/master/rados/operations/erasure-code/#erasure-coding-with-overwrites + use_ec_overwrite=false +else + use_ec_overwrite=true +fi + +# Test development and debugging +# Set to "yes" in order to ignore diff errors and save results to update test +getjson="no" + +# Ignore the epoch and filter out the attr '_' value because it has date information and won't match +jqfilter='.inconsistents | (.[].shards[].attrs[]? 
| select(.name == "_") | .value) |= "----Stripped-by-test----"' +sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)' + +# Remove items are not consistent across runs, the pg interval and client +sedfilter='s/\([ ]*\"\(selected_\)*object_info\":.*head[(]\)[^[:space:]]* [^[:space:]]* \(.*\)/\1\3/' + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7107" # git grep '\<7107\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + $func $dir || return 1 + done +} + +function add_something() { + local dir=$1 + local poolname=$2 + local obj=${3:-SOMETHING} + local scrub=${4:-noscrub} + + if [ "$scrub" = "noscrub" ]; + then + ceph osd set noscrub || return 1 + ceph osd set nodeep-scrub || return 1 + else + ceph osd unset noscrub || return 1 + ceph osd unset nodeep-scrub || return 1 + fi + + local payload=ABCDEF + echo $payload > $dir/ORIGINAL + rados --pool $poolname put $obj $dir/ORIGINAL || return 1 +} + +# +# Corrupt one copy of a replicated pool +# +function TEST_corrupt_and_repair_replicated() { + local dir=$1 + local poolname=rbd + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=2 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + + add_something $dir $poolname || return 1 + corrupt_and_repair_one $dir $poolname $(get_not_primary $poolname SOMETHING) || return 1 + # Reproduces http://tracker.ceph.com/issues/8914 + corrupt_and_repair_one $dir $poolname $(get_primary $poolname SOMETHING) || return 1 + + teardown $dir || return 1 +} + +function corrupt_and_repair_two() { + local dir=$1 + local poolname=$2 + local first=$3 + local second=$4 + + # + # 1) remove the corresponding file from the OSDs + # + pids="" + run_in_background pids objectstore_tool $dir $first SOMETHING remove + run_in_background pids objectstore_tool $dir $second SOMETHING remove + wait_background pids + return_code=$? + if [ $return_code -ne 0 ]; then return $return_code; fi + + # + # 2) repair the PG + # + local pg=$(get_pg $poolname SOMETHING) + repair $pg + # + # 3) The files must be back + # + pids="" + run_in_background pids objectstore_tool $dir $first SOMETHING list-attrs + run_in_background pids objectstore_tool $dir $second SOMETHING list-attrs + wait_background pids + return_code=$? 
+ if [ $return_code -ne 0 ]; then return $return_code; fi + + rados --pool $poolname get SOMETHING $dir/COPY || return 1 + diff $dir/ORIGINAL $dir/COPY || return 1 +} + +# +# 1) add an object +# 2) remove the corresponding file from a designated OSD +# 3) repair the PG +# 4) check that the file has been restored in the designated OSD +# +function corrupt_and_repair_one() { + local dir=$1 + local poolname=$2 + local osd=$3 + + # + # 1) remove the corresponding file from the OSD + # + objectstore_tool $dir $osd SOMETHING remove || return 1 + # + # 2) repair the PG + # + local pg=$(get_pg $poolname SOMETHING) + repair $pg + # + # 3) The file must be back + # + objectstore_tool $dir $osd SOMETHING list-attrs || return 1 + rados --pool $poolname get SOMETHING $dir/COPY || return 1 + diff $dir/ORIGINAL $dir/COPY || return 1 +} + +function corrupt_and_repair_erasure_coded() { + local dir=$1 + local poolname=$2 + + add_something $dir $poolname || return 1 + + local primary=$(get_primary $poolname SOMETHING) + local -a osds=($(get_osds $poolname SOMETHING | sed -e "s/$primary//")) + local not_primary_first=${osds[0]} + local not_primary_second=${osds[1]} + + # Reproduces http://tracker.ceph.com/issues/10017 + corrupt_and_repair_one $dir $poolname $primary || return 1 + # Reproduces http://tracker.ceph.com/issues/10409 + corrupt_and_repair_one $dir $poolname $not_primary_first || return 1 + corrupt_and_repair_two $dir $poolname $not_primary_first $not_primary_second || return 1 + corrupt_and_repair_two $dir $poolname $primary $not_primary_first || return 1 + +} + +function create_ec_pool() { + local pool_name=$1 + local allow_overwrites=$2 + + ceph osd erasure-code-profile set myprofile crush-failure-domain=osd $3 $4 $5 $6 $7 || return 1 + + ceph osd pool create "$poolname" 1 1 erasure myprofile || return 1 + + if [ "$allow_overwrites" = "true" ]; then + ceph osd pool set "$poolname" allow_ec_overwrites true || return 1 + fi + + wait_for_clean || return 1 + return 0 +} + +function auto_repair_erasure_coded() { + local dir=$1 + local allow_overwrites=$2 + local poolname=ecpool + + # Launch a cluster with 5 seconds scrub interval + setup $dir || return 1 + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + local ceph_osd_args="--osd-scrub-auto-repair=true \ + --osd-deep-scrub-interval=5 \ + --osd-scrub-max-interval=5 \ + --osd-scrub-min-interval=5 \ + --osd-scrub-interval-randomize-ratio=0" + for id in $(seq 0 2) ; do + if [ "$allow_overwrites" = "true" ]; then + run_osd_bluestore $dir $id $ceph_osd_args || return 1 + else + run_osd $dir $id $ceph_osd_args || return 1 + fi + done + create_rbd_pool || return 1 + wait_for_clean || return 1 + + # Create an EC pool + create_ec_pool $poolname $allow_overwrites k=2 m=1 || return 1 + + # Put an object + local payload=ABCDEF + echo $payload > $dir/ORIGINAL + rados --pool $poolname put SOMETHING $dir/ORIGINAL || return 1 + + # Remove the object from one shard physically + # Restarted osd get $ceph_osd_args passed + objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING remove || return 1 + # Wait for auto repair + local pgid=$(get_pg $poolname SOMETHING) + wait_for_scrub $pgid "$(get_last_scrub_stamp $pgid)" + wait_for_clean || return 1 + # Verify - the file should be back + # Restarted osd get $ceph_osd_args passed + objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING list-attrs || return 1 + rados --pool $poolname get SOMETHING $dir/COPY || return 1 + diff $dir/ORIGINAL $dir/COPY || return 1 + + # Tear down + 
teardown $dir || return 1 +} + +function TEST_auto_repair_erasure_coded_appends() { + auto_repair_erasure_coded $1 false +} + +function TEST_auto_repair_erasure_coded_overwrites() { + if [ "$use_ec_overwrite" = "true" ]; then + auto_repair_erasure_coded $1 true + fi +} + +function corrupt_and_repair_jerasure() { + local dir=$1 + local allow_overwrites=$2 + local poolname=ecpool + + setup $dir || return 1 + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + for id in $(seq 0 3) ; do + if [ "$allow_overwrites" = "true" ]; then + run_osd_bluestore $dir $id || return 1 + else + run_osd $dir $id || return 1 + fi + done + create_rbd_pool || return 1 + wait_for_clean || return 1 + + create_ec_pool $poolname $allow_overwrites k=2 m=2 || return 1 + corrupt_and_repair_erasure_coded $dir $poolname || return 1 + + teardown $dir || return 1 +} + +function TEST_corrupt_and_repair_jerasure_appends() { + corrupt_and_repair_jerasure $1 +} + +function TEST_corrupt_and_repair_jerasure_overwrites() { + if [ "$use_ec_overwrite" = "true" ]; then + corrupt_and_repair_jerasure $1 true + fi +} + +function corrupt_and_repair_lrc() { + local dir=$1 + local allow_overwrites=$2 + local poolname=ecpool + + setup $dir || return 1 + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + for id in $(seq 0 9) ; do + if [ "$allow_overwrites" = "true" ]; then + run_osd_bluestore $dir $id || return 1 + else + run_osd $dir $id || return 1 + fi + done + create_rbd_pool || return 1 + wait_for_clean || return 1 + + create_ec_pool $poolname $allow_overwrites k=4 m=2 l=3 plugin=lrc || return 1 + corrupt_and_repair_erasure_coded $dir $poolname || return 1 + + teardown $dir || return 1 +} + +function TEST_corrupt_and_repair_lrc_appends() { + corrupt_and_repair_jerasure $1 +} + +function TEST_corrupt_and_repair_lrc_overwrites() { + if [ "$use_ec_overwrite" = "true" ]; then + corrupt_and_repair_jerasure $1 true + fi +} + +function unfound_erasure_coded() { + local dir=$1 + local allow_overwrites=$2 + local poolname=ecpool + local payload=ABCDEF + + setup $dir || return 1 + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + for id in $(seq 0 3) ; do + if [ "$allow_overwrites" = "true" ]; then + run_osd_bluestore $dir $id || return 1 + else + run_osd $dir $id || return 1 + fi + done + create_rbd_pool || return 1 + wait_for_clean || return 1 + + create_ec_pool $poolname $allow_overwrites k=2 m=2 || return 1 + + add_something $dir $poolname || return 1 + + local primary=$(get_primary $poolname SOMETHING) + local -a osds=($(get_osds $poolname SOMETHING | sed -e "s/$primary//")) + local not_primary_first=${osds[0]} + local not_primary_second=${osds[1]} + local not_primary_third=${osds[2]} + + # + # 1) remove the corresponding file from the OSDs + # + pids="" + run_in_background pids objectstore_tool $dir $not_primary_first SOMETHING remove + run_in_background pids objectstore_tool $dir $not_primary_second SOMETHING remove + run_in_background pids objectstore_tool $dir $not_primary_third SOMETHING remove + wait_background pids + return_code=$? 
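# Sketch of the EC pool setup shared by the erasure-coded tests above; the
# profile/pool names and k/m values mirror create_ec_pool, and per the note at
# the top of this file the overwrite variants are only run on bluestore OSDs:
#   ceph osd erasure-code-profile set myprofile crush-failure-domain=osd k=2 m=1
#   ceph osd pool create ecpool 1 1 erasure myprofile
#   ceph osd pool set ecpool allow_ec_overwrites true   # overwrite variants only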
+ if [ $return_code -ne 0 ]; then return $return_code; fi + + # + # 2) repair the PG + # + local pg=$(get_pg $poolname SOMETHING) + repair $pg + # + # 3) check pg state + # + # it may take a bit to appear due to mon/mgr asynchrony + for f in `seq 1 60`; do + ceph -s | grep "1/1 unfound" && break + sleep 1 + done + ceph -s|grep "4 osds: 4 up, 4 in" || return 1 + ceph -s|grep "1/1 unfound" || return 1 + + teardown $dir || return 1 +} + +function TEST_unfound_erasure_coded_appends() { + unfound_erasure_coded $1 +} + +function TEST_unfound_erasure_coded_overwrites() { + if [ "$use_ec_overwrite" = "true" ]; then + unfound_erasure_coded $1 true + fi +} + +# +# list_missing for EC pool +# +function list_missing_erasure_coded() { + local dir=$1 + local allow_overwrites=$2 + local poolname=ecpool + + setup $dir || return 1 + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + for id in $(seq 0 2) ; do + if [ "$allow_overwrites" = "true" ]; then + run_osd_bluestore $dir $id || return 1 + else + run_osd $dir $id || return 1 + fi + done + create_rbd_pool || return 1 + wait_for_clean || return 1 + + create_ec_pool $poolname $allow_overwrites k=2 m=1 || return 1 + + # Put an object and remove the two shards (including primary) + add_something $dir $poolname MOBJ0 || return 1 + local -a osds0=($(get_osds $poolname MOBJ0)) + + # Put another object and remove two shards (excluding primary) + add_something $dir $poolname MOBJ1 || return 1 + local -a osds1=($(get_osds $poolname MOBJ1)) + + # Stop all osd daemons + for id in $(seq 0 2) ; do + kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1 + done + + id=${osds0[0]} + ceph-objectstore-tool --data-path $dir/$id \ + MOBJ0 remove || return 1 + id=${osds0[1]} + ceph-objectstore-tool --data-path $dir/$id \ + MOBJ0 remove || return 1 + + id=${osds1[1]} + ceph-objectstore-tool --data-path $dir/$id \ + MOBJ1 remove || return 1 + id=${osds1[2]} + ceph-objectstore-tool --data-path $dir/$id \ + MOBJ1 remove || return 1 + + for id in $(seq 0 2) ; do + activate_osd $dir $id >&2 || return 1 + done + create_rbd_pool || return 1 + wait_for_clean || return 1 + + # Get get - both objects should in the same PG + local pg=$(get_pg $poolname MOBJ0) + + # Repair the PG, which triggers the recovering, + # and should mark the object as unfound + repair $pg + + for i in $(seq 0 120) ; do + [ $i -lt 60 ] || return 1 + matches=$(ceph pg $pg list_missing | egrep "MOBJ0|MOBJ1" | wc -l) + [ $matches -eq 2 ] && break + done + + teardown $dir || return 1 +} + +function TEST_list_missing_erasure_coded_appends() { + list_missing_erasure_coded $1 false +} + +function TEST_list_missing_erasure_coded_overwrites() { + if [ "$use_ec_overwrite" = "true" ]; then + list_missing_erasure_coded $1 true + fi +} + +# +# Corrupt one copy of a replicated pool +# +function TEST_corrupt_scrub_replicated() { + local dir=$1 + local poolname=csr_pool + local total_objs=15 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=2 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + run_osd $dir 1 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + + ceph osd pool create foo 1 || return 1 + ceph osd pool create $poolname 1 1 || return 1 + wait_for_clean || return 1 + + for i in $(seq 1 $total_objs) ; do + objname=ROBJ${i} + add_something $dir $poolname $objname || return 1 + + rados --pool $poolname setomapheader $objname hdr-$objname || return 1 + rados --pool $poolname setomapval $objname key-$objname val-$objname || return 1 + done + + local 
pg=$(get_pg $poolname ROBJ0) + + # Compute an old omap digest and save oi + CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) \ + config set osd_deep_scrub_update_digest_min_age 0 + CEPH_ARGS='' ceph daemon $(get_asok_path osd.1) \ + config set osd_deep_scrub_update_digest_min_age 0 + pg_deep_scrub $pg + + for i in $(seq 1 $total_objs) ; do + objname=ROBJ${i} + + # Alternate corruption between osd.0 and osd.1 + local osd=$(expr $i % 2) + + case $i in + 1) + # Size (deep scrub data_digest too) + local payload=UVWXYZZZ + echo $payload > $dir/CORRUPT + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 2) + # digest (deep scrub only) + local payload=UVWXYZ + echo $payload > $dir/CORRUPT + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 3) + # missing + objectstore_tool $dir $osd $objname remove || return 1 + ;; + + 4) + # Modify omap value (deep scrub only) + objectstore_tool $dir $osd $objname set-omap key-$objname $dir/CORRUPT || return 1 + ;; + + 5) + # Delete omap key (deep scrub only) + objectstore_tool $dir $osd $objname rm-omap key-$objname || return 1 + ;; + + 6) + # Add extra omap key (deep scrub only) + echo extra > $dir/extra-val + objectstore_tool $dir $osd $objname set-omap key2-$objname $dir/extra-val || return 1 + rm $dir/extra-val + ;; + + 7) + # Modify omap header (deep scrub only) + echo -n newheader > $dir/hdr + objectstore_tool $dir $osd $objname set-omaphdr $dir/hdr || return 1 + rm $dir/hdr + ;; + + 8) + rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1 + rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1 + + # Break xattrs + echo -n bad-val > $dir/bad-val + objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1 + objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1 + echo -n val3-$objname > $dir/newval + objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1 + rm $dir/bad-val $dir/newval + ;; + + 9) + objectstore_tool $dir $osd $objname get-attr _ > $dir/robj9-oi + echo -n D > $dir/change + rados --pool $poolname put $objname $dir/change + objectstore_tool $dir $osd $objname set-attr _ $dir/robj9-oi + rm $dir/oi $dir/change + ;; + + # ROBJ10 must be handled after digests are re-computed by a deep scrub below + # ROBJ11 must be handled with config change before deep scrub + # ROBJ12 must be handled with config change before scrubs + # ROBJ13 must be handled before scrubs + + 14) + echo -n bad-val > $dir/bad-val + objectstore_tool $dir 0 $objname set-attr _ $dir/bad-val || return 1 + objectstore_tool $dir 1 $objname rm-attr _ || return 1 + rm $dir/bad-val + ;; + + 15) + objectstore_tool $dir $osd $objname rm-attr _ || return 1 + + esac + done + + local pg=$(get_pg $poolname ROBJ0) + + set_config osd 0 filestore_debug_inject_read_err true || return 1 + set_config osd 1 filestore_debug_inject_read_err true || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.1) \ + injectdataerr $poolname ROBJ11 || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) \ + injectmdataerr $poolname ROBJ12 || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) \ + injectmdataerr $poolname ROBJ13 || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.1) \ + injectdataerr $poolname ROBJ13 || return 1 + + pg_scrub $pg + + rados list-inconsistent-pg $poolname > $dir/json || return 1 + # Check pg count + test $(jq '. 
| length' $dir/json) = "1" || return 1 + # Check pgid + test $(jq -r '.[0]' $dir/json) = $pg || return 1 + + rados list-inconsistent-obj $pg > $dir/json || return 1 + # Get epoch for repair-get requests + epoch=$(jq .epoch $dir/json) + + jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson +{ + "inconsistents": [ + { + "shards": [ + { + "size": 7, + "errors": [], + "osd": 0 + }, + { + "size": 9, + "errors": [ + "size_mismatch_oi" + ], + "osd": 1 + } + ], + "selected_object_info": "3:ce3f1d6a:::ROBJ1:head(47'54 osd.0.0:53 dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [0 0 0])", + "union_shard_errors": [ + "size_mismatch_oi" + ], + "errors": [ + "size_mismatch" + ], + "object": { + "version": 3, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ1" + } + }, + { + "shards": [ + { + "errors": [ + "stat_error" + ], + "osd": 0 + }, + { + "size": 7, + "errors": [], + "osd": 1 + } + ], + "selected_object_info": "3:bc819597:::ROBJ12:head(47'52 osd.0.0:51 dirty|omap|data_digest|omap_digest s 7 uv 36 dd 2ddbf8f5 od 67f306a alloc_hint [0 0 0])", + "union_shard_errors": [ + "stat_error" + ], + "errors": [], + "object": { + "version": 36, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ12" + } + }, + { + "shards": [ + { + "errors": [ + "stat_error" + ], + "osd": 0 + }, + { + "size": 7, + "errors": [], + "osd": 1 + } + ], + "selected_object_info": "3:d60617f9:::ROBJ13:head(47'55 osd.0.0:54 dirty|omap|data_digest|omap_digest s 7 uv 39 dd 2ddbf8f5 od 6441854d alloc_hint [0 0 0])", + "union_shard_errors": [ + "stat_error" + ], + "errors": [], + "object": { + "version": 39, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ13" + } + }, + { + "shards": [ + { + "size": 7, + "errors": [ + "oi_attr_corrupted" + ], + "osd": 0 + }, + { + "size": 7, + "errors": [ + "oi_attr_missing" + ], + "osd": 1 + } + ], + "union_shard_errors": [ + "oi_attr_missing", + "oi_attr_corrupted" + ], + "errors": [], + "object": { + "version": 0, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ14" + } + }, + { + "shards": [ + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "size": 7, + "errors": [], + "osd": 0 + }, + { + "attrs": [ + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "size": 7, + "errors": [ + "oi_attr_missing" + ], + "osd": 1 + } + ], + "selected_object_info": "3:30259878:::ROBJ15:head(47'46 osd.0.0:45 dirty|omap|data_digest|omap_digest s 7 uv 45 dd 2ddbf8f5 od 2d2a4d6e alloc_hint [0 0 0])", + "union_shard_errors": [ + "oi_attr_missing" + ], + "errors": [ + "attr_name_mismatch" + ], + "object": { + "version": 45, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ15" + } + }, + { + "shards": [ + { + "size": 7, + "errors": [], + "osd": 0 + }, + { + "errors": [ + "missing" + ], + "osd": 1 + } + ], + "selected_object_info": "3:f2a5b2a4:::ROBJ3:head(47'57 osd.0.0:56 dirty|omap|data_digest|omap_digest s 7 uv 9 dd 2ddbf8f5 od b35dfd alloc_hint [0 0 0])", + "union_shard_errors": [ + "missing" + ], + "errors": [], + "object": { + "version": 9, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ3" + } + }, + { + "shards": [ + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": false, + "value": "bad-val", + "name": "_key1-ROBJ8" + }, 
+ { + "Base64": false, + "value": "val3-ROBJ8", + "name": "_key3-ROBJ8" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "size": 7, + "errors": [], + "osd": 0 + }, + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": false, + "value": "val1-ROBJ8", + "name": "_key1-ROBJ8" + }, + { + "Base64": false, + "value": "val2-ROBJ8", + "name": "_key2-ROBJ8" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "size": 7, + "errors": [], + "osd": 1 + } + ], + "selected_object_info": "3:86586531:::ROBJ8:head(82'62 client.4351.0:1 dirty|omap|data_digest|omap_digest s 7 uv 62 dd 2ddbf8f5 od d6be81dc alloc_hint [0 0 0])", + "union_shard_errors": [], + "errors": [ + "attr_value_mismatch", + "attr_name_mismatch" + ], + "object": { + "version": 62, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ8" + } + }, + { + "shards": [ + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "object_info": "3:ffdb2004:::ROBJ9:head(102'63 client.4433.0:1 dirty|omap|data_digest|omap_digest s 1 uv 63 dd 2b63260d od 2eecc539 alloc_hint [0 0 0])", + "size": 1, + "errors": [], + "osd": 0 + }, + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "object_info": "3:ffdb2004:::ROBJ9:head(47'60 osd.0.0:59 dirty|omap|data_digest|omap_digest s 7 uv 27 dd 2ddbf8f5 od 2eecc539 alloc_hint [0 0 0])", + "size": 1, + "errors": [], + "osd": 1 + } + ], + "selected_object_info": "3:ffdb2004:::ROBJ9:head(102'63 client.4433.0:1 dirty|omap|data_digest|omap_digest s 1 uv 63 dd 2b63260d od 2eecc539 alloc_hint [0 0 0])", + "union_shard_errors": [], + "errors": [ + "object_info_inconsistency", + "attr_value_mismatch" + ], + "object": { + "version": 63, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ9" + } + } + ], + "epoch": 0 +} +EOF + + jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson + diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1 + if test $getjson = "yes" + then + jq '.' 
$dir/json > save1.json + fi + + if which jsonschema > /dev/null; + then + jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1 + fi + + objname=ROBJ9 + # Change data and size again because digest was recomputed + echo -n ZZZ > $dir/change + rados --pool $poolname put $objname $dir/change + # Set one to an even older value + objectstore_tool $dir 0 $objname set-attr _ $dir/robj9-oi + rm $dir/oi $dir/change + + objname=ROBJ10 + objectstore_tool $dir 1 $objname get-attr _ > $dir/oi + rados --pool $poolname setomapval $objname key2-$objname val2-$objname + objectstore_tool $dir 0 $objname set-attr _ $dir/oi + objectstore_tool $dir 1 $objname set-attr _ $dir/oi + rm $dir/oi + + set_config osd 0 filestore_debug_inject_read_err true || return 1 + set_config osd 1 filestore_debug_inject_read_err true || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.1) \ + injectdataerr $poolname ROBJ11 || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) \ + injectmdataerr $poolname ROBJ12 || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.0) \ + injectmdataerr $poolname ROBJ13 || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.1) \ + injectdataerr $poolname ROBJ13 || return 1 + pg_deep_scrub $pg + + rados list-inconsistent-pg $poolname > $dir/json || return 1 + # Check pg count + test $(jq '. | length' $dir/json) = "1" || return 1 + # Check pgid + test $(jq -r '.[0]' $dir/json) = $pg || return 1 + + rados list-inconsistent-obj $pg > $dir/json || return 1 + # Get epoch for repair-get requests + epoch=$(jq .epoch $dir/json) + + jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson +{ + "inconsistents": [ + { + "shards": [ + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0xf5fba2c6", + "size": 7, + "errors": [], + "osd": 0 + }, + { + "data_digest": "0x2d4a11c2", + "omap_digest": "0xf5fba2c6", + "size": 9, + "errors": [ + "data_digest_mismatch_oi", + "size_mismatch_oi" + ], + "osd": 1 + } + ], + "selected_object_info": "3:ce3f1d6a:::ROBJ1:head(47'54 osd.0.0:53 dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [0 0 0])", + "union_shard_errors": [ + "data_digest_mismatch_oi", + "size_mismatch_oi" + ], + "errors": [ + "data_digest_mismatch", + "size_mismatch" + ], + "object": { + "version": 3, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ1" + } + }, + { + "shards": [ + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0xa8dd5adc", + "size": 7, + "errors": [ + "omap_digest_mismatch_oi" + ], + "osd": 0 + }, + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0xa8dd5adc", + "size": 7, + "errors": [ + "omap_digest_mismatch_oi" + ], + "osd": 1 + } + ], + "selected_object_info": "3:b1f19cbd:::ROBJ10:head(47'51 osd.0.0:50 dirty|omap|data_digest|omap_digest s 7 uv 30 dd 2ddbf8f5 od c2025a24 alloc_hint [0 0 0])", + "union_shard_errors": [ + "omap_digest_mismatch_oi" + ], + "errors": [], + "object": { + "version": 30, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ10" + } + }, + { + "shards": [ + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0xa03cef03", + "size": 7, + "errors": [], + "osd": 0 + }, + { + "size": 7, + "errors": [ + "read_error" + ], + "osd": 1 + } + ], + "selected_object_info": "3:87abbf36:::ROBJ11:head(47'48 osd.0.0:47 dirty|omap|data_digest|omap_digest s 7 uv 33 dd 2ddbf8f5 od a03cef03 alloc_hint [0 0 0])", + "union_shard_errors": [ + "read_error" + ], + "errors": [], + "object": { + "version": 
33, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ11" + } + }, + { + "shards": [ + { + "errors": [ + "stat_error" + ], + "osd": 0 + }, + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0x067f306a", + "size": 7, + "errors": [], + "osd": 1 + } + ], + "selected_object_info": "3:bc819597:::ROBJ12:head(47'52 osd.0.0:51 dirty|omap|data_digest|omap_digest s 7 uv 36 dd 2ddbf8f5 od 67f306a alloc_hint [0 0 0])", + "union_shard_errors": [ + "stat_error" + ], + "errors": [], + "object": { + "version": 36, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ12" + } + }, + { + "shards": [ + { + "errors": [ + "stat_error" + ], + "osd": 0 + }, + { + "size": 7, + "errors": [ + "read_error" + ], + "osd": 1 + } + ], + "union_shard_errors": [ + "stat_error", + "read_error" + ], + "errors": [], + "object": { + "version": 0, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ13" + } + }, + { + "shards": [ + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0x4f14f849", + "size": 7, + "errors": [ + "oi_attr_corrupted" + ], + "osd": 0 + }, + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0x4f14f849", + "size": 7, + "errors": [ + "oi_attr_missing" + ], + "osd": 1 + } + ], + "union_shard_errors": [ + "oi_attr_missing", + "oi_attr_corrupted" + ], + "errors": [], + "object": { + "version": 0, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ14" + } + }, + { + "shards": [ + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "data_digest": "0x2ddbf8f5", + "omap_digest": "0x2d2a4d6e", + "size": 7, + "errors": [], + "osd": 0 + }, + { + "attrs": [ + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "data_digest": "0x2ddbf8f5", + "omap_digest": "0x2d2a4d6e", + "size": 7, + "errors": [ + "oi_attr_missing" + ], + "osd": 1 + } + ], + "selected_object_info": "3:30259878:::ROBJ15:head(47'46 osd.0.0:45 dirty|omap|data_digest|omap_digest s 7 uv 45 dd 2ddbf8f5 od 2d2a4d6e alloc_hint [0 0 0])", + "union_shard_errors": [ + "oi_attr_missing" + ], + "errors": [ + "attr_name_mismatch" + ], + "object": { + "version": 45, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ15" + } + }, + { + "shards": [ + { + "data_digest": "0x578a4830", + "omap_digest": "0xf8e11918", + "size": 7, + "errors": [ + "data_digest_mismatch_oi" + ], + "osd": 0 + }, + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0xf8e11918", + "size": 7, + "errors": [], + "osd": 1 + } + ], + "selected_object_info": "3:e97ce31e:::ROBJ2:head(47'56 osd.0.0:55 dirty|omap|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od f8e11918 alloc_hint [0 0 0])", + "union_shard_errors": [ + "data_digest_mismatch_oi" + ], + "errors": [ + "data_digest_mismatch" + ], + "object": { + "version": 6, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ2" + } + }, + { + "shards": [ + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0x00b35dfd", + "size": 7, + "errors": [], + "osd": 0 + }, + { + "errors": [ + "missing" + ], + "osd": 1 + } + ], + "selected_object_info": "3:f2a5b2a4:::ROBJ3:head(47'57 osd.0.0:56 dirty|omap|data_digest|omap_digest s 7 uv 9 dd 2ddbf8f5 od b35dfd alloc_hint [0 0 0])", + "union_shard_errors": [ + "missing" + ], + "errors": [], + "object": { + "version": 9, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ3" + } + }, + { + "shards": [ + { + "data_digest": 
"0x2ddbf8f5", + "omap_digest": "0xd7178dfe", + "size": 7, + "errors": [ + "omap_digest_mismatch_oi" + ], + "osd": 0 + }, + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0xe2d46ea4", + "size": 7, + "errors": [], + "osd": 1 + } + ], + "selected_object_info": "3:f4981d31:::ROBJ4:head(47'58 osd.0.0:57 dirty|omap|data_digest|omap_digest s 7 uv 12 dd 2ddbf8f5 od e2d46ea4 alloc_hint [0 0 0])", + "union_shard_errors": [ + "omap_digest_mismatch_oi" + ], + "errors": [ + "omap_digest_mismatch" + ], + "object": { + "version": 12, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ4" + } + }, + { + "shards": [ + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0x1a862a41", + "size": 7, + "errors": [], + "osd": 0 + }, + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0x06cac8f6", + "size": 7, + "errors": [ + "omap_digest_mismatch_oi" + ], + "osd": 1 + } + ], + "selected_object_info": "3:f4bfd4d1:::ROBJ5:head(47'59 osd.0.0:58 dirty|omap|data_digest|omap_digest s 7 uv 15 dd 2ddbf8f5 od 1a862a41 alloc_hint [0 0 0])", + "union_shard_errors": [ + "omap_digest_mismatch_oi" + ], + "errors": [ + "omap_digest_mismatch" + ], + "object": { + "version": 15, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ5" + } + }, + { + "shards": [ + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0x689ee887", + "size": 7, + "errors": [ + "omap_digest_mismatch_oi" + ], + "osd": 0 + }, + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0x179c919f", + "size": 7, + "errors": [], + "osd": 1 + } + ], + "selected_object_info": "3:a53c12e8:::ROBJ6:head(47'50 osd.0.0:49 dirty|omap|data_digest|omap_digest s 7 uv 18 dd 2ddbf8f5 od 179c919f alloc_hint [0 0 0])", + "union_shard_errors": [ + "omap_digest_mismatch_oi" + ], + "errors": [ + "omap_digest_mismatch" + ], + "object": { + "version": 18, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ6" + } + }, + { + "shards": [ + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0xefced57a", + "size": 7, + "errors": [], + "osd": 0 + }, + { + "data_digest": "0x2ddbf8f5", + "omap_digest": "0x6a73cc07", + "size": 7, + "errors": [ + "omap_digest_mismatch_oi" + ], + "osd": 1 + } + ], + "selected_object_info": "3:8b55fa4b:::ROBJ7:head(47'49 osd.0.0:48 dirty|omap|data_digest|omap_digest s 7 uv 21 dd 2ddbf8f5 od efced57a alloc_hint [0 0 0])", + "union_shard_errors": [ + "omap_digest_mismatch_oi" + ], + "errors": [ + "omap_digest_mismatch" + ], + "object": { + "version": 21, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ7" + } + }, + { + "shards": [ + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": false, + "value": "bad-val", + "name": "_key1-ROBJ8" + }, + { + "Base64": false, + "value": "val3-ROBJ8", + "name": "_key3-ROBJ8" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "data_digest": "0x2ddbf8f5", + "omap_digest": "0xd6be81dc", + "size": 7, + "errors": [], + "osd": 0 + }, + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": false, + "value": "val1-ROBJ8", + "name": "_key1-ROBJ8" + }, + { + "Base64": false, + "value": "val2-ROBJ8", + "name": "_key2-ROBJ8" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "data_digest": "0x2ddbf8f5", + "omap_digest": "0xd6be81dc", + "size": 7, + "errors": [], + "osd": 1 + } + ], + "selected_object_info": "3:86586531:::ROBJ8:head(82'62 client.4351.0:1 
dirty|omap|data_digest|omap_digest s 7 uv 62 dd 2ddbf8f5 od d6be81dc alloc_hint [0 0 0])", + "union_shard_errors": [], + "errors": [ + "attr_value_mismatch", + "attr_name_mismatch" + ], + "object": { + "version": 62, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ8" + } + }, + { + "shards": [ + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "object_info": "3:ffdb2004:::ROBJ9:head(47'60 osd.0.0:59 dirty|omap|data_digest|omap_digest s 7 uv 27 dd 2ddbf8f5 od 2eecc539 alloc_hint [0 0 0])", + "data_digest": "0x1f26fb26", + "omap_digest": "0x2eecc539", + "size": 3, + "errors": [], + "osd": 0 + }, + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "object_info": "3:ffdb2004:::ROBJ9:head(122'64 client.4532.0:1 dirty|omap|data_digest|omap_digest s 3 uv 64 dd 1f26fb26 od 2eecc539 alloc_hint [0 0 0])", + "data_digest": "0x1f26fb26", + "omap_digest": "0x2eecc539", + "size": 3, + "errors": [], + "osd": 1 + } + ], + "selected_object_info": "3:ffdb2004:::ROBJ9:head(122'64 client.4532.0:1 dirty|omap|data_digest|omap_digest s 3 uv 64 dd 1f26fb26 od 2eecc539 alloc_hint [0 0 0])", + "union_shard_errors": [], + "errors": [ + "object_info_inconsistency", + "attr_value_mismatch" + ], + "object": { + "version": 64, + "snap": "head", + "locator": "", + "nspace": "", + "name": "ROBJ9" + } + } + ], + "epoch": 0 +} +EOF + + jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson + diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1 + if test $getjson = "yes" + then + jq '.' 
$dir/json > save2.json + fi + + if which jsonschema > /dev/null; + then + jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1 + fi + + rados rmpool $poolname $poolname --yes-i-really-really-mean-it + teardown $dir || return 1 +} + + +# +# Test scrub errors for an erasure coded pool +# +function corrupt_scrub_erasure() { + local dir=$1 + local allow_overwrites=$2 + local poolname=ecpool + local total_objs=5 + + setup $dir || return 1 + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + for id in $(seq 0 2) ; do + if [ "$allow_overwrites" = "true" ]; then + run_osd_bluestore $dir $id || return 1 + else + run_osd $dir $id || return 1 + fi + done + create_rbd_pool || return 1 + ceph osd pool create foo 1 + + create_ec_pool $poolname $allow_overwrites k=2 m=1 stripe_unit=2K --force || return 1 + wait_for_clean || return 1 + + for i in $(seq 1 $total_objs) ; do + objname=EOBJ${i} + add_something $dir $poolname $objname || return 1 + + local osd=$(expr $i % 2) + + case $i in + 1) + # Size (deep scrub data_digest too) + local payload=UVWXYZZZ + echo $payload > $dir/CORRUPT + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 2) + # Corrupt EC shard + dd if=/dev/urandom of=$dir/CORRUPT bs=2048 count=1 + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 3) + # missing + objectstore_tool $dir $osd $objname remove || return 1 + ;; + + 4) + rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1 + rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1 + + # Break xattrs + echo -n bad-val > $dir/bad-val + objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1 + objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1 + echo -n val3-$objname > $dir/newval + objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1 + rm $dir/bad-val $dir/newval + ;; + + 5) + # Corrupt EC shard + dd if=/dev/urandom of=$dir/CORRUPT bs=2048 count=2 + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + esac + done + + local pg=$(get_pg $poolname EOBJ0) + + pg_scrub $pg + + rados list-inconsistent-pg $poolname > $dir/json || return 1 + # Check pg count + test $(jq '. 
| length' $dir/json) = "1" || return 1 + # Check pgid + test $(jq -r '.[0]' $dir/json) = $pg || return 1 + + rados list-inconsistent-obj $pg > $dir/json || return 1 + # Get epoch for repair-get requests + epoch=$(jq .epoch $dir/json) + + jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson +{ + "inconsistents": [ + { + "shards": [ + { + "size": 2048, + "errors": [], + "shard": 2, + "osd": 0 + }, + { + "size": 9, + "shard": 0, + "errors": [ + "size_mismatch_oi" + ], + "osd": 1 + }, + { + "size": 2048, + "shard": 1, + "errors": [], + "osd": 2 + } + ], + "selected_object_info": "3:9175b684:::EOBJ1:head(21'1 client.4179.0:1 dirty|data_digest|omap_digest s 7 uv 1 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])", + "union_shard_errors": [ + "size_mismatch_oi" + ], + "errors": [ + "size_mismatch" + ], + "object": { + "version": 1, + "snap": "head", + "locator": "", + "nspace": "", + "name": "EOBJ1" + } + }, + { + "shards": [ + { + "size": 2048, + "errors": [], + "shard": 2, + "osd": 0 + }, + { + "shard": 0, + "errors": [ + "missing" + ], + "osd": 1 + }, + { + "size": 2048, + "shard": 1, + "errors": [], + "osd": 2 + } + ], + "selected_object_info": "3:b197b25d:::EOBJ3:head(37'3 client.4251.0:1 dirty|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])", + "union_shard_errors": [ + "missing" + ], + "errors": [], + "object": { + "version": 3, + "snap": "head", + "locator": "", + "nspace": "", + "name": "EOBJ3" + } + }, + { + "shards": [ + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": false, + "value": "bad-val", + "name": "_key1-EOBJ4" + }, + { + "Base64": false, + "value": "val3-EOBJ4", + "name": "_key3-EOBJ4" + }, + { + "Base64": true, + "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E", + "name": "hinfo_key" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "size": 2048, + "errors": [], + "shard": 2, + "osd": 0 + }, + { + "osd": 1, + "shard": 0, + "errors": [], + "size": 2048, + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": false, + "value": "val1-EOBJ4", + "name": "_key1-EOBJ4" + }, + { + "Base64": false, + "value": "val2-EOBJ4", + "name": "_key2-EOBJ4" + }, + { + "Base64": true, + "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E", + "name": "hinfo_key" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ] + }, + { + "osd": 2, + "shard": 1, + "errors": [], + "size": 2048, + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": false, + "value": "val1-EOBJ4", + "name": "_key1-EOBJ4" + }, + { + "Base64": false, + "value": "val2-EOBJ4", + "name": "_key2-EOBJ4" + }, + { + "Base64": true, + "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E", + "name": "hinfo_key" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ] + } + ], + "selected_object_info": "3:5e723e06:::EOBJ4:head(45'6 client.4289.0:1 dirty|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])", + "union_shard_errors": [], + "errors": [ + "attr_value_mismatch", + "attr_name_mismatch" + ], + "object": { + "version": 6, + "snap": "head", + "locator": "", + "nspace": "", + "name": "EOBJ4" + } + }, + { + "shards": [ + { + "size": 2048, + "errors": [], + "shard": 2, + "osd": 0 + }, + { + "size": 4096, + "shard": 0, + "errors": [ + "size_mismatch_oi" + ], + "osd": 
1 + }, + { + "size": 2048, + "shard": 1, + "errors": [], + "osd": 2 + } + ], + "selected_object_info": "3:8549dfb5:::EOBJ5:head(65'7 client.4441.0:1 dirty|data_digest|omap_digest s 7 uv 7 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])", + "union_shard_errors": [ + "size_mismatch_oi" + ], + "errors": [ + "size_mismatch" + ], + "object": { + "version": 7, + "snap": "head", + "locator": "", + "nspace": "", + "name": "EOBJ5" + } + } + ], + "epoch": 0 +} +EOF + + jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson + diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1 + if test $getjson = "yes" + then + jq '.' $dir/json > save3.json + fi + + if which jsonschema > /dev/null; + then + jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1 + fi + + pg_deep_scrub $pg + + rados list-inconsistent-pg $poolname > $dir/json || return 1 + # Check pg count + test $(jq '. | length' $dir/json) = "1" || return 1 + # Check pgid + test $(jq -r '.[0]' $dir/json) = $pg || return 1 + + rados list-inconsistent-obj $pg > $dir/json || return 1 + # Get epoch for repair-get requests + epoch=$(jq .epoch $dir/json) + + if [ "$allow_overwrites" = "true" ] + then + jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson +{ + "inconsistents": [ + { + "shards": [ + { + "data_digest": "0x00000000", + "omap_digest": "0xffffffff", + "size": 2048, + "errors": [], + "shard": 2, + "osd": 0 + }, + { + "size": 9, + "shard": 0, + "errors": [ + "read_error", + "size_mismatch_oi" + ], + "osd": 1 + }, + { + "data_digest": "0x00000000", + "omap_digest": "0xffffffff", + "size": 2048, + "shard": 1, + "errors": [], + "osd": 2 + } + ], + "selected_object_info": "3:9175b684:::EOBJ1:head(27'1 client.4155.0:1 dirty|data_digest|omap_digest s 7 uv 1 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])", + "union_shard_errors": [ + "read_error", + "size_mismatch_oi" + ], + "errors": [ + "size_mismatch" + ], + "object": { + "version": 1, + "snap": "head", + "locator": "", + "nspace": "", + "name": "EOBJ1" + } + }, + { + "shards": [ + { + "data_digest": "0x00000000", + "omap_digest": "0xffffffff", + "size": 2048, + "errors": [], + "shard": 2, + "osd": 0 + }, + { + "shard": 0, + "errors": [ + "missing" + ], + "osd": 1 + }, + { + "data_digest": "0x00000000", + "omap_digest": "0xffffffff", + "size": 2048, + "shard": 1, + "errors": [], + "osd": 2 + } + ], + "selected_object_info": "3:b197b25d:::EOBJ3:head(41'3 client.4199.0:1 dirty|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])", + "union_shard_errors": [ + "missing" + ], + "errors": [], + "object": { + "version": 3, + "snap": "head", + "locator": "", + "nspace": "", + "name": "EOBJ3" + } + }, + { + "shards": [ + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": false, + "value": "bad-val", + "name": "_key1-EOBJ4" + }, + { + "Base64": false, + "value": "val3-EOBJ4", + "name": "_key3-EOBJ4" + }, + { + "Base64": true, + "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E", + "name": "hinfo_key" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "data_digest": "0x00000000", + "omap_digest": "0xffffffff", + "size": 2048, + "errors": [], + "shard": 2, + "osd": 0 + }, + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": false, + "value": "val1-EOBJ4", + "name": "_key1-EOBJ4" + }, + { + "Base64": false, + "value": 
"val2-EOBJ4", + "name": "_key2-EOBJ4" + }, + { + "Base64": true, + "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E", + "name": "hinfo_key" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "data_digest": "0x00000000", + "omap_digest": "0xffffffff", + "size": 2048, + "errors": [], + "shard": 0, + "osd": 1 + }, + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": false, + "value": "val1-EOBJ4", + "name": "_key1-EOBJ4" + }, + { + "Base64": false, + "value": "val2-EOBJ4", + "name": "_key2-EOBJ4" + }, + { + "Base64": true, + "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E", + "name": "hinfo_key" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "data_digest": "0x00000000", + "omap_digest": "0xffffffff", + "size": 2048, + "errors": [], + "shard": 1, + "osd": 2 + } + ], + "selected_object_info": "3:5e723e06:::EOBJ4:head(48'6 client.4223.0:1 dirty|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])", + "union_shard_errors": [], + "errors": [ + "attr_value_mismatch", + "attr_name_mismatch" + ], + "object": { + "version": 6, + "snap": "head", + "locator": "", + "nspace": "", + "name": "EOBJ4" + } + }, + { + "shards": [ + { + "data_digest": "0x00000000", + "omap_digest": "0xffffffff", + "size": 2048, + "errors": [], + "shard": 2, + "osd": 0 + }, + { + "data_digest": "0x00000000", + "omap_digest": "0xffffffff", + "size": 4096, + "errors": [ + "size_mismatch_oi" + ], + "shard": 0, + "osd": 1 + }, + { + "data_digest": "0x00000000", + "omap_digest": "0xffffffff", + "size": 2048, + "errors": [], + "shard": 1, + "osd": 2 + } + ], + "selected_object_info": "3:8549dfb5:::EOBJ5:head(65'7 client.4288.0:1 dirty|data_digest|omap_digest s 7 uv 7 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])", + "union_shard_errors": [ + "size_mismatch_oi" + ], + "errors": [ + "size_mismatch" + ], + "object": { + "version": 7, + "snap": "head", + "locator": "", + "nspace": "", + "name": "EOBJ5" + } + } + ], + "epoch": 0 +} +EOF + + else + + jq "$jqfilter" << EOF | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/checkcsjson +{ + "inconsistents": [ + { + "shards": [ + { + "data_digest": "0x04cfa72f", + "omap_digest": "0xffffffff", + "size": 2048, + "errors": [], + "shard": 2, + "osd": 0 + }, + { + "size": 9, + "shard": 0, + "errors": [ + "read_error", + "size_mismatch_oi" + ], + "osd": 1 + }, + { + "data_digest": "0x04cfa72f", + "omap_digest": "0xffffffff", + "size": 2048, + "shard": 1, + "errors": [], + "osd": 2 + } + ], + "selected_object_info": "3:9175b684:::EOBJ1:head(21'1 client.4179.0:1 dirty|data_digest|omap_digest s 7 uv 1 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])", + "union_shard_errors": [ + "read_error", + "size_mismatch_oi" + ], + "errors": [ + "size_mismatch" + ], + "object": { + "version": 1, + "snap": "head", + "locator": "", + "nspace": "", + "name": "EOBJ1" + } + }, + { + "shards": [ + { + "size": 2048, + "errors": [ + "ec_hash_error" + ], + "shard": 2, + "osd": 0 + }, + { + "data_digest": "0x04cfa72f", + "omap_digest": "0xffffffff", + "size": 2048, + "errors": [], + "shard": 0, + "osd": 1 + }, + { + "data_digest": "0x04cfa72f", + "omap_digest": "0xffffffff", + "size": 2048, + "errors": [], + "shard": 1, + "osd": 2 + } + ], + "selected_object_info": "3:9babd184:::EOBJ2:head(29'2 client.4217.0:1 dirty|data_digest|omap_digest s 7 uv 2 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])", + "union_shard_errors": [ + 
"ec_hash_error" + ], + "errors": [], + "object": { + "version": 2, + "snap": "head", + "locator": "", + "nspace": "", + "name": "EOBJ2" + } + }, + { + "shards": [ + { + "data_digest": "0x04cfa72f", + "omap_digest": "0xffffffff", + "size": 2048, + "errors": [], + "shard": 2, + "osd": 0 + }, + { + "osd": 1, + "shard": 0, + "errors": [ + "missing" + ] + }, + { + "data_digest": "0x04cfa72f", + "omap_digest": "0xffffffff", + "size": 2048, + "shard": 1, + "errors": [], + "osd": 2 + } + ], + "selected_object_info": "3:b197b25d:::EOBJ3:head(37'3 client.4251.0:1 dirty|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])", + "union_shard_errors": [ + "missing" + ], + "errors": [], + "object": { + "version": 3, + "snap": "head", + "locator": "", + "nspace": "", + "name": "EOBJ3" + } + }, + { + "shards": [ + { + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": false, + "value": "bad-val", + "name": "_key1-EOBJ4" + }, + { + "Base64": false, + "value": "val3-EOBJ4", + "name": "_key3-EOBJ4" + }, + { + "Base64": true, + "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E", + "name": "hinfo_key" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ], + "data_digest": "0x04cfa72f", + "omap_digest": "0xffffffff", + "size": 2048, + "errors": [], + "shard": 2, + "osd": 0 + }, + { + "osd": 1, + "shard": 0, + "errors": [], + "size": 2048, + "omap_digest": "0xffffffff", + "data_digest": "0x04cfa72f", + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": false, + "value": "val1-EOBJ4", + "name": "_key1-EOBJ4" + }, + { + "Base64": false, + "value": "val2-EOBJ4", + "name": "_key2-EOBJ4" + }, + { + "Base64": true, + "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E", + "name": "hinfo_key" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ] + }, + { + "osd": 2, + "shard": 1, + "errors": [], + "size": 2048, + "omap_digest": "0xffffffff", + "data_digest": "0x04cfa72f", + "attrs": [ + { + "Base64": true, + "value": "", + "name": "_" + }, + { + "Base64": false, + "value": "val1-EOBJ4", + "name": "_key1-EOBJ4" + }, + { + "Base64": false, + "value": "val2-EOBJ4", + "name": "_key2-EOBJ4" + }, + { + "Base64": true, + "value": "AQEYAAAAAAgAAAAAAAADAAAAL6fPBLB8dlsvp88E", + "name": "hinfo_key" + }, + { + "Base64": true, + "value": "AwIdAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAA=", + "name": "snapset" + } + ] + } + ], + "selected_object_info": "3:5e723e06:::EOBJ4:head(45'6 client.4289.0:1 dirty|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])", + "union_shard_errors": [], + "errors": [ + "attr_value_mismatch", + "attr_name_mismatch" + ], + "object": { + "version": 6, + "snap": "head", + "locator": "", + "nspace": "", + "name": "EOBJ4" + } + }, + { + "shards": [ + { + "data_digest": "0x04cfa72f", + "omap_digest": "0xffffffff", + "size": 2048, + "errors": [], + "shard": 2, + "osd": 0 + }, + { + "size": 4096, + "shard": 0, + "errors": [ + "size_mismatch_oi", + "ec_size_error" + ], + "osd": 1 + }, + { + "data_digest": "0x04cfa72f", + "omap_digest": "0xffffffff", + "size": 2048, + "shard": 1, + "errors": [], + "osd": 2 + } + ], + "selected_object_info": "3:8549dfb5:::EOBJ5:head(65'7 client.4441.0:1 dirty|data_digest|omap_digest s 7 uv 7 dd 2ddbf8f5 od ffffffff alloc_hint [0 0 0])", + "union_shard_errors": [ + "size_mismatch_oi", + "ec_size_error" + ], + "errors": [ + "size_mismatch" + ], + 
"object": { + "version": 7, + "snap": "head", + "locator": "", + "nspace": "", + "name": "EOBJ5" + } + } + ], + "epoch": 0 +} +EOF + + fi + + jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson + diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1 + if test $getjson = "yes" + then + if [ "$allow_overwrites" = "true" ] + then + num=4 + else + num=5 + fi + jq '.' $dir/json > save${num}.json + fi + + if which jsonschema > /dev/null; + then + jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-obj.json || return 1 + fi + + rados rmpool $poolname $poolname --yes-i-really-really-mean-it + teardown $dir || return 1 +} + +function TEST_corrupt_scrub_erasure_appends() { + corrupt_scrub_erasure $1 false +} + +function TEST_corrupt_scrub_erasure_overwrites() { + if [ "$use_ec_overwrite" = "true" ]; then + corrupt_scrub_erasure $1 true + fi +} + +# +# Test to make sure that a periodic scrub won't cause deep-scrub info to be lost +# +function TEST_periodic_scrub_replicated() { + local dir=$1 + local poolname=psr_pool + local objname=POBJ + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=2 || return 1 + run_mgr $dir x || return 1 + local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0" + run_osd $dir 0 $ceph_osd_args || return 1 + run_osd $dir 1 $ceph_osd_args || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + + ceph osd pool create $poolname 1 1 || return 1 + wait_for_clean || return 1 + + local osd=0 + add_something $dir $poolname $objname scrub || return 1 + local primary=$(get_primary $poolname $objname) + local pg=$(get_pg $poolname $objname) + + # Add deep-scrub only error + local payload=UVWXYZ + echo $payload > $dir/CORRUPT + # Uses $ceph_osd_args for osd restart + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + + # No scrub information available, so expect failure + set -o pipefail + ! rados list-inconsistent-obj $pg | jq '.' || return 1 + set +o pipefail + + pg_deep_scrub $pg || return 1 + + # Make sure bad object found + rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1 + + local last_scrub=$(get_last_scrub_stamp $pg) + # Fake a schedule scrub + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) \ + trigger_scrub $pg || return 1 + # Wait for schedule regular scrub + wait_for_scrub $pg "$last_scrub" + + # It needed to be upgraded + grep -q "Deep scrub errors, upgrading scrub to deep-scrub" $dir/osd.${primary}.log || return 1 + + # Bad object still known + rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1 + + # Can't upgrade with this set + ceph osd set nodeep-scrub + # Let map change propagate to OSDs + sleep 2 + + # Fake a schedule scrub + local last_scrub=$(get_last_scrub_stamp $pg) + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) \ + trigger_scrub $pg || return 1 + # Wait for schedule regular scrub + # to notice scrub and skip it + local found=false + for i in $(seq 14 -1 0) + do + sleep 1 + ! grep -q "Regular scrub skipped due to deep-scrub errors and nodeep-scrub set" $dir/osd.${primary}.log || { found=true ; break; } + echo Time left: $i seconds + done + test $found = "true" || return 1 + + # Bad object still known + rados list-inconsistent-obj $pg | jq '.' 
| grep -q $objname || return 1 + + # Request a regular scrub and it will be done + local scrub_backoff_ratio=$(get_config osd ${primary} osd_scrub_backoff_ratio) + set_config osd ${primary} osd_scrub_backoff_ratio 0 + pg_scrub $pg + sleep 1 + set_config osd ${primary} osd_scrub_backoff_ratio $scrub_backoff_ratio + grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.${primary}.log || return 1 + + # deep-scrub error is no longer present + rados list-inconsistent-obj $pg | jq '.' | grep -qv $objname || return 1 +} + + +main osd-scrub-repair "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && \ +# test/osd/osd-scrub-repair.sh # TEST_corrupt_and_repair_replicated" +# End: diff -Nru ceph-12.1.1/qa/standalone/scrub/osd-scrub-snaps.sh ceph-12.1.2/qa/standalone/scrub/osd-scrub-snaps.sh --- ceph-12.1.1/qa/standalone/scrub/osd-scrub-snaps.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/standalone/scrub/osd-scrub-snaps.sh 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,481 @@ +#! /bin/bash +# +# Copyright (C) 2015 Red Hat +# +# Author: David Zafman +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7121" # git grep '\<7121\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + $func $dir || return 1 + done +} + +function TEST_scrub_snaps() { + local dir=$1 + local poolname=test + + TESTDATA="testdata.$$" + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x || return 1 + run_osd $dir 0 || return 1 + + create_rbd_pool || return 1 + wait_for_clean || return 1 + + # Create a pool with a single pg + ceph osd pool create $poolname 1 1 + poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }') + + dd if=/dev/urandom of=$TESTDATA bs=1032 count=1 + for i in `seq 1 15` + do + rados -p $poolname put obj${i} $TESTDATA + done + + SNAP=1 + rados -p $poolname mksnap snap${SNAP} + dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP} + rados -p $poolname put obj1 $TESTDATA + rados -p $poolname put obj5 $TESTDATA + rados -p $poolname put obj3 $TESTDATA + for i in `seq 6 14` + do rados -p $poolname put obj${i} $TESTDATA + done + + SNAP=2 + rados -p $poolname mksnap snap${SNAP} + dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP} + rados -p $poolname put obj5 $TESTDATA + + SNAP=3 + rados -p $poolname mksnap snap${SNAP} + dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP} + rados -p $poolname put obj3 $TESTDATA + + SNAP=4 + rados -p $poolname mksnap snap${SNAP} + dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP} + rados -p $poolname put obj5 $TESTDATA + rados -p $poolname put obj2 $TESTDATA + + SNAP=5 + rados -p $poolname mksnap snap${SNAP} + SNAP=6 + rados -p $poolname mksnap snap${SNAP} + dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP} + rados -p $poolname put 
obj5 $TESTDATA + + SNAP=7 + rados -p $poolname mksnap snap${SNAP} + + rados -p $poolname rm obj4 + rados -p $poolname rm obj2 + + kill_daemons $dir TERM osd || return 1 + + # Don't need to ceph_objectstore_tool function because osd stopped + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj1)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" --force remove + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":2)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":1)" + OBJ5SAVE="$JSON" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":4)" + dd if=/dev/urandom of=$TESTDATA bs=256 count=18 + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj3)" + dd if=/dev/urandom of=$TESTDATA bs=256 count=15 + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj4 | grep \"snapid\":7)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj2)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" rm-attr snapset + + # Create a clone which isn't in snapset and doesn't have object info + JSON="$(echo "$OBJ5SAVE" | sed s/snapid\":1/snapid\":7/)" + dd if=/dev/urandom of=$TESTDATA bs=256 count=7 + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA + + rm -f $TESTDATA + + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj6)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj7)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset corrupt + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj8)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset seq + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj9)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_size + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj10)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_overlap + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj11)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clones + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj12)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal 
"$JSON" clear-snapset head + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj13)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset snaps + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj14)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset size + + echo "garbage" > $dir/bad + JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --head --op list obj15)" + ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-attr snapset $dir/bad + rm -f $dir/bad + + run_osd $dir 0 || return 1 + create_rbd_pool || return 1 + wait_for_clean || return 1 + + local pgid="${poolid}.0" + if ! pg_scrub "$pgid" ; then + cat $dir/osd.0.log + return 1 + fi + grep 'log_channel' $dir/osd.0.log + + rados list-inconsistent-pg $poolname > $dir/json || return 1 + # Check pg count + test $(jq '. | length' $dir/json) = "1" || return 1 + # Check pgid + test $(jq -r '.[0]' $dir/json) = $pgid || return 1 + + rados list-inconsistent-snapset $pgid > $dir/json || return 1 + test $(jq '.inconsistents | length' $dir/json) = "21" || return 1 + + local jqfilter='.inconsistents' + local sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)' + + jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson +{ + "inconsistents": [ + { + "errors": [ + "headless" + ], + "snap": 1, + "locator": "", + "nspace": "", + "name": "obj1" + }, + { + "errors": [ + "size_mismatch" + ], + "snap": 1, + "locator": "", + "nspace": "", + "name": "obj10" + }, + { + "errors": [ + "headless" + ], + "snap": 1, + "locator": "", + "nspace": "", + "name": "obj11" + }, + { + "errors": [ + "size_mismatch" + ], + "snap": 1, + "locator": "", + "nspace": "", + "name": "obj14" + }, + { + "errors": [ + "headless" + ], + "snap": 1, + "locator": "", + "nspace": "", + "name": "obj6" + }, + { + "errors": [ + "headless" + ], + "snap": 1, + "locator": "", + "nspace": "", + "name": "obj7" + }, + { + "errors": [ + "size_mismatch" + ], + "snap": 1, + "locator": "", + "nspace": "", + "name": "obj9" + }, + { + "errors": [ + "headless" + ], + "snap": 4, + "locator": "", + "nspace": "", + "name": "obj2" + }, + { + "errors": [ + "size_mismatch" + ], + "snap": 4, + "locator": "", + "nspace": "", + "name": "obj5" + }, + { + "errors": [ + "headless" + ], + "snap": 7, + "locator": "", + "nspace": "", + "name": "obj2" + }, + { + "errors": [ + "oi_attr_missing", + "headless" + ], + "snap": 7, + "locator": "", + "nspace": "", + "name": "obj5" + }, + { + "extra clones": [ + 1 + ], + "errors": [ + "extra_clones" + ], + "snap": "head", + "locator": "", + "nspace": "", + "name": "obj11" + }, + { + "errors": [ + "head_mismatch" + ], + "snap": "head", + "locator": "", + "nspace": "", + "name": "obj12" + }, + { + "errors": [ + "ss_attr_corrupted" + ], + "snap": "head", + "locator": "", + "nspace": "", + "name": "obj15" + }, + { + "extra clones": [ + 7, + 4 + ], + "errors": [ + "ss_attr_missing", + "extra_clones" + ], + "snap": "head", + "locator": "", + "nspace": "", + "name": "obj2" + }, + { + "errors": [ + "size_mismatch" + ], + "snap": "head", + "locator": "", + "nspace": "", + "name": "obj3" + }, + { + "missing": [ + 7 + ], + "errors": [ + "clone_missing" + ], + "snap": "head", + "locator": "", + "nspace": "", + "name": "obj4" + }, + { + "missing": [ + 2, + 1 + ], + 
"extra clones": [ + 7 + ], + "errors": [ + "extra_clones", + "clone_missing" + ], + "snap": "head", + "locator": "", + "nspace": "", + "name": "obj5" + }, + { + "extra clones": [ + 1 + ], + "errors": [ + "extra_clones" + ], + "snap": "head", + "locator": "", + "nspace": "", + "name": "obj6" + }, + { + "extra clones": [ + 1 + ], + "errors": [ + "head_mismatch", + "extra_clones" + ], + "snap": "head", + "locator": "", + "nspace": "", + "name": "obj7" + }, + { + "errors": [ + "snapset_mismatch" + ], + "snap": "head", + "locator": "", + "nspace": "", + "name": "obj8" + } + ], + "epoch": 20 +} +EOF + + jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson + diff ${DIFFCOLOPTS} $dir/checkcsjson $dir/csjson || return 1 + + if which jsonschema > /dev/null; + then + jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-snap.json || return 1 + fi + + for i in `seq 1 7` + do + rados -p $poolname rmsnap snap$i + done + + ERRORS=0 + + pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') + pid=$(cat $pidfile) + if ! kill -0 $pid + then + echo "OSD crash occurred" + tail -100 $dir/osd.0.log + ERRORS=$(expr $ERRORS + 1) + fi + + kill_daemons $dir || return 1 + + declare -a err_strings + err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj10:.* is missing in clone_overlap" + err_strings[1]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:7 no '_' attr" + err_strings[2]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:7 is an unexpected clone" + err_strings[3]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:4 on disk size [(]4608[)] does not match object info size [(]512[)] adjusted for ondisk to [(]512[)]" + err_strings[4]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj5:head expected clone .*:::obj5:2" + err_strings[5]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj5:head expected clone .*:::obj5:1" + err_strings[6]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 .*:::obj5:head 2 missing clone[(]s[)]" + err_strings[7]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj12:head snapset.head_exists=false, but head exists" + err_strings[8]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj8:head snaps.seq not set" + err_strings[9]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj7:head snapset.head_exists=false, but head exists" + err_strings[10]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj7:1 is an unexpected clone" + err_strings[11]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj3:head on disk size [(]3840[)] does not match object info size [(]768[)] adjusted for ondisk to [(]768[)]" + err_strings[12]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj6:1 is an unexpected clone" + err_strings[13]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:head no 'snapset' attr" + err_strings[14]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:7 clone ignored due to missing snapset" + err_strings[15]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:4 clone ignored due to missing snapset" + err_strings[16]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj4:head expected clone .*:::obj4:7" + err_strings[17]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 .*:::obj4:head 1 missing clone[(]s[)]" + err_strings[18]="log_channel[(]cluster[)] log [[]ERR[]] : 
scrub [0-9]*[.]0 .*:::obj1:1 is an unexpected clone" + err_strings[19]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj9:1 is missing in clone_size" + err_strings[20]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj11:1 is an unexpected clone" + err_strings[21]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj14:1 size 1032 != clone_size 1033" + err_strings[22]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 23 errors" + err_strings[23]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj15:head can't decode 'snapset' attr buffer" + err_strings[24]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj12:1 has no oi or legacy_snaps; cannot convert 1=[[]1[]]:[[]1[]].stray_clone_snaps=[{]1=[[]1[]][}]" + + for i in `seq 0 ${#err_strings[@]}` + do + if ! grep "${err_strings[$i]}" $dir/osd.0.log > /dev/null; + then + echo "Missing log message '${err_strings[$i]}'" + ERRORS=$(expr $ERRORS + 1) + fi + done + + teardown $dir || return 1 + + if [ $ERRORS != "0" ]; + then + echo "TEST FAILED WITH $ERRORS ERRORS" + return 1 + fi + + echo "TEST PASSED" + return 0 +} + +main osd-scrub-snaps "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && \ +# test/osd/osd-scrub-snaps.sh" diff -Nru ceph-12.1.1/qa/suites/big/rados-thrash/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/big/rados-thrash/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/big/rados-thrash/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/big/rados-thrash/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/big/rados-thrash/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/big/rados-thrash/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/big/rados-thrash/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/big/rados-thrash/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/big/rados-thrash/thrashers/default.yaml ceph-12.1.2/qa/suites/big/rados-thrash/thrashers/default.yaml --- ceph-12.1.1/qa/suites/big/rados-thrash/thrashers/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/big/rados-thrash/thrashers/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost tasks: - thrashosds: diff -Nru ceph-12.1.1/qa/suites/ceph-deploy/basic/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/ceph-deploy/basic/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/ceph-deploy/basic/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 +++ 
ceph-12.1.2/qa/suites/ceph-deploy/basic/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,40 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/ceph-deploy/basic/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/ceph-deploy/basic/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/ceph-deploy/basic/objectstore/filestore-xfs.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/ceph-deploy/basic/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,15 @@ +overrides: + ceph: + fs: xfs + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/fs/32bits/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/fs/32bits/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/fs/32bits/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/32bits/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/fs/32bits/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/fs/32bits/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/fs/32bits/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/32bits/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/fs/32bits/overrides/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/suites/fs/32bits/overrides/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/suites/fs/32bits/overrides/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ 
ceph-12.1.2/qa/suites/fs/32bits/overrides/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ - - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/suites/fs/basic_functional/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/fs/basic_functional/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/fs/basic_functional/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/basic_functional/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/fs/basic_functional/overrides/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/suites/fs/basic_functional/overrides/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/suites/fs/basic_functional/overrides/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/basic_functional/overrides/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ - - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/suites/fs/basic_functional/tasks/client-recovery.yaml ceph-12.1.2/qa/suites/fs/basic_functional/tasks/client-recovery.yaml --- ceph-12.1.1/qa/suites/fs/basic_functional/tasks/client-recovery.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/basic_functional/tasks/client-recovery.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,7 +5,7 @@ ceph: log-whitelist: - evicting unresponsive client - - wrongly marked me down + - but it is still running - slow request tasks: diff -Nru ceph-12.1.1/qa/suites/fs/basic_functional/tasks/data-scan.yaml ceph-12.1.2/qa/suites/fs/basic_functional/tasks/data-scan.yaml --- ceph-12.1.1/qa/suites/fs/basic_functional/tasks/data-scan.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/basic_functional/tasks/data-scan.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -7,6 +7,7 @@ - error reading table object - error reading sessionmap - unmatched fragstat + - unmatched rstat - was unreadable, recreating it now - Scrub error on inode - Metadata damage detected diff -Nru ceph-12.1.1/qa/suites/fs/basic_workload/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/fs/basic_workload/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/fs/basic_workload/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/basic_workload/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + 
bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/fs/basic_workload/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/fs/basic_workload/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/fs/basic_workload/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/basic_workload/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/fs/basic_workload/overrides/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/suites/fs/basic_workload/overrides/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/suites/fs/basic_workload/overrides/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/basic_workload/overrides/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ - - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/suites/fs/multiclient/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/fs/multiclient/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/fs/multiclient/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/multiclient/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/fs/multiclient/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/fs/multiclient/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/fs/multiclient/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/multiclient/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/fs/multiclient/overrides/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/suites/fs/multiclient/overrides/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/suites/fs/multiclient/overrides/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/multiclient/overrides/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ 
- - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/suites/fs/multifs/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/fs/multifs/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/fs/multifs/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/multifs/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/fs/multifs/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/fs/multifs/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/fs/multifs/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/multifs/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/fs/multifs/overrides/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/suites/fs/multifs/overrides/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/suites/fs/multifs/overrides/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/multifs/overrides/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ - - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/suites/fs/permission/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/fs/permission/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/fs/permission/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/permission/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/fs/permission/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/fs/permission/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/fs/permission/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/permission/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd 
objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/fs/permission/overrides/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/suites/fs/permission/overrides/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/suites/fs/permission/overrides/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/permission/overrides/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ - - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/suites/fs/snaps/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/fs/snaps/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/fs/snaps/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/snaps/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/fs/snaps/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/fs/snaps/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/fs/snaps/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/snaps/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/fs/snaps/overrides/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/suites/fs/snaps/overrides/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/suites/fs/snaps/overrides/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/snaps/overrides/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ - - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/suites/fs/thrash/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/fs/thrash/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/fs/thrash/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/thrash/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on 
mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/fs/thrash/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/fs/thrash/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/fs/thrash/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/thrash/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/fs/thrash/overrides/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/suites/fs/thrash/overrides/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/suites/fs/thrash/overrides/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/thrash/overrides/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ - - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/suites/fs/traceless/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/fs/traceless/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/fs/traceless/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/traceless/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/fs/traceless/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/fs/traceless/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/fs/traceless/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/traceless/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/fs/traceless/overrides/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/suites/fs/traceless/overrides/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/suites/fs/traceless/overrides/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/traceless/overrides/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ - - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/suites/fs/verify/objectstore/bluestore.yaml 
ceph-12.1.2/qa/suites/fs/verify/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/fs/verify/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/verify/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/fs/verify/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/fs/verify/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/fs/verify/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/verify/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/fs/verify/overrides/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/suites/fs/verify/overrides/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/suites/fs/verify/overrides/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/fs/verify/overrides/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ - - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/suites/hadoop/basic/filestore-xfs.yaml ceph-12.1.2/qa/suites/hadoop/basic/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/hadoop/basic/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/hadoop/basic/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/kcephfs/cephfs/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/kcephfs/cephfs/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/kcephfs/cephfs/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/kcephfs/cephfs/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/kcephfs/cephfs/objectstore/filestore-xfs.yaml 
ceph-12.1.2/qa/suites/kcephfs/cephfs/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/kcephfs/cephfs/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/kcephfs/cephfs/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/kcephfs/mixed-clients/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/kcephfs/mixed-clients/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/kcephfs/mixed-clients/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/kcephfs/mixed-clients/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/kcephfs/mixed-clients/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/kcephfs/mixed-clients/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/kcephfs/mixed-clients/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/kcephfs/mixed-clients/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/kcephfs/recovery/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/kcephfs/recovery/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/kcephfs/recovery/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/kcephfs/recovery/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/kcephfs/recovery/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/kcephfs/recovery/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/kcephfs/recovery/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/kcephfs/recovery/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru 
ceph-12.1.1/qa/suites/kcephfs/recovery/tasks/client-recovery.yaml ceph-12.1.2/qa/suites/kcephfs/recovery/tasks/client-recovery.yaml --- ceph-12.1.1/qa/suites/kcephfs/recovery/tasks/client-recovery.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/kcephfs/recovery/tasks/client-recovery.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - slow request tasks: diff -Nru ceph-12.1.1/qa/suites/kcephfs/thrash/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/kcephfs/thrash/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/kcephfs/thrash/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/kcephfs/thrash/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/kcephfs/thrash/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/kcephfs/thrash/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/kcephfs/thrash/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/kcephfs/thrash/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/kcephfs/thrash/thrashers/default.yaml ceph-12.1.2/qa/suites/kcephfs/thrash/thrashers/default.yaml --- ceph-12.1.1/qa/suites/kcephfs/thrash/thrashers/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/kcephfs/thrash/thrashers/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,6 +2,6 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - thrashosds: diff -Nru ceph-12.1.1/qa/suites/krbd/thrash/ceph/ceph.yaml ceph-12.1.2/qa/suites/krbd/thrash/ceph/ceph.yaml --- ceph-12.1.1/qa/suites/krbd/thrash/ceph/ceph.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/krbd/thrash/ceph/ceph.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,3 @@ +tasks: +- install: +- ceph: diff -Nru ceph-12.1.1/qa/suites/krbd/thrash/thrashers/backoff.yaml ceph-12.1.2/qa/suites/krbd/thrash/thrashers/backoff.yaml --- ceph-12.1.1/qa/suites/krbd/thrash/thrashers/backoff.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/krbd/thrash/thrashers/backoff.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,14 @@ +overrides: + ceph: + conf: + osd: + osd backoff on peering: true + osd backoff on degraded: true + log-whitelist: + - wrongly marked me down + - objects unfound and apparently lost +tasks: +- thrashosds: + timeout: 1200 + chance_pgnum_grow: 1 + chance_pgpnum_fix: 1 diff -Nru ceph-12.1.1/qa/suites/krbd/thrash/thrashers/default.yaml ceph-12.1.2/qa/suites/krbd/thrash/thrashers/default.yaml --- 
ceph-12.1.1/qa/suites/krbd/thrash/thrashers/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/krbd/thrash/thrashers/default.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -tasks: -- install: -- ceph: - log-whitelist: - - wrongly marked me down - - objects unfound and apparently lost -- thrashosds: diff -Nru ceph-12.1.1/qa/suites/krbd/thrash/thrashers/mon-thrasher.yaml ceph-12.1.2/qa/suites/krbd/thrash/thrashers/mon-thrasher.yaml --- ceph-12.1.1/qa/suites/krbd/thrash/thrashers/mon-thrasher.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/krbd/thrash/thrashers/mon-thrasher.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,6 +1,4 @@ tasks: -- install: -- ceph: - mon_thrash: revive_delay: 20 thrash_delay: 1 diff -Nru ceph-12.1.1/qa/suites/krbd/thrash/thrashers/pggrow.yaml ceph-12.1.2/qa/suites/krbd/thrash/thrashers/pggrow.yaml --- ceph-12.1.1/qa/suites/krbd/thrash/thrashers/pggrow.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/krbd/thrash/thrashers/pggrow.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,10 @@ +overrides: + ceph: + log-whitelist: + - but it is still running + - objects unfound and apparently lost +tasks: +- thrashosds: + timeout: 1200 + chance_pgnum_grow: 2 + chance_pgpnum_fix: 1 diff -Nru ceph-12.1.1/qa/suites/krbd/thrash/thrashers/upmap.yaml ceph-12.1.2/qa/suites/krbd/thrash/thrashers/upmap.yaml --- ceph-12.1.1/qa/suites/krbd/thrash/thrashers/upmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/krbd/thrash/thrashers/upmap.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,16 @@ +overrides: + ceph: + crush_tunables: optimal + conf: + mon: + mon osd initial require min compat client: luminous + log-whitelist: + - wrongly marked me down + - objects unfound and apparently lost +tasks: +- thrashosds: + timeout: 1200 + chance_pgnum_grow: 1 + chance_pgpnum_fix: 1 + chance_thrash_pg_upmap: 3 + chance_thrash_pg_upmap_items: 3 diff -Nru ceph-12.1.1/qa/suites/krbd/thrash/thrashosds-health.yaml ceph-12.1.2/qa/suites/krbd/thrash/thrashosds-health.yaml --- ceph-12.1.1/qa/suites/krbd/thrash/thrashosds-health.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/krbd/thrash/thrashosds-health.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,13 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (POOL_ + - (CACHE_POOL_ + - (SMALLER_PGP_NUM) + - (OBJECT_ + - (REQUEST_SLOW) + - (TOO_FEW_PGS) diff -Nru ceph-12.1.1/qa/suites/krbd/thrash/workloads/rbd_fio.yaml ceph-12.1.2/qa/suites/krbd/thrash/workloads/rbd_fio.yaml --- ceph-12.1.1/qa/suites/krbd/thrash/workloads/rbd_fio.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/krbd/thrash/workloads/rbd_fio.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,9 +1,11 @@ tasks: - rbd_fio: client.0: - fio-io-size: 90% + fio-io-size: 100% formats: [2] features: [[layering,exclusive-lock]] - io-engine: sync + io-engine: libaio rw: randrw + bs: 1024 + io-depth: 256 runtime: 1200 diff -Nru ceph-12.1.1/qa/suites/krbd/thrash/workloads/rbd_workunit_suites_iozone.yaml.disabled ceph-12.1.2/qa/suites/krbd/thrash/workloads/rbd_workunit_suites_iozone.yaml.disabled --- ceph-12.1.1/qa/suites/krbd/thrash/workloads/rbd_workunit_suites_iozone.yaml.disabled 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/krbd/thrash/workloads/rbd_workunit_suites_iozone.yaml.disabled 1970-01-01 00:00:00.000000000 +0000 @@ -1,8 +0,0 @@ -tasks: -- rbd: - all: - image_size: 20480 -- workunit: - clients: - all: - - 
suites/iozone.sh diff -Nru ceph-12.1.1/qa/suites/krbd/unmap/filestore-xfs.yaml ceph-12.1.2/qa/suites/krbd/unmap/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/krbd/unmap/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/krbd/unmap/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/mixed-clients/basic/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/mixed-clients/basic/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/mixed-clients/basic/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/mixed-clients/basic/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/mixed-clients/basic/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/mixed-clients/basic/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/mixed-clients/basic/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/mixed-clients/basic/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/multimds/basic/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/multimds/basic/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/multimds/basic/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/multimds/basic/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/multimds/basic/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/multimds/basic/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/multimds/basic/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/multimds/basic/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru 
ceph-12.1.1/qa/suites/multimds/basic/overrides/basic/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/suites/multimds/basic/overrides/basic/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/suites/multimds/basic/overrides/basic/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/multimds/basic/overrides/basic/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ - - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/suites/multimds/thrash/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/multimds/thrash/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/multimds/thrash/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/multimds/thrash/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/multimds/thrash/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/multimds/thrash/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/multimds/thrash/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/multimds/thrash/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/multimds/thrash/overrides/thrash/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/suites/multimds/thrash/overrides/thrash/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/suites/multimds/thrash/overrides/thrash/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/multimds/thrash/overrides/thrash/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ - - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/suites/multimds/verify/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/multimds/verify/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/multimds/verify/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/multimds/verify/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: 
true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/multimds/verify/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/multimds/verify/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/multimds/verify/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/multimds/verify/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/multimds/verify/overrides/verify/whitelist_wrongly_marked_down.yaml ceph-12.1.2/qa/suites/multimds/verify/overrides/verify/whitelist_wrongly_marked_down.yaml --- ceph-12.1.1/qa/suites/multimds/verify/overrides/verify/whitelist_wrongly_marked_down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/multimds/verify/overrides/verify/whitelist_wrongly_marked_down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ - overall HEALTH_ - (OSD_DOWN) - (OSD_ - - wrongly marked me down + - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding conf: diff -Nru ceph-12.1.1/qa/suites/powercycle/osd/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/powercycle/osd/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/powercycle/osd/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/powercycle/osd/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/powercycle/osd/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/powercycle/osd/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/powercycle/osd/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/powercycle/osd/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/powercycle/osd/tasks/rados_api_tests.yaml ceph-12.1.2/qa/suites/powercycle/osd/tasks/rados_api_tests.yaml --- ceph-12.1.1/qa/suites/powercycle/osd/tasks/rados_api_tests.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/powercycle/osd/tasks/rados_api_tests.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,6 +2,7 @@ ceph: log-whitelist: - reached quota + - (POOL_APP_NOT_ENABLED) tasks: - ceph-fuse: - workunit: diff -Nru ceph-12.1.1/qa/suites/powercycle/osd/whitelist_health.yaml ceph-12.1.2/qa/suites/powercycle/osd/whitelist_health.yaml --- ceph-12.1.1/qa/suites/powercycle/osd/whitelist_health.yaml 1970-01-01 
00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/powercycle/osd/whitelist_health.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,4 @@ +overrides: + ceph: + log-whitelist: + - (MDS_TRIM) diff -Nru ceph-12.1.1/qa/suites/rados/basic/d-require-luminous/at-end.yaml ceph-12.1.2/qa/suites/rados/basic/d-require-luminous/at-end.yaml --- ceph-12.1.1/qa/suites/rados/basic/d-require-luminous/at-end.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/basic/d-require-luminous/at-end.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,6 +6,7 @@ - exec: mon.a: - ceph osd require-osd-release luminous + - ceph osd pool application enable base rados || true # make sure osds have latest map - rados -p rbd bench 5 write -b 4096 - ceph.healthy: @@ -20,5 +21,11 @@ conf: global: mon debug no require luminous: true + +# setting luminous triggers peering, which *might* trigger health alerts + log-whitelist: + - overall HEALTH_ + - (PG_AVAILABILITY) + - (PG_DEGRADED) thrashosds: chance_thrash_cluster_full: 0 diff -Nru ceph-12.1.1/qa/suites/rados/basic/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/basic/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/basic/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/basic/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/basic/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rados/basic/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rados/basic/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/basic/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rados/basic/tasks/rados_api_tests.yaml ceph-12.1.2/qa/suites/rados/basic/tasks/rados_api_tests.yaml --- ceph-12.1.1/qa/suites/rados/basic/tasks/rados_api_tests.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/basic/tasks/rados_api_tests.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,12 +2,13 @@ ceph: log-whitelist: - reached quota - - wrongly marked me down + - but it is still running - overall HEALTH_ - (POOL_FULL) - (SMALLER_PGP_NUM) - (CACHE_POOL_NO_HIT_SET) - (CACHE_POOL_NEAR_FULL) + - (POOL_APP_NOT_ENABLED) tasks: - workunit: clients: diff -Nru ceph-12.1.1/qa/suites/rados/basic/tasks/rados_python.yaml ceph-12.1.2/qa/suites/rados/basic/tasks/rados_python.yaml --- ceph-12.1.1/qa/suites/rados/basic/tasks/rados_python.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/basic/tasks/rados_python.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - overall HEALTH_ - (OSDMAP_FLAGS) - (PG_ 
diff -Nru ceph-12.1.1/qa/suites/rados/basic/tasks/rados_workunit_loadgen_big.yaml ceph-12.1.2/qa/suites/rados/basic/tasks/rados_workunit_loadgen_big.yaml --- ceph-12.1.1/qa/suites/rados/basic/tasks/rados_workunit_loadgen_big.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/basic/tasks/rados_workunit_loadgen_big.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,9 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running + - overall HEALTH_ + - (POOL_APP_NOT_ENABLED) tasks: - workunit: clients: diff -Nru ceph-12.1.1/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mix.yaml ceph-12.1.2/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mix.yaml --- ceph-12.1.1/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mix.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mix.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,9 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running + - overall HEALTH_ + - (POOL_APP_NOT_ENABLED) tasks: - workunit: clients: diff -Nru ceph-12.1.1/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mostlyread.yaml ceph-12.1.2/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mostlyread.yaml --- ceph-12.1.1/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mostlyread.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mostlyread.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,9 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running + - overall HEALTH_ + - (POOL_APP_NOT_ENABLED) tasks: - workunit: clients: diff -Nru ceph-12.1.1/qa/suites/rados/basic/tasks/repair_test.yaml ceph-12.1.2/qa/suites/rados/basic/tasks/repair_test.yaml --- ceph-12.1.1/qa/suites/rados/basic/tasks/repair_test.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/basic/tasks/repair_test.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -16,7 +16,7 @@ - scrub [0-9]+ errors - 'size 1 != size' - attr name mismatch - - Regular scrub request, losing deep-scrub details + - Regular scrub request, deep-scrub details will be lost - overall HEALTH_ - (OSDMAP_FLAGS) - (OSD_ diff -Nru ceph-12.1.1/qa/suites/rados/basic/tasks/rgw_snaps.yaml ceph-12.1.2/qa/suites/rados/basic/tasks/rgw_snaps.yaml --- ceph-12.1.1/qa/suites/rados/basic/tasks/rgw_snaps.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/basic/tasks/rgw_snaps.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -9,15 +9,21 @@ tasks: - rgw: client.0: +- ceph_manager.wait_for_pools: + kwargs: + pools: + - .rgw.buckets + - .rgw.root + - default.rgw.control + - default.rgw.meta + - default.rgw.log - thrash_pool_snaps: pools: - .rgw.buckets - .rgw.root - - .rgw.control - - .rgw - - .users.uid - - .users.email - - .users + - default.rgw.control + - default.rgw.meta + - default.rgw.log - s3readwrite: client.0: rgw_server: client.0 diff -Nru ceph-12.1.1/qa/suites/rados/basic-luminous/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/basic-luminous/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/basic-luminous/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/basic-luminous/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + 
bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/basic-luminous/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rados/basic-luminous/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rados/basic-luminous/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/basic-luminous/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rados/mgr/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/mgr/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/mgr/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/mgr/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/mgr/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rados/mgr/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rados/mgr/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/mgr/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rados/monthrash/d-require-luminous/at-end.yaml ceph-12.1.2/qa/suites/rados/monthrash/d-require-luminous/at-end.yaml --- ceph-12.1.1/qa/suites/rados/monthrash/d-require-luminous/at-end.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/monthrash/d-require-luminous/at-end.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,6 +6,7 @@ - exec: mon.a: - ceph osd require-osd-release luminous + - ceph osd pool application enable base rados || true # make sure osds have latest map - rados -p rbd bench 5 write -b 4096 - ceph.healthy: @@ -20,5 +21,11 @@ conf: global: mon debug no require luminous: true + +# setting luminous triggers peering, which *might* trigger health alerts + log-whitelist: + - overall HEALTH_ + - (PG_AVAILABILITY) + - (PG_DEGRADED) thrashosds: chance_thrash_cluster_full: 0 diff -Nru ceph-12.1.1/qa/suites/rados/monthrash/msgr-failures/mon-delay.yaml ceph-12.1.2/qa/suites/rados/monthrash/msgr-failures/mon-delay.yaml --- ceph-12.1.1/qa/suites/rados/monthrash/msgr-failures/mon-delay.yaml 2017-07-17 16:56:02.000000000 +0000 +++ 
ceph-12.1.2/qa/suites/rados/monthrash/msgr-failures/mon-delay.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -7,3 +7,5 @@ ms inject delay probability: .005 ms inject delay max: 1 ms inject internal delays: .002 + mgr: + debug monc: 10 diff -Nru ceph-12.1.1/qa/suites/rados/monthrash/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/monthrash/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/monthrash/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/monthrash/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/monthrash/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rados/monthrash/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rados/monthrash/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/monthrash/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rados/monthrash/workloads/pool-create-delete.yaml ceph-12.1.2/qa/suites/rados/monthrash/workloads/pool-create-delete.yaml --- ceph-12.1.1/qa/suites/rados/monthrash/workloads/pool-create-delete.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/monthrash/workloads/pool-create-delete.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,6 +2,8 @@ ceph: log-whitelist: - slow request + - overall HEALTH_ + - (POOL_APP_NOT_ENABLED) tasks: - exec: client.0: diff -Nru ceph-12.1.1/qa/suites/rados/monthrash/workloads/rados_5925.yaml ceph-12.1.2/qa/suites/rados/monthrash/workloads/rados_5925.yaml --- ceph-12.1.1/qa/suites/rados/monthrash/workloads/rados_5925.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/monthrash/workloads/rados_5925.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (POOL_APP_NOT_ENABLED) tasks: - exec: client.0: diff -Nru ceph-12.1.1/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml ceph-12.1.2/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml --- ceph-12.1.1/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -8,6 +8,7 @@ - (REQUEST_SLOW) - (MON_DOWN) - (PG_ + - (POOL_APP_NOT_ENABLED) conf: global: debug objecter: 20 diff -Nru ceph-12.1.1/qa/suites/rados/monthrash/workloads/rados_mon_workunits.yaml ceph-12.1.2/qa/suites/rados/monthrash/workloads/rados_mon_workunits.yaml --- ceph-12.1.1/qa/suites/rados/monthrash/workloads/rados_mon_workunits.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/monthrash/workloads/rados_mon_workunits.yaml 2017-08-01 
17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - overall HEALTH_ - (PG_ - (MON_DOWN) diff -Nru ceph-12.1.1/qa/suites/rados/multimon/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/multimon/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/multimon/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/multimon/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/multimon/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rados/multimon/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rados/multimon/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/multimon/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rados/rest/mgr-restful.yaml ceph-12.1.2/qa/suites/rados/rest/mgr-restful.yaml --- ceph-12.1.1/qa/suites/rados/rest/mgr-restful.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/rest/mgr-restful.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -8,8 +8,8 @@ - (MGR_DOWN) - exec: mon.a: - - ceph tell mgr.x restful create-key admin - - ceph tell mgr.x restful create-self-signed-cert + - ceph restful create-key admin + - ceph restful create-self-signed-cert - ceph.restart: [mgr.x] - workunit: clients: diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/admin-socket.yaml ceph-12.1.2/qa/suites/rados/singleton/all/admin-socket.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/admin-socket.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/admin-socket.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -17,6 +17,7 @@ git_version: help: config show: + config help: config set filestore_dump_file /tmp/foo: perf dump: perf schema: diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/dump-stuck.yaml ceph-12.1.2/qa/suites/rados/singleton/all/dump-stuck.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/dump-stuck.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/dump-stuck.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -11,7 +11,7 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - overall HEALTH_ - (OSDMAP_FLAGS) - (OSD_ diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/erasure-code-nonregression.yaml ceph-12.1.2/qa/suites/rados/singleton/all/erasure-code-nonregression.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/erasure-code-nonregression.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/erasure-code-nonregression.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,17 @@ +roles: +- - 
mon.a + - mgr.x + - osd.0 + - osd.1 + - osd.2 + - client.0 +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +tasks: +- install: +- workunit: + clients: + all: + - erasure-code/encode-decode-non-regression.sh diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/mon-thrasher.yaml ceph-12.1.2/qa/suites/rados/singleton/all/mon-thrasher.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/mon-thrasher.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/mon-thrasher.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,30 +0,0 @@ -roles: -- - mon.a - - mon.b - - mon.c - - mgr.x - - osd.0 - - osd.1 - - client.0 -openstack: - - volumes: # attached to each instance - count: 2 - size: 10 # GB -tasks: -- install: -- ceph: - log-whitelist: - - overall HEALTH_ - - (MON_DOWN) - - (PG_ -- mon_thrash: - revive_delay: 20 - thrash_delay: 1 -- workunit: - clients: - all: - - mon/workloadgen.sh - env: - LOADGEN_NUM_OSDS: "5" - VERBOSE: "1" - DURATION: "600" diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/osd-backfill.yaml ceph-12.1.2/qa/suites/rados/singleton/all/osd-backfill.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/osd-backfill.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/osd-backfill.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -14,7 +14,7 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - overall HEALTH_ - (OSDMAP_FLAGS) - (OSD_ diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml ceph-12.1.2/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -15,7 +15,7 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - overall HEALTH_ - (OSDMAP_FLAGS) - (OSD_ diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/osd-recovery.yaml ceph-12.1.2/qa/suites/rados/singleton/all/osd-recovery.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/osd-recovery.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/osd-recovery.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -14,7 +14,7 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - overall HEALTH_ - (OSDMAP_FLAGS) - (OSD_ diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/pg-removal-interruption.yaml ceph-12.1.2/qa/suites/rados/singleton/all/pg-removal-interruption.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/pg-removal-interruption.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/pg-removal-interruption.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -13,7 +13,7 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - slow request - overall HEALTH_ - (OSDMAP_FLAGS) @@ -22,6 +22,7 @@ - exec: client.0: - sudo ceph osd pool create foo 128 128 + - sudo ceph osd pool application enable foo rados - sleep 5 - sudo ceph tell osd.0 injectargs -- --osd-inject-failure-on-pg-removal - sudo ceph osd pool delete foo foo --yes-i-really-really-mean-it diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/radostool.yaml ceph-12.1.2/qa/suites/rados/singleton/all/radostool.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/radostool.yaml 2017-07-17 16:56:02.000000000 +0000 +++ 
ceph-12.1.2/qa/suites/rados/singleton/all/radostool.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -13,12 +13,13 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - had wrong client addr - had wrong cluster addr - reached quota - overall HEALTH_ - (POOL_FULL) + - (POOL_APP_NOT_ENABLED) - workunit: clients: all: diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/random-eio.yaml ceph-12.1.2/qa/suites/rados/singleton/all/random-eio.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/random-eio.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/random-eio.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -18,6 +18,8 @@ log-whitelist: - missing primary copy of - objects unfound and apparently lost + - overall HEALTH_ + - (POOL_APP_NOT_ENABLED) - full_sequential: - exec: client.0: diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/rest-api.yaml ceph-12.1.2/qa/suites/rados/singleton/all/rest-api.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/rest-api.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/rest-api.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -16,7 +16,7 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - had wrong client addr - overall HEALTH_ - (OSDMAP_FLAGS) diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/test_envlibrados_for_rocksdb.yaml ceph-12.1.2/qa/suites/rados/singleton/all/test_envlibrados_for_rocksdb.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/test_envlibrados_for_rocksdb.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/test_envlibrados_for_rocksdb.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -10,6 +10,9 @@ tasks: - install: - ceph: + log-whitelist: + - overall HEALTH_ + - (POOL_APP_NOT_ENABLED) - workunit: clients: all: diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/thrash_cache_writeback_proxy_none.yaml ceph-12.1.2/qa/suites/rados/singleton/all/thrash_cache_writeback_proxy_none.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/thrash_cache_writeback_proxy_none.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/thrash_cache_writeback_proxy_none.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -16,13 +16,14 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - slow request - overall HEALTH_ - (CACHE_POOL_ - exec: client.0: - sudo ceph osd pool create base 4 + - sudo ceph osd pool application enable base rados - sudo ceph osd pool create cache 4 - sudo ceph osd tier add base cache - sudo ceph osd tier cache-mode cache writeback diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/thrash-eio.yaml ceph-12.1.2/qa/suites/rados/singleton/all/thrash-eio.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/thrash-eio.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/thrash-eio.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -21,7 +21,7 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - missing primary copy of - objects unfound and apparently lost - overall HEALTH_ diff -Nru ceph-12.1.1/qa/suites/rados/singleton/all/thrash-rados/thrash-rados.yaml ceph-12.1.2/qa/suites/rados/singleton/all/thrash-rados/thrash-rados.yaml --- ceph-12.1.1/qa/suites/rados/singleton/all/thrash-rados/thrash-rados.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/all/thrash-rados/thrash-rados.yaml 2017-08-01 17:55:40.000000000 +0000 
@@ -16,7 +16,7 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - thrashosds: op_delay: 30 clean_interval: 120 diff -Nru ceph-12.1.1/qa/suites/rados/singleton/msgr-failures/many.yaml ceph-12.1.2/qa/suites/rados/singleton/msgr-failures/many.yaml --- ceph-12.1.1/qa/suites/rados/singleton/msgr-failures/many.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/msgr-failures/many.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -3,3 +3,5 @@ conf: global: ms inject socket failures: 500 + mgr: + debug monc: 10 diff -Nru ceph-12.1.1/qa/suites/rados/singleton/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/singleton/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/singleton/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/singleton/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rados/singleton/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rados/singleton/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rados/singleton-bluestore/all/cephtool.yaml ceph-12.1.2/qa/suites/rados/singleton-bluestore/all/cephtool.yaml --- ceph-12.1.1/qa/suites/rados/singleton-bluestore/all/cephtool.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton-bluestore/all/cephtool.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -15,7 +15,7 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - had wrong client addr - had wrong cluster addr - must scrub before tier agent can activate diff -Nru ceph-12.1.1/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + 
mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/singleton-nomsgr/all/export-after-evict.yaml ceph-12.1.2/qa/suites/rados/singleton-nomsgr/all/export-after-evict.yaml --- ceph-12.1.1/qa/suites/rados/singleton-nomsgr/all/export-after-evict.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton-nomsgr/all/export-after-evict.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -18,6 +18,7 @@ - exec: client.0: - ceph osd pool create base-pool 4 + - ceph osd pool application enable base-pool rados - ceph osd pool create cache-pool 4 - ceph osd tier add base-pool cache-pool - ceph osd tier cache-mode cache-pool writeback diff -Nru ceph-12.1.1/qa/suites/rados/singleton-nomsgr/all/full-tiering.yaml ceph-12.1.2/qa/suites/rados/singleton-nomsgr/all/full-tiering.yaml --- ceph-12.1.1/qa/suites/rados/singleton-nomsgr/all/full-tiering.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton-nomsgr/all/full-tiering.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,6 +20,7 @@ client.0: - ceph osd pool create ec-ca 1 1 - ceph osd pool create ec 1 1 erasure default + - ceph osd pool application enable ec rados - ceph osd tier add ec ec-ca - ceph osd tier cache-mode ec-ca readproxy - ceph osd tier set-overlay ec ec-ca diff -Nru ceph-12.1.1/qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml ceph-12.1.2/qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml --- ceph-12.1.1/qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -9,7 +9,7 @@ osd max object name len: 400 osd max object namespace len: 64 log-whitelist: - - wrongly marked me down + - but it is still running - overall HEALTH_ - (OSDMAP_FLAGS) - (OSD_ diff -Nru ceph-12.1.1/qa/suites/rados/singleton-nomsgr/all/multi-backfill-reject.yaml ceph-12.1.2/qa/suites/rados/singleton-nomsgr/all/multi-backfill-reject.yaml --- ceph-12.1.1/qa/suites/rados/singleton-nomsgr/all/multi-backfill-reject.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/singleton-nomsgr/all/multi-backfill-reject.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -26,6 +26,7 @@ - exec: client.0: - sudo ceph osd pool create foo 64 + - sudo ceph osd pool application enable foo rados - rados -p foo bench 60 write -b 1024 --no-cleanup - sudo ceph osd pool set foo size 3 - sudo ceph osd out 0 1 diff -Nru ceph-12.1.1/qa/suites/rados/standalone/crush.yaml ceph-12.1.2/qa/suites/rados/standalone/crush.yaml --- ceph-12.1.1/qa/suites/rados/standalone/crush.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/standalone/crush.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,18 @@ +roles: +- - mon.a + - mgr.x + - osd.0 + - osd.1 + - osd.2 + - client.0 +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +tasks: +- install: +- workunit: + basedir: qa/standalone + clients: + all: + - crush diff -Nru ceph-12.1.1/qa/suites/rados/standalone/erasure-code.yaml ceph-12.1.2/qa/suites/rados/standalone/erasure-code.yaml --- ceph-12.1.1/qa/suites/rados/standalone/erasure-code.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/standalone/erasure-code.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,18 @@ +roles: +- - mon.a + - mgr.x + - osd.0 + - osd.1 + - osd.2 + - client.0 +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +tasks: +- install: +- 
workunit: + basedir: qa/standalone + clients: + all: + - erasure-code diff -Nru ceph-12.1.1/qa/suites/rados/standalone/misc.yaml ceph-12.1.2/qa/suites/rados/standalone/misc.yaml --- ceph-12.1.1/qa/suites/rados/standalone/misc.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/standalone/misc.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,18 @@ +roles: +- - mon.a + - mgr.x + - osd.0 + - osd.1 + - osd.2 + - client.0 +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +tasks: +- install: +- workunit: + basedir: qa/standalone + clients: + all: + - misc diff -Nru ceph-12.1.1/qa/suites/rados/standalone/mon.yaml ceph-12.1.2/qa/suites/rados/standalone/mon.yaml --- ceph-12.1.1/qa/suites/rados/standalone/mon.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/standalone/mon.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,18 @@ +roles: +- - mon.a + - mgr.x + - osd.0 + - osd.1 + - osd.2 + - client.0 +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +tasks: +- install: +- workunit: + basedir: qa/standalone + clients: + all: + - mon diff -Nru ceph-12.1.1/qa/suites/rados/standalone/osd.yaml ceph-12.1.2/qa/suites/rados/standalone/osd.yaml --- ceph-12.1.1/qa/suites/rados/standalone/osd.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/standalone/osd.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,18 @@ +roles: +- - mon.a + - mgr.x + - osd.0 + - osd.1 + - osd.2 + - client.0 +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +tasks: +- install: +- workunit: + basedir: qa/standalone + clients: + all: + - osd diff -Nru ceph-12.1.1/qa/suites/rados/standalone/scrub.yaml ceph-12.1.2/qa/suites/rados/standalone/scrub.yaml --- ceph-12.1.1/qa/suites/rados/standalone/scrub.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/standalone/scrub.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,18 @@ +roles: +- - mon.a + - mgr.x + - osd.0 + - osd.1 + - osd.2 + - client.0 +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +tasks: +- install: +- workunit: + basedir: qa/standalone + clients: + all: + - scrub diff -Nru ceph-12.1.1/qa/suites/rados/thrash/d-require-luminous/at-end.yaml ceph-12.1.2/qa/suites/rados/thrash/d-require-luminous/at-end.yaml --- ceph-12.1.1/qa/suites/rados/thrash/d-require-luminous/at-end.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/d-require-luminous/at-end.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,6 +6,7 @@ - exec: mon.a: - ceph osd require-osd-release luminous + - ceph osd pool application enable base rados || true # make sure osds have latest map - rados -p rbd bench 5 write -b 4096 - ceph.healthy: @@ -20,5 +21,11 @@ conf: global: mon debug no require luminous: true + +# setting luminous triggers peering, which *might* trigger health alerts + log-whitelist: + - overall HEALTH_ + - (PG_AVAILABILITY) + - (PG_DEGRADED) thrashosds: chance_thrash_cluster_full: 0 diff -Nru ceph-12.1.1/qa/suites/rados/thrash/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/thrash/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/thrash/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs 
env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/thrash/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rados/thrash/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rados/thrash/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rados/thrash/thrashers/default.yaml ceph-12.1.2/qa/suites/rados/thrash/thrashers/default.yaml --- ceph-12.1.1/qa/suites/rados/thrash/thrashers/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/thrashers/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost conf: osd: diff -Nru ceph-12.1.1/qa/suites/rados/thrash/thrashers/mapgap.yaml ceph-12.1.2/qa/suites/rados/thrash/thrashers/mapgap.yaml --- ceph-12.1.1/qa/suites/rados/thrash/thrashers/mapgap.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/thrashers/mapgap.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - osd_map_cache_size conf: diff -Nru ceph-12.1.1/qa/suites/rados/thrash/thrashers/morepggrow.yaml ceph-12.1.2/qa/suites/rados/thrash/thrashers/morepggrow.yaml --- ceph-12.1.1/qa/suites/rados/thrash/thrashers/morepggrow.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/thrashers/morepggrow.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -10,7 +10,7 @@ filestore queue throttle max multiple: 10 osd max backfills: 9 log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost tasks: - thrashosds: diff -Nru ceph-12.1.1/qa/suites/rados/thrash/thrashers/pggrow.yaml ceph-12.1.2/qa/suites/rados/thrash/thrashers/pggrow.yaml --- ceph-12.1.1/qa/suites/rados/thrash/thrashers/pggrow.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/thrashers/pggrow.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost conf: osd: diff -Nru ceph-12.1.1/qa/suites/rados/thrash/workloads/cache-agent-big.yaml ceph-12.1.2/qa/suites/rados/thrash/workloads/cache-agent-big.yaml --- ceph-12.1.1/qa/suites/rados/thrash/workloads/cache-agent-big.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/workloads/cache-agent-big.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -7,6 +7,7 @@ client.0: - sudo ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2 - sudo ceph osd pool create base 4 4 erasure teuthologyprofile + - sudo ceph osd pool application enable base rados - sudo ceph osd 
pool set base min_size 2 - sudo ceph osd pool create cache 4 - sudo ceph osd tier add base cache diff -Nru ceph-12.1.1/qa/suites/rados/thrash/workloads/cache-agent-small.yaml ceph-12.1.2/qa/suites/rados/thrash/workloads/cache-agent-small.yaml --- ceph-12.1.1/qa/suites/rados/thrash/workloads/cache-agent-small.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/workloads/cache-agent-small.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -7,6 +7,7 @@ - exec: client.0: - sudo ceph osd pool create base 4 + - sudo ceph osd pool application enable base rados - sudo ceph osd pool create cache 4 - sudo ceph osd tier add base cache - sudo ceph osd tier cache-mode cache writeback diff -Nru ceph-12.1.1/qa/suites/rados/thrash/workloads/cache-pool-snaps-readproxy.yaml ceph-12.1.2/qa/suites/rados/thrash/workloads/cache-pool-snaps-readproxy.yaml --- ceph-12.1.1/qa/suites/rados/thrash/workloads/cache-pool-snaps-readproxy.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/workloads/cache-pool-snaps-readproxy.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,6 +6,7 @@ - exec: client.0: - sudo ceph osd pool create base 4 + - sudo ceph osd pool application enable base rados - sudo ceph osd pool create cache 4 - sudo ceph osd tier add base cache - sudo ceph osd tier cache-mode cache readproxy diff -Nru ceph-12.1.1/qa/suites/rados/thrash/workloads/cache-pool-snaps.yaml ceph-12.1.2/qa/suites/rados/thrash/workloads/cache-pool-snaps.yaml --- ceph-12.1.1/qa/suites/rados/thrash/workloads/cache-pool-snaps.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/workloads/cache-pool-snaps.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,6 +6,7 @@ - exec: client.0: - sudo ceph osd pool create base 4 + - sudo ceph osd pool application enable base rados - sudo ceph osd pool create cache 4 - sudo ceph osd tier add base cache - sudo ceph osd tier cache-mode cache writeback diff -Nru ceph-12.1.1/qa/suites/rados/thrash/workloads/cache-snaps.yaml ceph-12.1.2/qa/suites/rados/thrash/workloads/cache-snaps.yaml --- ceph-12.1.1/qa/suites/rados/thrash/workloads/cache-snaps.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/workloads/cache-snaps.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,6 +6,7 @@ - exec: client.0: - sudo ceph osd pool create base 4 + - sudo ceph osd pool application enable base rados - sudo ceph osd pool create cache 4 - sudo ceph osd tier add base cache - sudo ceph osd tier cache-mode cache writeback diff -Nru ceph-12.1.1/qa/suites/rados/thrash/workloads/cache.yaml ceph-12.1.2/qa/suites/rados/thrash/workloads/cache.yaml --- ceph-12.1.1/qa/suites/rados/thrash/workloads/cache.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/workloads/cache.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,6 +6,7 @@ - exec: client.0: - sudo ceph osd pool create base 4 + - sudo ceph osd pool application enable base rados - sudo ceph osd pool create cache 4 - sudo ceph osd tier add base cache - sudo ceph osd tier cache-mode cache writeback diff -Nru ceph-12.1.1/qa/suites/rados/thrash/workloads/rados_api_tests.yaml ceph-12.1.2/qa/suites/rados/thrash/workloads/rados_api_tests.yaml --- ceph-12.1.1/qa/suites/rados/thrash/workloads/rados_api_tests.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash/workloads/rados_api_tests.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,6 +2,7 @@ ceph: log-whitelist: - reached quota + - (POOL_APP_NOT_ENABLED) crush_tunables: hammer conf: client: diff 
-Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code/d-require-luminous/at-end.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code/d-require-luminous/at-end.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code/d-require-luminous/at-end.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code/d-require-luminous/at-end.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,6 +6,7 @@ - exec: mon.a: - ceph osd require-osd-release luminous + - ceph osd pool application enable base rados || true # make sure osds have latest map - rados -p rbd bench 5 write -b 4096 - ceph.healthy: @@ -20,5 +21,11 @@ conf: global: mon debug no require luminous: true + +# setting luminous triggers peering, which *might* trigger health alerts + log-whitelist: + - overall HEALTH_ + - (PG_AVAILABILITY) + - (PG_DEGRADED) thrashosds: chance_thrash_cluster_full: 0 diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code/thrashers/default.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code/thrashers/default.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code/thrashers/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code/thrashers/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost conf: osd: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code/thrashers/fastread.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code/thrashers/fastread.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code/thrashers/fastread.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code/thrashers/fastread.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost conf: mon: diff -Nru 
ceph-12.1.1/qa/suites/rados/thrash-erasure-code/thrashers/mapgap.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code/thrashers/mapgap.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code/thrashers/mapgap.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code/thrashers/mapgap.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -9,7 +9,7 @@ osd scrub max interval: 120 osd max backfills: 5 log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - osd_map_cache_size tasks: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code/thrashers/morepggrow.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code/thrashers/morepggrow.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code/thrashers/morepggrow.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code/thrashers/morepggrow.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,7 +6,7 @@ osd scrub max interval: 120 osd max backfills: 9 log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost tasks: - thrashosds: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code/thrashers/pggrow.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code/thrashers/pggrow.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code/thrashers/pggrow.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code/thrashers/pggrow.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost conf: osd: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/d-require-luminous/at-end.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/d-require-luminous/at-end.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/d-require-luminous/at-end.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/d-require-luminous/at-end.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,6 +6,7 @@ - exec: mon.a: - ceph osd require-osd-release luminous + - ceph osd pool application enable base rados || true # make sure osds have latest map - rados -p rbd bench 5 write -b 4096 - ceph.healthy: @@ -20,5 +21,11 @@ conf: global: mon debug no require luminous: true + +# setting luminous triggers peering, which *might* trigger health alerts + log-whitelist: + - overall HEALTH_ + - (PG_AVAILABILITY) + - (PG_DEGRADED) thrashosds: chance_thrash_cluster_full: 0 diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full 
ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/thrashers/default.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/thrashers/default.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/thrashers/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/thrashers/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - slow request conf: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/thrashers/fastread.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/thrashers/fastread.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/thrashers/fastread.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/thrashers/fastread.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost conf: mon: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/thrashers/mapgap.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/thrashers/mapgap.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/thrashers/mapgap.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/thrashers/mapgap.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - osd_map_cache_size conf: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/thrashers/morepggrow.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/thrashers/morepggrow.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/thrashers/morepggrow.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/thrashers/morepggrow.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,7 +6,7 @@ osd scrub max interval: 120 osd max backfills: 9 log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost tasks: - thrashosds: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/thrashers/pggrow.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/thrashers/pggrow.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-big/thrashers/pggrow.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-big/thrashers/pggrow.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost conf: osd: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/d-require-luminous/at-end.yaml 
ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/d-require-luminous/at-end.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/d-require-luminous/at-end.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/d-require-luminous/at-end.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,6 +6,7 @@ - exec: mon.a: - ceph osd require-osd-release luminous + - ceph osd pool application enable base rados || true # make sure osds have latest map - rados -p rbd bench 5 write -b 4096 - ceph.healthy: @@ -20,5 +21,11 @@ conf: global: mon debug no require luminous: true + +# setting luminous triggers peering, which *might* trigger health alerts + log-whitelist: + - overall HEALTH_ + - (PG_AVAILABILITY) + - (PG_DEGRADED) thrashosds: chance_thrash_cluster_full: 0 diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/thrashers/default.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/thrashers/default.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/thrashers/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/thrashers/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost conf: osd: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/thrashers/mapgap.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/thrashers/mapgap.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/thrashers/mapgap.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/thrashers/mapgap.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - osd_map_cache_size conf: diff -Nru 
ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/thrashers/morepggrow.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/thrashers/morepggrow.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/thrashers/morepggrow.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/thrashers/morepggrow.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -10,7 +10,7 @@ filestore queue throttle max multiple: 10 osd max backfills: 9 log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost tasks: - thrashosds: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/thrashers/pggrow.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/thrashers/pggrow.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-isa/thrashers/pggrow.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-isa/thrashers/pggrow.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost conf: osd: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/d-require-luminous/at-end.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/d-require-luminous/at-end.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/d-require-luminous/at-end.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/d-require-luminous/at-end.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,6 +6,7 @@ - exec: mon.a: - ceph osd require-osd-release luminous + - ceph osd pool application enable base rados || true # make sure osds have latest map - rados -p rbd bench 5 write -b 4096 - ceph.healthy: @@ -20,5 +21,11 @@ conf: global: mon debug no require luminous: true + +# setting luminous triggers peering, which *might* trigger health alerts + log-whitelist: + - overall HEALTH_ + - (PG_AVAILABILITY) + - (PG_DEGRADED) thrashosds: chance_thrash_cluster_full: 0 diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/default.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/default.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it 
is still running - objects unfound and apparently lost conf: osd: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/fastread.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/fastread.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/fastread.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/fastread.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost conf: mon: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/mapgap.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/mapgap.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/mapgap.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/mapgap.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -9,7 +9,7 @@ osd scrub max interval: 120 osd max backfills: 5 log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - osd_map_cache_size tasks: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/morepggrow.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/morepggrow.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/morepggrow.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/morepggrow.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,7 +6,7 @@ osd scrub max interval: 120 osd max backfills: 9 log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost tasks: - thrashosds: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/pggrow.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/pggrow.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/pggrow.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-overwrites/thrashers/pggrow.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost conf: osd: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-shec/d-require-luminous/at-end.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-shec/d-require-luminous/at-end.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-shec/d-require-luminous/at-end.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-shec/d-require-luminous/at-end.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,6 +6,7 @@ - exec: mon.a: - ceph osd require-osd-release luminous + - ceph osd pool application enable base rados || true # make sure osds have latest map - rados -p rbd bench 5 write -b 4096 - ceph.healthy: @@ -20,5 +21,11 @@ conf: global: mon debug no require luminous: true + +# setting luminous triggers peering, which *might* trigger health alerts + log-whitelist: + - overall HEALTH_ + - (PG_AVAILABILITY) + - (PG_DEGRADED) thrashosds: chance_thrash_cluster_full: 0 diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore.yaml --- 
ceph-12.1.1/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-shec/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-shec/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-shec/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-shec/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rados/thrash-erasure-code-shec/thrashers/default.yaml ceph-12.1.2/qa/suites/rados/thrash-erasure-code-shec/thrashers/default.yaml --- ceph-12.1.1/qa/suites/rados/thrash-erasure-code-shec/thrashers/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-erasure-code-shec/thrashers/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - slow request conf: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-luminous/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/thrash-luminous/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/thrash-luminous/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-luminous/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/thrash-luminous/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rados/thrash-luminous/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rados/thrash-luminous/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-luminous/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru 
ceph-12.1.1/qa/suites/rados/thrash-luminous/thrashers/default.yaml ceph-12.1.2/qa/suites/rados/thrash-luminous/thrashers/default.yaml --- ceph-12.1.1/qa/suites/rados/thrash-luminous/thrashers/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-luminous/thrashers/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost conf: osd: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-luminous/thrashers/mapgap.yaml ceph-12.1.2/qa/suites/rados/thrash-luminous/thrashers/mapgap.yaml --- ceph-12.1.1/qa/suites/rados/thrash-luminous/thrashers/mapgap.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-luminous/thrashers/mapgap.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - osd_map_cache_size conf: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-luminous/thrashers/morepggrow.yaml ceph-12.1.2/qa/suites/rados/thrash-luminous/thrashers/morepggrow.yaml --- ceph-12.1.1/qa/suites/rados/thrash-luminous/thrashers/morepggrow.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-luminous/thrashers/morepggrow.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -10,7 +10,7 @@ filestore queue throttle max multiple: 10 osd max backfills: 9 log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost tasks: - thrashosds: diff -Nru ceph-12.1.1/qa/suites/rados/thrash-luminous/thrashers/pggrow.yaml ceph-12.1.2/qa/suites/rados/thrash-luminous/thrashers/pggrow.yaml --- ceph-12.1.1/qa/suites/rados/thrash-luminous/thrashers/pggrow.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/thrash-luminous/thrashers/pggrow.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost conf: osd: diff -Nru ceph-12.1.1/qa/suites/rados/upgrade/jewel-x-singleton/2-partial-upgrade/firsthalf.yaml ceph-12.1.2/qa/suites/rados/upgrade/jewel-x-singleton/2-partial-upgrade/firsthalf.yaml --- ceph-12.1.1/qa/suites/rados/upgrade/jewel-x-singleton/2-partial-upgrade/firsthalf.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/upgrade/jewel-x-singleton/2-partial-upgrade/firsthalf.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -9,6 +9,8 @@ - print: "**** done install.upgrade osd.0" - ceph.restart: daemons: [mon.a, mon.b, mon.c] + wait-for-healthy: false - ceph.restart: daemons: [osd.0, osd.1, osd.2] + wait-for-healthy: false - print: "**** done ceph.restart 1st half" diff -Nru ceph-12.1.1/qa/suites/rados/upgrade/jewel-x-singleton/3-thrash/default.yaml ceph-12.1.2/qa/suites/rados/upgrade/jewel-x-singleton/3-thrash/default.yaml --- ceph-12.1.1/qa/suites/rados/upgrade/jewel-x-singleton/3-thrash/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/upgrade/jewel-x-singleton/3-thrash/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,7 +5,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - log bound mismatch tasks: @@ -20,4 +20,5 @@ chance_thrash_cluster_full: 0 chance_thrash_pg_upmap: 0 chance_thrash_pg_upmap_items: 0 + chance_force_recovery: 0 - print: "**** done thrashosds 3-thrash" diff -Nru 
ceph-12.1.1/qa/suites/rados/verify/d-require-luminous/at-end.yaml ceph-12.1.2/qa/suites/rados/verify/d-require-luminous/at-end.yaml --- ceph-12.1.1/qa/suites/rados/verify/d-require-luminous/at-end.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/verify/d-require-luminous/at-end.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -6,6 +6,7 @@ - exec: mon.a: - ceph osd require-osd-release luminous + - ceph osd pool application enable base rados || true # make sure osds have latest map - rados -p rbd bench 5 write -b 4096 - ceph.healthy: @@ -20,5 +21,11 @@ conf: global: mon debug no require luminous: true + +# setting luminous triggers peering, which *might* trigger health alerts + log-whitelist: + - overall HEALTH_ + - (PG_AVAILABILITY) + - (PG_DEGRADED) thrashosds: chance_thrash_cluster_full: 0 diff -Nru ceph-12.1.1/qa/suites/rados/verify/d-thrash/default/default.yaml ceph-12.1.2/qa/suites/rados/verify/d-thrash/default/default.yaml --- ceph-12.1.1/qa/suites/rados/verify/d-thrash/default/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/verify/d-thrash/default/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost tasks: - thrashosds: diff -Nru ceph-12.1.1/qa/suites/rados/verify/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rados/verify/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rados/verify/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/verify/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rados/verify/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rados/verify/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rados/verify/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/verify/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rados/verify/tasks/mon_recovery.yaml ceph-12.1.2/qa/suites/rados/verify/tasks/mon_recovery.yaml --- ceph-12.1.1/qa/suites/rados/verify/tasks/mon_recovery.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/verify/tasks/mon_recovery.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,5 +5,6 @@ - (MON_DOWN) - (OSDMAP_FLAGS) - (SMALLER_PGP_NUM) + - (POOL_APP_NOT_ENABLED) tasks: - mon_recovery: diff -Nru ceph-12.1.1/qa/suites/rados/verify/tasks/rados_api_tests.yaml ceph-12.1.2/qa/suites/rados/verify/tasks/rados_api_tests.yaml --- ceph-12.1.1/qa/suites/rados/verify/tasks/rados_api_tests.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rados/verify/tasks/rados_api_tests.yaml 
2017-08-01 17:55:40.000000000 +0000 @@ -8,6 +8,7 @@ - (SMALLER_PGP_NUM) - (REQUEST_SLOW) - (CACHE_POOL_NEAR_FULL) + - (POOL_APP_NOT_ENABLED) conf: client: debug ms: 1 diff -Nru ceph-12.1.1/qa/suites/rbd/basic/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rbd/basic/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rbd/basic/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/basic/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rbd/basic/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rbd/basic/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rbd/basic/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/basic/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rbd/cli/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rbd/cli/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rbd/cli/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/cli/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rbd/cli/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rbd/cli/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rbd/cli/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/cli/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rbd/cli/pool/ec-data-pool.yaml ceph-12.1.2/qa/suites/rbd/cli/pool/ec-data-pool.yaml --- ceph-12.1.1/qa/suites/rbd/cli/pool/ec-data-pool.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/cli/pool/ec-data-pool.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,6 +4,7 @@ - sudo ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2 - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile - sudo ceph osd pool set 
datapool allow_ec_overwrites true + - rbd pool init datapool overrides: thrashosds: diff -Nru ceph-12.1.1/qa/suites/rbd/cli/pool/replicated-data-pool.yaml ceph-12.1.2/qa/suites/rbd/cli/pool/replicated-data-pool.yaml --- ceph-12.1.1/qa/suites/rbd/cli/pool/replicated-data-pool.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/cli/pool/replicated-data-pool.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,6 +2,7 @@ - exec: client.0: - sudo ceph osd pool create datapool 4 + - rbd pool init datapool overrides: ceph: diff -Nru ceph-12.1.1/qa/suites/rbd/librbd/msgr-failures/few.yaml ceph-12.1.2/qa/suites/rbd/librbd/msgr-failures/few.yaml --- ceph-12.1.1/qa/suites/rbd/librbd/msgr-failures/few.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/librbd/msgr-failures/few.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,4 +4,4 @@ global: ms inject socket failures: 5000 log-whitelist: - - wrongly marked me down + - but it is still running diff -Nru ceph-12.1.1/qa/suites/rbd/librbd/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rbd/librbd/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rbd/librbd/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/librbd/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rbd/librbd/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rbd/librbd/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rbd/librbd/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/librbd/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rbd/librbd/pool/replicated-data-pool.yaml ceph-12.1.2/qa/suites/rbd/librbd/pool/replicated-data-pool.yaml --- ceph-12.1.1/qa/suites/rbd/librbd/pool/replicated-data-pool.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/librbd/pool/replicated-data-pool.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,6 +2,7 @@ - exec: client.0: - sudo ceph osd pool create datapool 4 + - rbd pool init datapool overrides: ceph: diff -Nru ceph-12.1.1/qa/suites/rbd/maintenance/filestore-xfs.yaml ceph-12.1.2/qa/suites/rbd/maintenance/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rbd/maintenance/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/maintenance/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rbd/maintenance/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rbd/maintenance/objectstore/bluestore.yaml --- 
ceph-12.1.1/qa/suites/rbd/maintenance/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/maintenance/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rbd/maintenance/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rbd/maintenance/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rbd/maintenance/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/maintenance/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rbd/mirror/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rbd/mirror/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rbd/mirror/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/mirror/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rbd/mirror/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rbd/mirror/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rbd/mirror/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/mirror/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rbd/mirror/rbd-mirror/one-per-cluster.yaml ceph-12.1.2/qa/suites/rbd/mirror/rbd-mirror/one-per-cluster.yaml --- ceph-12.1.1/qa/suites/rbd/mirror/rbd-mirror/one-per-cluster.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/mirror/rbd-mirror/one-per-cluster.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -8,6 +8,11 @@ admin socket: /var/run/ceph/$cluster-$name.asok pid file: /var/run/ceph/$cluster-$name.pid tasks: +- exec: + cluster1.client.mirror: + - ceph --cluster cluster1 auth caps client.mirror mon 'profile rbd' osd 'profile rbd' + cluster2.client.mirror: + - ceph --cluster cluster2 auth caps client.mirror mon 'profile rbd' osd 'profile rbd' - rbd-mirror: client: cluster1.client.mirror 
- rbd-mirror: diff -Nru ceph-12.1.1/qa/suites/rbd/mirror-ha/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rbd/mirror-ha/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rbd/mirror-ha/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/mirror-ha/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rbd/mirror-ha/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rbd/mirror-ha/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rbd/mirror-ha/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/mirror-ha/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rbd/mirror-ha/workloads/rbd-mirror-ha-workunit.yaml ceph-12.1.2/qa/suites/rbd/mirror-ha/workloads/rbd-mirror-ha-workunit.yaml --- ceph-12.1.1/qa/suites/rbd/mirror-ha/workloads/rbd-mirror-ha-workunit.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/mirror-ha/workloads/rbd-mirror-ha-workunit.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,6 +1,11 @@ meta: - desc: run the rbd_mirror_ha.sh workunit to test the rbd-mirror daemon tasks: +- exec: + cluster1.client.mirror: + - ceph --cluster cluster1 auth caps client.mirror mon 'profile rbd' osd 'profile rbd' + cluster2.client.mirror: + - ceph --cluster cluster2 auth caps client.mirror mon 'profile rbd' osd 'profile rbd' - workunit: clients: cluster1.client.mirror: [rbd/rbd_mirror_ha.sh] diff -Nru ceph-12.1.1/qa/suites/rbd/nbd/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rbd/nbd/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rbd/nbd/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/nbd/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rbd/nbd/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rbd/nbd/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rbd/nbd/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/nbd/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ 
osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rbd/nbd/thrashers/cache.yaml ceph-12.1.2/qa/suites/rbd/nbd/thrashers/cache.yaml --- ceph-12.1.1/qa/suites/rbd/nbd/thrashers/cache.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/nbd/thrashers/cache.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost tasks: - exec: diff -Nru ceph-12.1.1/qa/suites/rbd/nbd/thrashers/default.yaml ceph-12.1.2/qa/suites/rbd/nbd/thrashers/default.yaml --- ceph-12.1.1/qa/suites/rbd/nbd/thrashers/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/nbd/thrashers/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost tasks: - thrashosds: diff -Nru ceph-12.1.1/qa/suites/rbd/openstack/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rbd/openstack/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rbd/openstack/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/openstack/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rbd/openstack/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rbd/openstack/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rbd/openstack/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/openstack/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rbd/qemu/msgr-failures/few.yaml ceph-12.1.2/qa/suites/rbd/qemu/msgr-failures/few.yaml --- ceph-12.1.1/qa/suites/rbd/qemu/msgr-failures/few.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/qemu/msgr-failures/few.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,4 +4,4 @@ global: ms inject socket failures: 5000 log-whitelist: - - wrongly marked me down + - but it is still running diff -Nru ceph-12.1.1/qa/suites/rbd/qemu/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rbd/qemu/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rbd/qemu/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/qemu/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd 
objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rbd/qemu/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rbd/qemu/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rbd/qemu/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/qemu/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rbd/qemu/pool/ec-cache-pool.yaml ceph-12.1.2/qa/suites/rbd/qemu/pool/ec-cache-pool.yaml --- ceph-12.1.1/qa/suites/rbd/qemu/pool/ec-cache-pool.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/qemu/pool/ec-cache-pool.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -17,3 +17,4 @@ - sudo ceph osd pool set cache hit_set_count 8 - sudo ceph osd pool set cache hit_set_period 60 - sudo ceph osd pool set cache target_max_objects 250 + - rbd pool init rbd diff -Nru ceph-12.1.1/qa/suites/rbd/qemu/pool/ec-data-pool.yaml ceph-12.1.2/qa/suites/rbd/qemu/pool/ec-data-pool.yaml --- ceph-12.1.1/qa/suites/rbd/qemu/pool/ec-data-pool.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/qemu/pool/ec-data-pool.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,6 +4,7 @@ - sudo ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2 - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile - sudo ceph osd pool set datapool allow_ec_overwrites true + - rbd pool init datapool overrides: thrashosds: diff -Nru ceph-12.1.1/qa/suites/rbd/qemu/pool/replicated-data-pool.yaml ceph-12.1.2/qa/suites/rbd/qemu/pool/replicated-data-pool.yaml --- ceph-12.1.1/qa/suites/rbd/qemu/pool/replicated-data-pool.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/qemu/pool/replicated-data-pool.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,6 +2,7 @@ - exec: client.0: - sudo ceph osd pool create datapool 4 + - rbd pool init datapool overrides: ceph: diff -Nru ceph-12.1.1/qa/suites/rbd/singleton/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rbd/singleton/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rbd/singleton/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/singleton/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rbd/singleton/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rbd/singleton/objectstore/filestore-xfs.yaml --- 
ceph-12.1.1/qa/suites/rbd/singleton/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/singleton/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rbd/thrash/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rbd/thrash/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rbd/thrash/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/thrash/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rbd/thrash/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rbd/thrash/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rbd/thrash/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/thrash/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rbd/thrash/thrashers/cache.yaml ceph-12.1.2/qa/suites/rbd/thrash/thrashers/cache.yaml --- ceph-12.1.1/qa/suites/rbd/thrash/thrashers/cache.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/thrash/thrashers/cache.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost tasks: - exec: diff -Nru ceph-12.1.1/qa/suites/rbd/thrash/thrashers/default.yaml ceph-12.1.2/qa/suites/rbd/thrash/thrashers/default.yaml --- ceph-12.1.1/qa/suites/rbd/thrash/thrashers/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/thrash/thrashers/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ overrides: ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost tasks: - thrashosds: diff -Nru ceph-12.1.1/qa/suites/rbd/valgrind/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rbd/valgrind/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rbd/valgrind/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/valgrind/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full 
ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rbd/valgrind/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rbd/valgrind/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rbd/valgrind/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rbd/valgrind/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rest/basic/tasks/rest_test.yaml ceph-12.1.2/qa/suites/rest/basic/tasks/rest_test.yaml --- ceph-12.1.1/qa/suites/rest/basic/tasks/rest_test.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rest/basic/tasks/rest_test.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,7 +20,7 @@ - ceph: fs: xfs log-whitelist: - - wrongly marked me down + - but it is still running conf: client.rest0: debug ms: 1 diff -Nru ceph-12.1.1/qa/suites/rgw/multifs/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rgw/multifs/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rgw/multifs/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rgw/multifs/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rgw/multifs/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rgw/multifs/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rgw/multifs/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rgw/multifs/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rgw/singleton/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rgw/singleton/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rgw/singleton/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rgw/singleton/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + 
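For reference, the recurring "ceph-deploy:" hunks above and below all add the same objectstore override to the teuthology suite fragments. A minimal Python sketch of the merged structure for a bluestore run follows; the keys and values are copied from the hunks, while the dict layout (how teuthology merges the YAML overrides) is an assumption, not something this diff shows.

    # Sketch only: the bluestore ceph-deploy override expressed as a merged dict.
    # Key names and values come from the YAML hunks; the merge shape is assumed.
    bluestore_ceph_deploy_override = {
        "ceph-deploy": {
            "fs": "xfs",
            "bluestore": True,  # "bluestore: yes" in the YAML
            "conf": {
                "osd": {
                    "osd objectstore": "bluestore",
                    "bluestore block size": 96636764160,
                    "debug bluestore": 30,
                    "debug bdev": 20,
                    "debug bluefs": 20,
                    "debug rocksdb": 10,
                    "bluestore fsck on mount": True,
                    # full ratios lowered because a 100 GB test OSD fills quickly
                    "mon osd full ratio": 0.9,
                    "mon osd backfillfull_ratio": 0.85,
                    "mon osd nearfull ratio": 0.8,
                    "osd failsafe full ratio": 0.95,
                }
            },
        }
    }
    # The filestore fragments add the analogous block:
    # {"filestore": True, "conf": {"osd": {"osd objectstore": "filestore",
    #                                      "osd sloppy crc": True}}}
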
diff -Nru ceph-12.1.1/qa/suites/rgw/singleton/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rgw/singleton/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rgw/singleton/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rgw/singleton/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rgw/thrash/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rgw/thrash/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rgw/thrash/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rgw/thrash/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rgw/thrash/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rgw/thrash/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rgw/thrash/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rgw/thrash/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/rgw/verify/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/rgw/verify/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/rgw/verify/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rgw/verify/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/rgw/verify/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/rgw/verify/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/rgw/verify/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/rgw/verify/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/samba/objectstore/bluestore.yaml 
ceph-12.1.2/qa/suites/samba/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/samba/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/samba/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/samba/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/samba/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/samba/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/samba/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/smoke/1node/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/smoke/1node/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/smoke/1node/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/smoke/1node/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/smoke/basic/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/smoke/basic/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/smoke/basic/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/smoke/basic/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/smoke/basic/tasks/mon_thrash.yaml ceph-12.1.2/qa/suites/smoke/basic/tasks/mon_thrash.yaml --- ceph-12.1.1/qa/suites/smoke/basic/tasks/mon_thrash.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/smoke/basic/tasks/mon_thrash.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,6 +2,7 @@ ceph: log-whitelist: - reached quota + - (POOL_APP_NOT_ENABLED) conf: global: ms inject delay max: 1 diff -Nru ceph-12.1.1/qa/suites/smoke/basic/tasks/rados_api_tests.yaml ceph-12.1.2/qa/suites/smoke/basic/tasks/rados_api_tests.yaml --- ceph-12.1.1/qa/suites/smoke/basic/tasks/rados_api_tests.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/smoke/basic/tasks/rados_api_tests.yaml 2017-08-01 
17:55:40.000000000 +0000 @@ -4,8 +4,9 @@ fs: ext4 log-whitelist: - reached quota - - wrongly marked me down + - but it is still running - objects unfound and apparently lost + - (POOL_APP_NOT_ENABLED) - thrashosds: chance_pgnum_grow: 2 chance_pgpnum_fix: 1 diff -Nru ceph-12.1.1/qa/suites/smoke/basic/tasks/rados_bench.yaml ceph-12.1.2/qa/suites/smoke/basic/tasks/rados_bench.yaml --- ceph-12.1.1/qa/suites/smoke/basic/tasks/rados_bench.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/smoke/basic/tasks/rados_bench.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -12,7 +12,7 @@ - ceph: fs: xfs log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - thrashosds: chance_pgnum_grow: 2 diff -Nru ceph-12.1.1/qa/suites/smoke/basic/tasks/rados_cache_snaps.yaml ceph-12.1.2/qa/suites/smoke/basic/tasks/rados_cache_snaps.yaml --- ceph-12.1.1/qa/suites/smoke/basic/tasks/rados_cache_snaps.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/smoke/basic/tasks/rados_cache_snaps.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,7 +2,7 @@ - install: null - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - thrashosds: chance_pgnum_grow: 2 @@ -11,6 +11,7 @@ - exec: client.0: - sudo ceph osd pool create base 4 + - sudo ceph osd pool application enable base rados - sudo ceph osd pool create cache 4 - sudo ceph osd tier add base cache - sudo ceph osd tier cache-mode cache writeback diff -Nru ceph-12.1.1/qa/suites/smoke/basic/tasks/rados_ec_snaps.yaml ceph-12.1.2/qa/suites/smoke/basic/tasks/rados_ec_snaps.yaml --- ceph-12.1.1/qa/suites/smoke/basic/tasks/rados_ec_snaps.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/smoke/basic/tasks/rados_ec_snaps.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -3,7 +3,7 @@ - ceph: fs: xfs log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - thrashosds: chance_pgnum_grow: 3 diff -Nru ceph-12.1.1/qa/suites/smoke/basic/tasks/rados_python.yaml ceph-12.1.2/qa/suites/smoke/basic/tasks/rados_python.yaml --- ceph-12.1.1/qa/suites/smoke/basic/tasks/rados_python.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/smoke/basic/tasks/rados_python.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,7 +2,7 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - ceph-fuse: - workunit: clients: diff -Nru ceph-12.1.1/qa/suites/smoke/systemd/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/smoke/systemd/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/smoke/systemd/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/smoke/systemd/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/stress/thrash/thrashers/default.yaml ceph-12.1.2/qa/suites/stress/thrash/thrashers/default.yaml --- ceph-12.1.1/qa/suites/stress/thrash/thrashers/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/stress/thrash/thrashers/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,6 +2,6 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - thrashosds: diff -Nru 
ceph-12.1.1/qa/suites/stress/thrash/thrashers/fast.yaml ceph-12.1.2/qa/suites/stress/thrash/thrashers/fast.yaml --- ceph-12.1.1/qa/suites/stress/thrash/thrashers/fast.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/stress/thrash/thrashers/fast.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,7 +2,7 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - thrashosds: op_delay: 1 diff -Nru ceph-12.1.1/qa/suites/stress/thrash/thrashers/more-down.yaml ceph-12.1.2/qa/suites/stress/thrash/thrashers/more-down.yaml --- ceph-12.1.1/qa/suites/stress/thrash/thrashers/more-down.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/stress/thrash/thrashers/more-down.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,7 +2,7 @@ - install: - ceph: log-whitelist: - - wrongly marked me down + - but it is still running - objects unfound and apparently lost - thrashosds: chance_down: 50 diff -Nru ceph-12.1.1/qa/suites/upgrade/hammer-jewel-x/parallel/0-cluster/start.yaml ceph-12.1.2/qa/suites/upgrade/hammer-jewel-x/parallel/0-cluster/start.yaml --- ceph-12.1.1/qa/suites/upgrade/hammer-jewel-x/parallel/0-cluster/start.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/hammer-jewel-x/parallel/0-cluster/start.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,6 +5,7 @@ mon warn on legacy crush tunables: false mon debug unsafe allow tier with nonempty snaps: true log-whitelist: + - but it is still running - wrongly marked me down - reached quota roles: diff -Nru ceph-12.1.1/qa/suites/upgrade/hammer-jewel-x/stress-split/3-thrash/default.yaml ceph-12.1.2/qa/suites/upgrade/hammer-jewel-x/stress-split/3-thrash/default.yaml --- ceph-12.1.1/qa/suites/upgrade/hammer-jewel-x/stress-split/3-thrash/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/hammer-jewel-x/stress-split/3-thrash/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,6 +5,7 @@ overrides: ceph: log-whitelist: + - but it is still running - wrongly marked me down - objects unfound and apparently lost - log bound mismatch @@ -20,4 +21,5 @@ chance_thrash_pg_upmap: 0 chance_thrash_pg_upmap_items: 0 disable_objectstore_tool_tests: true + chance_force_recovery: 0 - print: "**** done thrashosds 3-thrash" diff -Nru ceph-12.1.1/qa/suites/upgrade/hammer-jewel-x/tiering/0-cluster/start.yaml ceph-12.1.2/qa/suites/upgrade/hammer-jewel-x/tiering/0-cluster/start.yaml --- ceph-12.1.1/qa/suites/upgrade/hammer-jewel-x/tiering/0-cluster/start.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/hammer-jewel-x/tiering/0-cluster/start.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -4,6 +4,7 @@ mon: mon warn on legacy crush tunables: false log-whitelist: + - but it is still running - wrongly marked me down roles: - - mon.a diff -Nru ceph-12.1.1/qa/suites/upgrade/hammer-jewel-x/tiering/2-setup-cache-tiering/0-create-base-tier/create-ec-pool.yaml ceph-12.1.2/qa/suites/upgrade/hammer-jewel-x/tiering/2-setup-cache-tiering/0-create-base-tier/create-ec-pool.yaml --- ceph-12.1.1/qa/suites/upgrade/hammer-jewel-x/tiering/2-setup-cache-tiering/0-create-base-tier/create-ec-pool.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/hammer-jewel-x/tiering/2-setup-cache-tiering/0-create-base-tier/create-ec-pool.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -3,3 +3,4 @@ client.0: - ceph osd erasure-code-profile set t-profile crush-failure-domain=osd k=2 m=1 - ceph osd pool create base-pool 4 4 erasure 
t-profile + - ceph osd pool application enable base-pool rados diff -Nru ceph-12.1.1/qa/suites/upgrade/hammer-jewel-x/tiering/2-setup-cache-tiering/0-create-base-tier/create-replicated-pool.yaml ceph-12.1.2/qa/suites/upgrade/hammer-jewel-x/tiering/2-setup-cache-tiering/0-create-base-tier/create-replicated-pool.yaml --- ceph-12.1.1/qa/suites/upgrade/hammer-jewel-x/tiering/2-setup-cache-tiering/0-create-base-tier/create-replicated-pool.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/hammer-jewel-x/tiering/2-setup-cache-tiering/0-create-base-tier/create-replicated-pool.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -2,3 +2,4 @@ - exec: client.0: - ceph osd pool create base-pool 4 + - ceph osd pool application enable base-pool rados diff -Nru ceph-12.1.1/qa/suites/upgrade/jewel-x/parallel/1-jewel-install/jewel.yaml ceph-12.1.2/qa/suites/upgrade/jewel-x/parallel/1-jewel-install/jewel.yaml --- ceph-12.1.1/qa/suites/upgrade/jewel-x/parallel/1-jewel-install/jewel.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/jewel-x/parallel/1-jewel-install/jewel.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -11,6 +11,10 @@ - ceph: skip_mgr_daemons: true add_osds_to_crush: true + log-whitelist: + - overall HEALTH_ + - (FS_ + - (MDS_ - print: "**** done ceph" - install.upgrade: mon.a: diff -Nru ceph-12.1.1/qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml ceph-12.1.2/qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml --- ceph-12.1.1/qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -80,6 +80,9 @@ - workload_x - upgrade-sequence_x - print: "**** done parallel -x branch" +- exec: + osd.0: + - ceph osd set-require-min-compat-client luminous # Run librados tests on the -x upgraded cluster - install.upgrade: client.1: @@ -221,6 +224,5 @@ - exec: osd.0: - ceph osd require-osd-release luminous - - ceph osd set-require-min-compat-client luminous - ceph.healthy: - print: "**** done ceph.restart all -x branch mds/osd/mon" diff -Nru ceph-12.1.1/qa/suites/upgrade/jewel-x/stress-split/3-thrash/default.yaml ceph-12.1.2/qa/suites/upgrade/jewel-x/stress-split/3-thrash/default.yaml --- ceph-12.1.1/qa/suites/upgrade/jewel-x/stress-split/3-thrash/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/jewel-x/stress-split/3-thrash/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,6 +5,7 @@ overrides: ceph: log-whitelist: + - but it is still running - wrongly marked me down - objects unfound and apparently lost - log bound mismatch @@ -20,4 +21,5 @@ chance_thrash_pg_upmap: 0 chance_thrash_pg_upmap_items: 0 disable_objectstore_tool_tests: true + chance_force_recovery: 0 - print: "**** done thrashosds 3-thrash" diff -Nru ceph-12.1.1/qa/suites/upgrade/jewel-x/stress-split/thrashosds-health.yaml ceph-12.1.2/qa/suites/upgrade/jewel-x/stress-split/thrashosds-health.yaml --- ceph-12.1.1/qa/suites/upgrade/jewel-x/stress-split/thrashosds-health.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/jewel-x/stress-split/thrashosds-health.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,13 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (POOL_ + - (CACHE_POOL_ + - (SMALLER_PGP_NUM) + - (OBJECT_ + - (REQUEST_SLOW) + - (TOO_FEW_PGS) diff -Nru 
ceph-12.1.1/qa/suites/upgrade/jewel-x/stress-split-erasure-code/3-thrash/default.yaml ceph-12.1.2/qa/suites/upgrade/jewel-x/stress-split-erasure-code/3-thrash/default.yaml --- ceph-12.1.1/qa/suites/upgrade/jewel-x/stress-split-erasure-code/3-thrash/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/jewel-x/stress-split-erasure-code/3-thrash/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,6 +5,7 @@ overrides: ceph: log-whitelist: + - but it is still running - wrongly marked me down - objects unfound and apparently lost - log bound mismatch @@ -20,4 +21,5 @@ chance_thrash_cluster_full: 0 chance_thrash_pg_upmap: 0 chance_thrash_pg_upmap_items: 0 + chance_force_recovery: 0 - print: "**** done thrashosds 3-thrash" diff -Nru ceph-12.1.1/qa/suites/upgrade/jewel-x/stress-split-erasure-code/thrashosds-health.yaml ceph-12.1.2/qa/suites/upgrade/jewel-x/stress-split-erasure-code/thrashosds-health.yaml --- ceph-12.1.1/qa/suites/upgrade/jewel-x/stress-split-erasure-code/thrashosds-health.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/jewel-x/stress-split-erasure-code/thrashosds-health.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,13 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (POOL_ + - (CACHE_POOL_ + - (SMALLER_PGP_NUM) + - (OBJECT_ + - (REQUEST_SLOW) + - (TOO_FEW_PGS) diff -Nru ceph-12.1.1/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml ceph-12.1.2/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml --- ceph-12.1.1/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -24,6 +24,8 @@ - scrub mismatch - ScrubResult - wrongly marked + - (POOL_APP_NOT_ENABLED) + - overall HEALTH_ conf: global: enable experimental unrecoverable data corrupting features: "*" diff -Nru ceph-12.1.1/qa/suites/upgrade/kraken-x/parallel/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/upgrade/kraken-x/parallel/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/upgrade/kraken-x/parallel/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/kraken-x/parallel/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/upgrade/kraken-x/parallel/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/upgrade/kraken-x/parallel/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/upgrade/kraken-x/parallel/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/kraken-x/parallel/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru 
ceph-12.1.1/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml ceph-12.1.2/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml --- ceph-12.1.1/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,6 +5,7 @@ overrides: ceph: log-whitelist: + - but it is still running - wrongly marked me down - objects unfound and apparently lost - log bound mismatch @@ -20,4 +21,5 @@ chance_thrash_pg_upmap: 0 chance_thrash_pg_upmap_items: 0 disable_objectstore_tool_tests: true + chance_force_recovery: 0 - print: "**** done thrashosds 3-thrash" diff -Nru ceph-12.1.1/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/upgrade/kraken-x/stress-split/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/upgrade/kraken-x/stress-split/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/upgrade/kraken-x/stress-split/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/kraken-x/stress-split/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/suites/upgrade/kraken-x/stress-split-erasure-code/3-thrash/default.yaml ceph-12.1.2/qa/suites/upgrade/kraken-x/stress-split-erasure-code/3-thrash/default.yaml --- ceph-12.1.1/qa/suites/upgrade/kraken-x/stress-split-erasure-code/3-thrash/default.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/kraken-x/stress-split-erasure-code/3-thrash/default.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,6 +5,7 @@ overrides: ceph: log-whitelist: + - but it is still running - wrongly marked me down - objects unfound and apparently lost - log bound mismatch @@ -20,4 +21,5 @@ chance_thrash_cluster_full: 0 chance_thrash_pg_upmap: 0 chance_thrash_pg_upmap_items: 0 + chance_force_recovery: 0 - print: "**** done thrashosds 3-thrash" diff -Nru ceph-12.1.1/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/bluestore.yaml ceph-12.1.2/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/bluestore.yaml --- ceph-12.1.1/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/bluestore.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/bluestore.yaml 2017-08-01 17:55:40.000000000 +0000 
@@ -20,3 +20,21 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 30 + debug bdev: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.1.1/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/filestore-xfs.yaml ceph-12.1.2/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/filestore-xfs.yaml --- ceph-12.1.1/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/filestore-xfs.yaml 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/filestore-xfs.yaml 2017-08-01 17:55:40.000000000 +0000 @@ -5,3 +5,11 @@ osd: osd objectstore: filestore osd sloppy crc: true + ceph-deploy: + fs: xfs + filestore: True + conf: + osd: + osd objectstore: filestore + osd sloppy crc: true + diff -Nru ceph-12.1.1/qa/tasks/ceph_deploy.py ceph-12.1.2/qa/tasks/ceph_deploy.py --- ceph-12.1.1/qa/tasks/ceph_deploy.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/ceph_deploy.py 2017-08-01 17:55:40.000000000 +0000 @@ -329,6 +329,11 @@ if estatus != 0: raise RuntimeError("ceph-deploy: Failed to zap osds") osd_create_cmd = './ceph-deploy osd create ' + # first check for filestore, default is bluestore with ceph-deploy + if config.get('filestore') is not None: + osd_create_cmd += '--filestore ' + else: + osd_create_cmd += '--bluestore ' if config.get('dmcrypt') is not None: osd_create_cmd += '--dmcrypt ' osd_create_cmd += ":".join(d) @@ -689,6 +694,10 @@ mon_initial_members: 1 only_mon: true keep_running: true + # either choose bluestore or filestore, default is bluestore + bluestore: True + # or + filestore: True tasks: - install: diff -Nru ceph-12.1.1/qa/tasks/cephfs/cephfs_test_case.py ceph-12.1.2/qa/tasks/cephfs/cephfs_test_case.py --- ceph-12.1.1/qa/tasks/cephfs/cephfs_test_case.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/cephfs/cephfs_test_case.py 2017-08-01 17:55:40.000000000 +0000 @@ -159,7 +159,7 @@ # Load an config settings of interest for setting in self.LOAD_SETTINGS: - setattr(self, setting, int(self.fs.mds_asok( + setattr(self, setting, float(self.fs.mds_asok( ['config', 'get', setting], self.mds_cluster.mds_ids[0] )[setting])) @@ -184,10 +184,10 @@ def auth_list(self): """ - Convenience wrapper on "ceph auth list" + Convenience wrapper on "ceph auth ls" """ return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd( - "auth", "list", "--format=json-pretty" + "auth", "ls", "--format=json-pretty" ))['auth_dump'] def assert_session_count(self, expected, ls_data=None, mds_id=None): diff -Nru ceph-12.1.1/qa/tasks/cephfs/fuse_mount.py ceph-12.1.2/qa/tasks/cephfs/fuse_mount.py --- ceph-12.1.1/qa/tasks/cephfs/fuse_mount.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/cephfs/fuse_mount.py 2017-08-01 17:55:40.000000000 +0000 @@ -222,6 +222,15 @@ except run.CommandFailedError: log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name)) + self.client_remote.run(args=[ + 'sudo', + run.Raw('PATH=/usr/sbin:$PATH'), + 'lsof', + run.Raw(';'), + 'ps', + 'auxf', + ]) + 
# abort the fuse mount, killing all hung processes if self._fuse_conn: self.run_python(dedent(""" diff -Nru ceph-12.1.1/qa/tasks/cephfs/kernel_mount.py ceph-12.1.2/qa/tasks/cephfs/kernel_mount.py --- ceph-12.1.1/qa/tasks/cephfs/kernel_mount.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/cephfs/kernel_mount.py 2017-08-01 17:55:40.000000000 +0000 @@ -98,7 +98,17 @@ if force: cmd.append('-f') - self.client_remote.run(args=cmd) + try: + self.client_remote.run(args=cmd) + except Exception as e: + self.client_remote.run(args=[ + 'sudo', + run.Raw('PATH=/usr/sbin:$PATH'), + 'lsof', + run.Raw(';'), + 'ps', 'auxf', + ]) + raise e rproc = self.client_remote.run( args=[ diff -Nru ceph-12.1.1/qa/tasks/cephfs/test_client_limits.py ceph-12.1.2/qa/tasks/cephfs/test_client_limits.py --- ceph-12.1.1/qa/tasks/cephfs/test_client_limits.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/cephfs/test_client_limits.py 2017-08-01 17:55:40.000000000 +0000 @@ -61,8 +61,8 @@ # MDS should not be happy about that, as the client is failing to comply # with the SESSION_RECALL messages it is being sent - mds_recall_state_timeout = int(self.fs.get_config("mds_recall_state_timeout")) - self.wait_for_health("MDS_HEALTH_CLIENT_RECALL", + mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout")) + self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout + 10) # We can also test that the MDS health warning for oversized @@ -122,8 +122,8 @@ # After mds_revoke_cap_timeout, we should see a health warning (extra lag from # MDS beacon period) - mds_revoke_cap_timeout = int(self.fs.get_config("mds_revoke_cap_timeout")) - self.wait_for_health("MDS_CLIENT_RECALL", mds_revoke_cap_timeout + 10) + mds_revoke_cap_timeout = float(self.fs.get_config("mds_revoke_cap_timeout")) + self.wait_for_health("MDS_CLIENT_LATE_RELEASE", mds_revoke_cap_timeout + 10) # Client B should still be stuck self.assertFalse(rproc.finished) diff -Nru ceph-12.1.1/qa/tasks/cephfs/test_failover.py ceph-12.1.2/qa/tasks/cephfs/test_failover.py --- ceph-12.1.1/qa/tasks/cephfs/test_failover.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/cephfs/test_failover.py 2017-08-01 17:55:40.000000000 +0000 @@ -32,7 +32,7 @@ # Kill the rank 0 daemon's physical process self.fs.mds_stop(original_active) - grace = int(self.fs.get_config("mds_beacon_grace", service_type="mon")) + grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) # Wait until the monitor promotes his replacement def promoted(): @@ -65,7 +65,7 @@ if not require_active: raise case.SkipTest("fuse_require_active_mds is not set") - grace = int(self.fs.get_config("mds_beacon_grace", service_type="mon")) + grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) # Check it's not laggy to begin with (original_active, ) = self.fs.get_active_names() @@ -102,7 +102,7 @@ # Need all my standbys up as well as the active daemons self.wait_for_daemon_start() - grace = int(self.fs.get_config("mds_beacon_grace", service_type="mon")) + grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) standbys = self.mds_cluster.get_standby_daemons() self.assertGreaterEqual(len(standbys), 1) diff -Nru ceph-12.1.1/qa/tasks/cephfs/test_strays.py ceph-12.1.2/qa/tasks/cephfs/test_strays.py --- ceph-12.1.1/qa/tasks/cephfs/test_strays.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/cephfs/test_strays.py 2017-08-01 17:55:40.000000000 +0000 @@ -785,18 +785,23 @@ # Remove the snapshot 
self.mount_a.run_shell(["rmdir", "snapdir/.snap/snap1"]) - self.mount_a.umount_wait() # Purging file_a doesn't happen until after we've flushed the journal, because # it is referenced by the snapshotted subdir, and the snapshot isn't really # gone until the journal references to it are gone self.fs.mds_asok(["flush", "journal"]) + # Wait for purging to complete, which requires the OSDMap to propagate to the OSDs. + # See also: http://tracker.ceph.com/issues/20072 + self.wait_until_true( + lambda: self.fs.data_objects_absent(file_a_ino, size_mb * 1024 * 1024), + timeout=60 + ) + # See that a purge happens now self._wait_for_counter("mds_cache", "strays_enqueued", 2) self._wait_for_counter("purge_queue", "pq_executed", 2) - self.assertTrue(self.fs.data_objects_absent(file_a_ino, size_mb * 1024 * 1024)) self.await_data_pool_empty() def test_fancy_layout(self): diff -Nru ceph-12.1.1/qa/tasks/ceph_manager.py ceph-12.1.2/qa/tasks/ceph_manager.py --- ceph-12.1.1/qa/tasks/ceph_manager.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/ceph_manager.py 2017-08-01 17:55:40.000000000 +0000 @@ -126,6 +126,7 @@ self.chance_thrash_pg_upmap = self.config.get('chance_thrash_pg_upmap', 1.0) self.chance_thrash_pg_upmap_items = self.config.get('chance_thrash_pg_upmap', 1.0) self.random_eio = self.config.get('random_eio') + self.chance_force_recovery = self.config.get('chance_force_recovery', 0.3) num_osds = self.in_osds + self.out_osds self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds @@ -603,6 +604,39 @@ except CommandFailedError: self.log('Failed to rm-pg-upmap-items, ignoring') + def force_recovery(self): + """ + Force recovery on some of PGs + """ + backfill = random.random() >= 0.5 + j = self.ceph_manager.get_pgids_to_force(backfill) + if j: + if backfill: + self.ceph_manager.raw_cluster_cmd('pg', 'force-backfill', *j) + else: + self.ceph_manager.raw_cluster_cmd('pg', 'force-recovery', *j) + + def cancel_force_recovery(self): + """ + Force recovery on some of PGs + """ + backfill = random.random() >= 0.5 + j = self.ceph_manager.get_pgids_to_cancel_force(backfill) + if j: + if backfill: + self.ceph_manager.raw_cluster_cmd('pg', 'cancel-force-backfill', *j) + else: + self.ceph_manager.raw_cluster_cmd('pg', 'cancel-force-recovery', *j) + + def force_cancel_recovery(self): + """ + Force or cancel forcing recovery + """ + if random.random() >= 0.4: + self.force_recovery() + else: + self.cancel_force_recovery() + def all_up(self): """ Make sure all osds are up and not out. 
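For orientation, the thrasher additions in the hunk above (force_recovery, cancel_force_recovery, force_cancel_recovery) drive the new "pg force-*" and "pg cancel-force-*" commands. A minimal sketch follows, assuming a CephManager-like object exposing raw_cluster_cmd() and pre-selected pgid lists (the real code derives them via get_pgids_to_force / get_pgids_to_cancel_force, added further down); this is an illustration, not the ceph_manager.py implementation.

    # Sketch: the force/cancel cycle the new thrasher action performs.
    import random

    def force_or_cancel_recovery(manager, candidate_pgids, forced_pgids):
        backfill = random.random() >= 0.5
        if random.random() >= 0.4:
            # force path, as in force_recovery() above
            if candidate_pgids:
                op = 'force-backfill' if backfill else 'force-recovery'
                manager.raw_cluster_cmd('pg', op, *candidate_pgids)
        else:
            # cancel path, as in cancel_force_recovery() above
            if forced_pgids:
                op = 'cancel-force-backfill' if backfill else 'cancel-force-recovery'
                manager.raw_cluster_cmd('pg', op, *forced_pgids)
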
@@ -772,7 +806,7 @@ while len(self.in_osds) < (self.minin + 1): self.in_osd() self.log("Waiting for recovery") - self.ceph_manager.wait_for_all_up( + self.ceph_manager.wait_for_all_osds_up( timeout=self.config.get('timeout') ) # now we wait 20s for the pg status to change, if it takes longer, @@ -841,6 +875,8 @@ actions.append((self.thrash_pg_upmap, self.chance_thrash_pg_upmap,)) if self.chance_thrash_pg_upmap_items > 0: actions.append((self.thrash_pg_upmap_items, self.chance_thrash_pg_upmap_items,)) + if self.chance_force_recovery > 0: + actions.append((self.force_cancel_recovery, self.chance_force_recovery)) for key in ['heartbeat_inject_failure', 'filestore_inject_stall']: for scenario in [ @@ -1074,6 +1110,7 @@ finally: if self.do_revive: self.manager.revive_osd(self.osd) + self.manager.wait_till_osd_is_up(self.osd, 300) class CephManager: @@ -1620,6 +1657,10 @@ 'osd', 'pool', 'set', pool_name, 'allow_ec_overwrites', 'true') + self.raw_cluster_cmd( + 'osd', 'pool', 'application', 'enable', + pool_name, 'rados', '--yes-i-really-mean-it', + run.Raw('||'), 'true') self.pools[pool_name] = pg_num time.sleep(1) @@ -1782,6 +1823,40 @@ j = json.loads('\n'.join(out.split('\n')[1:])) return j['pg_stats'] + def get_pgids_to_force(self, backfill): + """ + Return the randomized list of PGs that can have their recovery/backfill forced + """ + j = self.get_pg_stats(); + pgids = [] + if backfill: + wanted = ['degraded', 'backfilling', 'backfill_wait'] + else: + wanted = ['recovering', 'degraded', 'recovery_wait'] + for pg in j: + status = pg['state'].split('+') + for t in wanted: + if random.random() > 0.5 and not ('forced_backfill' in status or 'forced_recovery' in status) and t in status: + pgids.append(pg['pgid']) + break + return pgids + + def get_pgids_to_cancel_force(self, backfill): + """ + Return the randomized list of PGs whose recovery/backfill priority is forced + """ + j = self.get_pg_stats(); + pgids = [] + if backfill: + wanted = 'forced_backfill' + else: + wanted = 'forced_recovery' + for pg in j: + status = pg['state'].split('+') + if wanted in status and random.random() > 0.5: + pgids.append(pg['pgid']) + return pgids + def compile_pg_status(self): """ Return a histogram of pg state values @@ -1914,6 +1989,10 @@ """ return self.get_osd_dump_json()['osds'] + def get_mgr_dump(self): + out = self.raw_cluster_cmd('mgr', 'dump', '--format=json') + return json.loads(out) + def get_stuck_pgs(self, type_, threshold): """ :returns: stuck pg information from the cluster @@ -2072,7 +2151,7 @@ x = self.get_osd_dump() return (len(x) == sum([(y['up'] > 0) for y in x])) - def wait_for_all_up(self, timeout=None): + def wait_for_all_osds_up(self, timeout=None): """ When this exits, either the timeout has expired, or all osds are up. 
@@ -2082,10 +2161,45 @@ while not self.are_all_osds_up(): if timeout is not None: assert time.time() - start < timeout, \ - 'timeout expired in wait_for_all_up' + 'timeout expired in wait_for_all_osds_up' time.sleep(3) self.log("all up!") + def pool_exists(self, pool): + if pool in self.list_pools(): + return True + return False + + def wait_for_pool(self, pool, timeout=300): + """ + Wait for a pool to exist + """ + self.log('waiting for pool %s to exist' % pool) + start = time.time() + while not self.pool_exists(pool): + if timeout is not None: + assert time.time() - start < timeout, \ + 'timeout expired in wait_for_pool' + time.sleep(3) + + def wait_for_pools(self, pools): + for pool in pools: + self.wait_for_pool(pool) + + def is_mgr_available(self): + x = self.get_mgr_dump() + return x.get('available', False) + + def wait_for_mgr_available(self, timeout=None): + self.log("waiting for mgr available") + start = time.time() + while not self.is_mgr_available(): + if timeout is not None: + assert time.time() - start < timeout, \ + 'timeout expired in wait_for_mgr_available' + time.sleep(3) + self.log("mgr available!") + def wait_for_recovery(self, timeout=None): """ Check peering. When this exists, we have recovered. @@ -2443,5 +2557,8 @@ create_pool = utility_task("create_pool") remove_pool = utility_task("remove_pool") wait_for_clean = utility_task("wait_for_clean") +flush_all_pg_stats = utility_task("flush_all_pg_stats") set_pool_property = utility_task("set_pool_property") do_pg_scrub = utility_task("do_pg_scrub") +wait_for_pool = utility_task("wait_for_pool") +wait_for_pools = utility_task("wait_for_pools") diff -Nru ceph-12.1.1/qa/tasks/ceph.py ceph-12.1.2/qa/tasks/ceph.py --- ceph-12.1.1/qa/tasks/ceph.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/ceph.py 2017-08-01 17:55:40.000000000 +0000 @@ -1110,6 +1110,9 @@ if config.get('coverage') or config.get('valgrind') is not None: daemon_signal = 'term' + # create osds in order. (this only matters for pre-luminous, which might + # be hammer, which doesn't take an id_ argument to legacy 'osd create'). + osd_uuids = {} for remote, roles_for_host in daemons.remotes.iteritems(): is_type_ = teuthology.is_type(type_, cluster_name) for role in roles_for_host: @@ -1117,6 +1120,7 @@ continue _, _, id_ = teuthology.split_role(role) + if type_ == 'osd': datadir='/var/lib/ceph/osd/{cluster}-{id}'.format( cluster=cluster_name, id=id_) @@ -1125,29 +1129,40 @@ path=datadir + '/fsid', sudo=True, ).strip() - try: - remote.run( - args=[ - 'sudo', 'ceph', '--cluster', cluster_name, - 'osd', 'new', osd_uuid, id_, - ] - ) - except: - # fallback to pre-luminous (hammer or jewel) - remote.run( - args=[ - 'sudo', 'ceph', '--cluster', cluster_name, - 'osd', 'create', osd_uuid, - ] - ) - if config.get('add_osds_to_crush'): - remote.run( - args=[ - 'sudo', 'ceph', '--cluster', cluster_name, - 'osd', 'crush', 'create-or-move', 'osd.' + id_, - '1.0', 'host=localhost', 'root=default', - ] - ) + osd_uuids[id_] = osd_uuid + for osd_id in range(len(osd_uuids)): + id_ = str(osd_id) + osd_uuid = osd_uuids.get(id_) + try: + remote.run( + args=[ + 'sudo', 'ceph', '--cluster', cluster_name, + 'osd', 'new', osd_uuid, id_, + ] + ) + except: + # fallback to pre-luminous (hammer or jewel) + remote.run( + args=[ + 'sudo', 'ceph', '--cluster', cluster_name, + 'osd', 'create', osd_uuid, + ] + ) + if config.get('add_osds_to_crush'): + remote.run( + args=[ + 'sudo', 'ceph', '--cluster', cluster_name, + 'osd', 'crush', 'create-or-move', 'osd.' 
+ id_, + '1.0', 'host=localhost', 'root=default', + ] + ) + + for remote, roles_for_host in daemons.remotes.iteritems(): + is_type_ = teuthology.is_type(type_, cluster_name) + for role in roles_for_host: + if not is_type_(role): + continue + _, _, id_ = teuthology.split_role(role) run_cmd = [ 'sudo', @@ -1207,7 +1222,13 @@ """ config = config if isinstance(config, dict) else dict() cluster_name = config.get('cluster', 'ceph') - log.info('Waiting until ceph cluster %s is healthy...', cluster_name) + log.info('Waiting until %s daemons up and pgs clean...', cluster_name) + manager = ctx.managers[cluster_name] + try: + manager.wait_for_mgr_available() + except run.CommandFailedError: + log.info('ignoring mgr wait error, probably testing upgrade') + firstmon = teuthology.get_first_mon(ctx, config, cluster_name) (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() teuthology.wait_until_osds_up( @@ -1216,6 +1237,14 @@ remote=mon0_remote, ceph_cluster=cluster_name, ) + + try: + manager.flush_all_pg_stats() + except run.CommandFailedError: + log.info('ignoring flush pg stats error, probably testing upgrade') + manager.wait_for_clean() + + log.info('Waiting until ceph cluster %s is healthy...', cluster_name) teuthology.wait_until_healthy( ctx, remote=mon0_remote, diff -Nru ceph-12.1.1/qa/tasks/dump_stuck.py ceph-12.1.2/qa/tasks/dump_stuck.py --- ceph-12.1.1/qa/tasks/dump_stuck.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/dump_stuck.py 2017-08-01 17:55:40.000000000 +0000 @@ -100,6 +100,7 @@ log.info('stopping first osd') manager.kill_osd(0) manager.mark_down_osd(0) + manager.wait_for_active(timeout) log.info('waiting for all to be unclean') starttime = time.time() diff -Nru ceph-12.1.1/qa/tasks/mds_thrash.py ceph-12.1.2/qa/tasks/mds_thrash.py --- ceph-12.1.1/qa/tasks/mds_thrash.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/mds_thrash.py 2017-08-01 17:55:40.000000000 +0000 @@ -151,7 +151,7 @@ thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed during replay. Value should be between 0.0 and 1.0. - thrash_max_mds: [default: 0.0] likelihood that the max_mds of the mds + thrash_max_mds: [default: 0.05] likelihood that the max_mds of the mds cluster will be modified to a value [1, current) or (current, starting max_mds]. When reduced, randomly selected MDSs other than rank 0 will be deactivated to reach the new max_mds. Value should be between 0.0 and 1.0. 
@@ -216,7 +216,7 @@ self.stopping = Event() self.randomize = bool(self.config.get('randomize', True)) - self.thrash_max_mds = float(self.config.get('thrash_max_mds', 0.0)) + self.thrash_max_mds = float(self.config.get('thrash_max_mds', 0.05)) self.max_thrash = int(self.config.get('max_thrash', 1)) self.max_thrash_delay = float(self.config.get('thrash_delay', 120.0)) self.thrash_in_replay = float(self.config.get('thrash_in_replay', False)) diff -Nru ceph-12.1.1/qa/tasks/rbd_fio.py ceph-12.1.2/qa/tasks/rbd_fio.py --- ceph-12.1.1/qa/tasks/rbd_fio.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/rbd_fio.py 2017-08-01 17:55:40.000000000 +0000 @@ -8,13 +8,14 @@ import contextlib import json import logging +import os import StringIO -import re from teuthology.parallel import parallel from teuthology import misc as teuthology from tempfile import NamedTemporaryFile from teuthology.orchestra import run +from teuthology.packaging import install_package, remove_package log = logging.getLogger(__name__) @@ -64,6 +65,28 @@ yield +def get_ioengine_package_name(ioengine, remote): + system_type = teuthology.get_system_type(remote) + if ioengine == 'rbd': + return 'librbd1-devel' if system_type == 'rpm' else 'librbd-dev' + elif ioengine == 'libaio': + return 'libaio-devel' if system_type == 'rpm' else 'libaio-dev' + else: + return None + + +def run_rbd_map(remote, image, iodepth): + iodepth = max(iodepth, 128) # RBD_QUEUE_DEPTH_DEFAULT + out = StringIO.StringIO() + remote.run(args=['sudo', 'rbd', 'map', '-o', 'queue_depth={}'.format(iodepth), image], stdout=out) + dev = out.getvalue().rstrip('\n') + teuthology.sudo_write_file( + remote, + '/sys/block/{}/queue/nr_requests'.format(os.path.basename(dev)), + str(iodepth)) + return dev + + def run_fio(remote, config, rbd_test_dir): """ create fio config file with options based on above config @@ -82,7 +105,8 @@ fio_config.write('bs={bs}\n'.format(bs=bs)) else: fio_config.write('bs=4k\n') - fio_config.write('iodepth=2\n') + iodepth = config.get('io-depth', 2) + fio_config.write('iodepth={iod}\n'.format(iod=iodepth)) if config.get('fio-io-size'): size=config['fio-io-size'] fio_config.write('size={size}\n'.format(size=size)) @@ -102,7 +126,7 @@ formats=[1,2] features=[['layering'],['striping'],['exclusive-lock','object-map']] - fio_version='2.7' + fio_version='2.21' if config.get('formats'): formats=config['formats'] if config.get('features'): @@ -110,23 +134,19 @@ if config.get('fio-version'): fio_version=config['fio-version'] - fio_config.write('norandommap\n') - if ioengine == 'rbd': - fio_config.write('invalidate=0\n') - #handle package required for librbd engine + # handle package required for ioengine, if any sn=remote.shortname - system_type= teuthology.get_system_type(remote) - if system_type == 'rpm' and ioengine == 'rbd': - log.info("Installing librbd1 devel package on {sn}".format(sn=sn)) - remote.run(args=['sudo', 'yum' , 'install', 'librbd1-devel', '-y']) - elif ioengine == 'rbd': - log.info("Installing librbd devel package on {sn}".format(sn=sn)) - remote.run(args=['sudo', 'apt-get', '-y', - '--force-yes', - 'install', 'librbd-dev']) + ioengine_pkg = get_ioengine_package_name(ioengine, remote) + if ioengine_pkg: + install_package(ioengine_pkg, remote) + + fio_config.write('norandommap\n') if ioengine == 'rbd': fio_config.write('clientname=admin\n') fio_config.write('pool=rbd\n') + fio_config.write('invalidate=0\n') + elif ioengine == 'libaio': + fio_config.write('direct=1\n') for frmt in formats: for feature in features: 
log.info("Creating rbd images on {sn}".format(sn=sn)) @@ -142,18 +162,13 @@ remote.run(args=create_args) remote.run(args=['rbd', 'info', rbd_name]) if ioengine != 'rbd': - out=StringIO.StringIO() - remote.run(args=['sudo', 'rbd', 'map', rbd_name ],stdout=out) - dev=re.search(r'(/dev/rbd\d+)',out.getvalue()) - rbd_dev=dev.group(1) + rbd_dev = run_rbd_map(remote, rbd_name, iodepth) if config.get('test-clone-io'): log.info("Testing clones using fio") remote.run(args=['rbd', 'snap', 'create', rbd_snap_name]) remote.run(args=['rbd', 'snap', 'protect', rbd_snap_name]) remote.run(args=['rbd', 'clone', rbd_snap_name, rbd_clone_name]) - remote.run(args=['sudo', 'rbd', 'map', rbd_clone_name], stdout=out) - dev=re.search(r'(/dev/rbd\d+)',out.getvalue()) - rbd_clone_dev=dev.group(1) + rbd_clone_dev = run_rbd_map(remote, rbd_clone_name, iodepth) fio_config.write('[{rbd_dev}]\n'.format(rbd_dev=rbd_dev)) if config.get('rw'): rw=config['rw'] @@ -194,6 +209,7 @@ run.Raw(';'), 'wget' , fio , run.Raw(';'), run.Raw('tar -xvf fio*tar.gz'), run.Raw(';'), run.Raw('cd fio-fio*'), 'configure', run.Raw(';') ,'make']) remote.run(args=['ceph', '-s']) + remote.run(args=[run.Raw('{tdir}/fio-fio-{v}/fio --showcmd {f}'.format(tdir=rbd_test_dir,v=fio_version,f=fio_config.name))]) remote.run(args=['sudo', run.Raw('{tdir}/fio-fio-{v}/fio {f}'.format(tdir=rbd_test_dir,v=fio_version,f=fio_config.name))]) remote.run(args=['ceph', '-s']) finally: @@ -206,9 +222,5 @@ remote.run(args=['sudo', 'rbd', 'unmap', str(image['device'])]) log.info("Cleaning up fio install") remote.run(args=['rm','-rf', run.Raw(rbd_test_dir)]) - if system_type == 'rpm' and ioengine == 'rbd': - log.info("Uninstall librbd1 devel package on {sn}".format(sn=sn)) - remote.run(args=['sudo', 'yum' , 'remove', 'librbd1-devel', '-y']) - elif ioengine == 'rbd': - log.info("Uninstall librbd devel package on {sn}".format(sn=sn)) - remote.run(args=['sudo', 'apt-get', '-y', 'remove', 'librbd-dev']) + if ioengine_pkg: + remove_package(ioengine_pkg, remote) diff -Nru ceph-12.1.1/qa/tasks/reg11184.py ceph-12.1.2/qa/tasks/reg11184.py --- ceph-12.1.1/qa/tasks/reg11184.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/reg11184.py 2017-08-01 17:55:40.000000000 +0000 @@ -10,6 +10,7 @@ import time from cStringIO import StringIO +from teuthology.orchestra import run from teuthology import misc as teuthology from util.rados import rados import os @@ -55,6 +56,9 @@ # create 1 pg pool log.info('creating foo') manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1') + manager.raw_cluster_cmd( + 'osd', 'pool', 'application', 'enable', + 'foo', 'rados', run.Raw('||'), 'true') # Remove extra pool to simlify log output manager.raw_cluster_cmd('osd', 'pool', 'delete', 'rbd', 'rbd', '--yes-i-really-really-mean-it') @@ -189,12 +193,11 @@ manager.mark_down_osd(non_divergent[0]) # manager.mark_out_osd(non_divergent[0]) - # An empty collection for pg 2.0 needs to be cleaned up + # An empty collection for pg 2.0 might need to be cleaned up cmd = ((prefix + "--op remove --pgid 2.0"). format(id=non_divergent[0])) proc = exp_remote.run(args=cmd, wait=True, check_status=False, stdout=StringIO()) - assert proc.exitstatus == 0 cmd = ((prefix + "--op import --file {file}"). 
format(id=non_divergent[0], file=expfile)) diff -Nru ceph-12.1.1/qa/tasks/repair_test.py ceph-12.1.2/qa/tasks/repair_test.py --- ceph-12.1.1/qa/tasks/repair_test.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/repair_test.py 2017-08-01 17:55:40.000000000 +0000 @@ -275,7 +275,7 @@ - 'scrub [0-9]+ errors' - 'size 1 != size' - 'attr name mismatch' - - 'Regular scrub request, losing deep-scrub details' + - 'Regular scrub request, deep-scrub details will be lost' conf: osd: filestore debug inject read err: true @@ -288,7 +288,7 @@ 'repair_test task only accepts a dict for config' manager = ctx.managers['ceph'] - manager.wait_for_all_up() + manager.wait_for_all_osds_up() manager.raw_cluster_cmd('osd', 'set', 'noscrub') manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub') diff -Nru ceph-12.1.1/qa/tasks/rgw_multi/tests.py ceph-12.1.2/qa/tasks/rgw_multi/tests.py --- ceph-12.1.1/qa/tasks/rgw_multi/tests.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/rgw_multi/tests.py 2017-08-01 17:55:40.000000000 +0000 @@ -379,6 +379,14 @@ assert False, 'finished bucket checkpoint for target_zone=%s source_zone=%s bucket=%s' % \ (target_zone.name, source_zone.name, bucket_name) +def zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name): + for source_conn in zonegroup_conns.zones: + for target_conn in zonegroup_conns.zones: + if source_conn.zone == target_conn.zone: + continue + zone_bucket_checkpoint(target_conn.zone, source_conn.zone, bucket_name) + target_conn.check_bucket_eq(source_conn, bucket_name) + def set_master_zone(zone): zone.modify(zone.cluster, ['--master']) zonegroup = zone.zonegroup @@ -387,6 +395,26 @@ log.info('Set master zone=%s, waiting %ds for reconfiguration..', zone.name, config.reconfigure_delay) time.sleep(config.reconfigure_delay) +def enable_bucket_sync(zone, bucket_name): + cmd = ['bucket', 'sync', 'enable', '--bucket', bucket_name] + zone.zone_args() + zone.cluster.admin(cmd) + +def disable_bucket_sync(zone, bucket_name): + cmd = ['bucket', 'sync', 'disable', '--bucket', bucket_name] + zone.zone_args() + zone.cluster.admin(cmd) + +def check_buckets_sync_status_obj_not_exist(zone, buckets): + for _ in range(config.checkpoint_retries): + cmd = ['log', 'list'] + zone.zone_arg() + log_list, ret = zone.cluster.admin(cmd, check_retcode=False, read_only=True) + for bucket in buckets: + if log_list.find(':'+bucket+":") >= 0: + break + else: + return + time.sleep(config.checkpoint_delay) + assert False + def gen_bucket_name(): global num_buckets @@ -825,3 +853,80 @@ for _, bucket in zone_bucket: bucket.set_policy(policy) assert(bucket.get_policy() == policy) + +def test_bucket_sync_disable(): + zonegroup = realm.master_zonegroup() + zonegroup_conns = ZonegroupConns(zonegroup) + buckets, zone_bucket = create_bucket_per_zone(zonegroup_conns) + + for bucket_name in buckets: + disable_bucket_sync(realm.meta_master_zone(), bucket_name) + + for zone in zonegroup.zones: + check_buckets_sync_status_obj_not_exist(zone, buckets) + +def test_bucket_sync_enable_right_after_disable(): + zonegroup = realm.master_zonegroup() + zonegroup_conns = ZonegroupConns(zonegroup) + buckets, zone_bucket = create_bucket_per_zone(zonegroup_conns) + + objnames = ['obj1', 'obj2', 'obj3', 'obj4'] + content = 'asdasd' + + for zone, bucket in zone_bucket: + for objname in objnames: + k = new_key(zone, bucket.name, objname) + k.set_contents_from_string(content) + + for bucket_name in buckets: + zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name) + + for bucket_name in buckets: + 
disable_bucket_sync(realm.meta_master_zone(), bucket_name) + enable_bucket_sync(realm.meta_master_zone(), bucket_name) + + objnames_2 = ['obj5', 'obj6', 'obj7', 'obj8'] + + for zone, bucket in zone_bucket: + for objname in objnames_2: + k = new_key(zone, bucket.name, objname) + k.set_contents_from_string(content) + + for bucket_name in buckets: + zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name) + +def test_bucket_sync_disable_enable(): + zonegroup = realm.master_zonegroup() + zonegroup_conns = ZonegroupConns(zonegroup) + buckets, zone_bucket = create_bucket_per_zone(zonegroup_conns) + + objnames = [ 'obj1', 'obj2', 'obj3', 'obj4' ] + content = 'asdasd' + + for zone, bucket in zone_bucket: + for objname in objnames: + k = new_key(zone, bucket.name, objname) + k.set_contents_from_string(content) + + zonegroup_meta_checkpoint(zonegroup) + + for bucket_name in buckets: + zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name) + + for bucket_name in buckets: + disable_bucket_sync(realm.meta_master_zone(), bucket_name) + + zonegroup_meta_checkpoint(zonegroup) + + objnames_2 = [ 'obj5', 'obj6', 'obj7', 'obj8' ] + + for zone, bucket in zone_bucket: + for objname in objnames_2: + k = new_key(zone, bucket.name, objname) + k.set_contents_from_string(content) + + for bucket_name in buckets: + enable_bucket_sync(realm.meta_master_zone(), bucket_name) + + for bucket_name in buckets: + zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name) diff -Nru ceph-12.1.1/qa/tasks/thrashosds.py ceph-12.1.2/qa/tasks/thrashosds.py --- ceph-12.1.1/qa/tasks/thrashosds.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/thrashosds.py 2017-08-01 17:55:40.000000000 +0000 @@ -199,6 +199,6 @@ finally: log.info('joining thrashosds') thrash_proc.do_join() - cluster_manager.wait_for_all_up() + cluster_manager.wait_for_all_osds_up() cluster_manager.flush_all_pg_stats() cluster_manager.wait_for_recovery(config.get('timeout', 360)) diff -Nru ceph-12.1.1/qa/tasks/vstart_runner.py ceph-12.1.2/qa/tasks/vstart_runner.py --- ceph-12.1.1/qa/tasks/vstart_runner.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/vstart_runner.py 2017-08-01 17:55:40.000000000 +0000 @@ -910,7 +910,7 @@ # Wait for OSD to come up so that subsequent injectargs etc will # definitely succeed - LocalCephCluster(LocalContext()).mon_manager.wait_for_all_up(timeout=30) + LocalCephCluster(LocalContext()).mon_manager.wait_for_all_osds_up(timeout=30) # List of client mounts, sufficient to run the selected tests clients = [i.__str__() for i in range(0, max_required_clients)] diff -Nru ceph-12.1.1/qa/tasks/workunit.py ceph-12.1.2/qa/tasks/workunit.py --- ceph-12.1.1/qa/tasks/workunit.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/tasks/workunit.py 2017-08-01 17:55:40.000000000 +0000 @@ -122,6 +122,17 @@ backup.client.0: [foo] client.1: [bar] # cluster is implicitly 'ceph' + You can also specify an alternative top-level dir to 'qa/workunits', like + 'qa/standalone', with:: + + tasks: + - install: + - workunit: + basedir: qa/standalone + clients: + client.0: + - test-ceph-helpers.sh + :param ctx: Context :param config: Configuration """ @@ -174,7 +185,9 @@ for role, tests in clients.iteritems(): if role != "all": p.spawn(_run_tests, ctx, refspec, role, tests, - config.get('env'), timeout=timeout) + config.get('env'), + basedir=config.get('basedir','qa/workunits'), + timeout=timeout) # Clean up dirs from any non-all workunits for role, created in created_mountpoint.items(): @@ -184,6 +197,7 @@ if 'all' in clients: all_tasks 
= clients["all"] _spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'), + config.get('basedir', 'qa/workunits'), config.get('subdir'), timeout=timeout) @@ -312,7 +326,7 @@ return created_mountpoint -def _spawn_on_all_clients(ctx, refspec, tests, env, subdir, timeout=None): +def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None): """ Make a scratch directory for each client in the cluster, and then for each test spawn _run_tests() for each role. @@ -331,7 +345,9 @@ for unit in tests: with parallel() as p: for role, remote in client_remotes.items(): - p.spawn(_run_tests, ctx, refspec, role, [unit], env, subdir, + p.spawn(_run_tests, ctx, refspec, role, [unit], env, + basedir, + subdir, timeout=timeout) # cleanup the generated client directories @@ -339,7 +355,8 @@ _delete_dir(ctx, role, created_mountpoint[role]) -def _run_tests(ctx, refspec, role, tests, env, subdir=None, timeout=None): +def _run_tests(ctx, refspec, role, tests, env, basedir, + subdir=None, timeout=None): """ Run the individual test. Create a scratch directory and then extract the workunits from git. Make the executables, and then run the tests. @@ -369,7 +386,8 @@ else: scratch_tmp = os.path.join(mnt, subdir) clonedir = '{tdir}/clone.{role}'.format(tdir=testdir, role=role) - srcdir = '{cdir}/qa/workunits'.format(cdir=clonedir) + srcdir = '{cdir}/{basedir}'.format(cdir=clonedir, + basedir=basedir) git_url = teuth_config.get_ceph_qa_suite_git_url() # if we are running an upgrade test, and ceph-ci does not have branches like @@ -430,6 +448,7 @@ run.Raw('CEPH_ID="{id}"'.format(id=id_)), run.Raw('PATH=$PATH:/usr/sbin'), run.Raw('CEPH_BASE={dir}'.format(dir=clonedir)), + run.Raw('CEPH_ROOT={dir}'.format(dir=clonedir)), ] if env is not None: for var, val in env.iteritems(): diff -Nru ceph-12.1.1/qa/workunits/ceph-disk/ceph-disk.sh ceph-12.1.2/qa/workunits/ceph-disk/ceph-disk.sh --- ceph-12.1.1/qa/workunits/ceph-disk/ceph-disk.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/ceph-disk/ceph-disk.sh 2017-08-01 17:55:40.000000000 +0000 @@ -1,8 +1,8 @@ #!/bin/bash -if [ -f $(dirname $0)/../ceph-helpers-root.sh ]; then - source $(dirname $0)/../ceph-helpers-root.sh +if [ -f $(dirname $0)/../../standalone/ceph-helpers-root.sh ]; then + source $(dirname $0)/../../standalone/ceph-helpers-root.sh else - echo "$(dirname $0)/../ceph-helpers-root.sh does not exist." + echo "$(dirname $0)/../../standalone/ceph-helpers-root.sh does not exist." 
exit 1 fi diff -Nru ceph-12.1.1/qa/workunits/ceph-disk/ceph-disk-test.py ceph-12.1.2/qa/workunits/ceph-disk/ceph-disk-test.py --- ceph-12.1.1/qa/workunits/ceph-disk/ceph-disk-test.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/ceph-disk/ceph-disk-test.py 2017-08-01 17:55:40.000000000 +0000 @@ -22,7 +22,7 @@ # ln -sf /home/ubuntu/ceph/systemd/ceph-disk@.service /usr/lib/systemd/system/ceph-disk@.service # ceph-disk.conf will be silently ignored if it is a symbolic link or a hard link /var/log/upstart for logs # cp /home/ubuntu/ceph/src/upstart/ceph-disk.conf /etc/init/ceph-disk.conf -# id=3 ; ceph-disk deactivate --deactivate-by-id $id ; ceph-disk destroy --zap --destroy-by-id $id +# id=3 ; ceph-disk deactivate --deactivate-by-id $id ; ceph-disk destroy --purge --zap --destroy-by-id $id # py.test -s -v -k test_activate_dmcrypt_luks ceph-disk-test.py # # CentOS 7 @@ -173,7 +173,7 @@ self.sh(""" set -xe ceph-disk --verbose deactivate --deactivate-by-id {id} - ceph-disk --verbose destroy --destroy-by-id {id} --zap + ceph-disk --verbose destroy --purge --destroy-by-id {id} --zap """.format(id=id)) def deactivate_osd(self, uuid): @@ -299,7 +299,7 @@ assert partition['state'] == 'active' c.sh("ceph-disk --verbose deactivate " + partition['path']) c.wait_for_osd_down(osd_uuid) - c.sh("ceph-disk --verbose destroy " + partition['path'] + " --zap") + c.sh("ceph-disk --verbose destroy --purge " + partition['path'] + " --zap") def test_deactivate_reactivate_dmcrypt_plain(self): c = CephDisk() diff -Nru ceph-12.1.1/qa/workunits/ceph-helpers-root.sh ceph-12.1.2/qa/workunits/ceph-helpers-root.sh --- ceph-12.1.1/qa/workunits/ceph-helpers-root.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/ceph-helpers-root.sh 2017-08-01 17:55:40.000000000 +0000 @@ -76,6 +76,7 @@ ceph osd pool create $test_pool 4 || return 1 ceph osd pool set $test_pool size $size || return 1 ceph osd pool set $test_pool min_size $size || return 1 + ceph osd pool application enable $test_pool rados echo FOO > $dir/BAR timeout $timeout rados --pool $test_pool put BAR $dir/BAR || return 1 diff -Nru ceph-12.1.1/qa/workunits/ceph-helpers.sh ceph-12.1.2/qa/workunits/ceph-helpers.sh --- ceph-12.1.1/qa/workunits/ceph-helpers.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/ceph-helpers.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,1821 +0,0 @@ -#!/bin/bash -# -# Copyright (C) 2013,2014 Cloudwatt -# Copyright (C) 2014,2015 Red Hat -# Copyright (C) 2014 Federico Gimenez -# -# Author: Loic Dachary -# Author: Federico Gimenez -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU Library Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Library Public License for more details. -# -TIMEOUT=300 -PG_NUM=4 -: ${CEPH_BUILD_VIRTUALENV:=/tmp} - -if type xmlstarlet > /dev/null 2>&1; then - XMLSTARLET=xmlstarlet -elif type xml > /dev/null 2>&1; then - XMLSTARLET=xml -else - echo "Missing xmlstarlet binary!" 
- exit 1 -fi - -if [ `uname` = FreeBSD ]; then - SED=gsed - DIFFCOLOPTS="" -else - SED=sed - termwidth=$(stty -a | head -1 | sed -e 's/.*columns \([0-9]*\).*/\1/') - if [ -n "$termwidth" -a "$termwidth" != "0" ]; then - termwidth="-W ${termwidth}" - fi - DIFFCOLOPTS="-y $termwidth" -fi - -#! @file ceph-helpers.sh -# @brief Toolbox to manage Ceph cluster dedicated to testing -# -# Example use case: -# -# ~~~~~~~~~~~~~~~~{.sh} -# source ceph-helpers.sh -# -# function mytest() { -# # cleanup leftovers and reset mydir -# setup mydir -# # create a cluster with one monitor and three osds -# run_mon mydir a -# run_osd mydir 0 -# run_osd mydir 2 -# run_osd mydir 3 -# # put and get an object -# rados --pool rbd put GROUP /etc/group -# rados --pool rbd get GROUP /tmp/GROUP -# # stop the cluster and cleanup the directory -# teardown mydir -# } -# ~~~~~~~~~~~~~~~~ -# -# The focus is on simplicity and efficiency, in the context of -# functional tests. The output is intentionally very verbose -# and functions return as soon as an error is found. The caller -# is also expected to abort on the first error so that debugging -# can be done by looking at the end of the output. -# -# Each function is documented, implemented and tested independently. -# When modifying a helper, the test and the documentation are -# expected to be updated and it is easier of they are collocated. A -# test for a given function can be run with -# -# ~~~~~~~~~~~~~~~~{.sh} -# ceph-helpers.sh TESTS test_get_osds -# ~~~~~~~~~~~~~~~~ -# -# and all the tests (i.e. all functions matching test_*) are run -# with: -# -# ~~~~~~~~~~~~~~~~{.sh} -# ceph-helpers.sh TESTS -# ~~~~~~~~~~~~~~~~ -# -# A test function takes a single argument : the directory dedicated -# to the tests. It is expected to not create any file outside of this -# directory and remove it entirely when it completes successfully. -# - - -## -# Cleanup any leftovers found in **dir** via **teardown** -# and reset **dir** as an empty environment. -# -# @param dir path name of the environment -# @return 0 on success, 1 on error -# -function setup() { - local dir=$1 - teardown $dir || return 1 - mkdir -p $dir -} - -function test_setup() { - local dir=$dir - setup $dir || return 1 - test -d $dir || return 1 - setup $dir || return 1 - test -d $dir || return 1 - teardown $dir -} - -####################################################################### - -## -# Kill all daemons for which a .pid file exists in **dir** and remove -# **dir**. If the file system in which **dir** is btrfs, delete all -# subvolumes that relate to it. -# -# @param dir path name of the environment -# @return 0 on success, 1 on error -# -function teardown() { - local dir=$1 - kill_daemons $dir KILL - if [ `uname` != FreeBSD ] \ - && [ $(stat -f -c '%T' .) == "btrfs" ]; then - __teardown_btrfs $dir - fi - rm -fr $dir -} - -function __teardown_btrfs() { - local btrfs_base_dir=$1 - local btrfs_root=$(df -P . | tail -1 | awk '{print $NF}') - local btrfs_dirs=$(cd $btrfs_base_dir; sudo btrfs subvolume list . -t | awk '/^[0-9]/ {print $4}' | grep "$btrfs_base_dir/$btrfs_dir") - for subvolume in $btrfs_dirs; do - sudo btrfs subvolume delete $btrfs_root/$subvolume - done -} - -function test_teardown() { - local dir=$dir - setup $dir || return 1 - teardown $dir || return 1 - ! test -d $dir || return 1 -} - -####################################################################### - -## -# Sends a signal to a single daemon. 
-# This is a helper function for kill_daemons -# -# After the daemon is sent **signal**, its actual termination -# will be verified by sending it signal 0. If the daemon is -# still alive, kill_daemon will pause for a few seconds and -# try again. This will repeat for a fixed number of times -# before kill_daemon returns on failure. The list of -# sleep intervals can be specified as **delays** and defaults -# to: -# -# 0.1 0.2 1 1 1 2 3 5 5 5 10 10 20 60 60 60 120 -# -# This sequence is designed to run first a very short sleep time (0.1) -# if the machine is fast enough and the daemon terminates in a fraction of a -# second. The increasing sleep numbers should give plenty of time for -# the daemon to die even on the slowest running machine. If a daemon -# takes more than a few minutes to stop (the sum of all sleep times), -# there probably is no point in waiting more and a number of things -# are likely to go wrong anyway: better give up and return on error. -# -# @param pid the process id to send a signal -# @param send_signal the signal to send -# @param delays sequence of sleep times before failure -# -function kill_daemon() { - local pid=$(cat $1) - local send_signal=$2 - local delays=${3:-0.1 0.2 1 1 1 2 3 5 5 5 10 10 20 60 60 60 120} - local exit_code=1 - for try in $delays ; do - if kill -$send_signal $pid 2> /dev/null ; then - exit_code=1 - else - exit_code=0 - break - fi - send_signal=0 - sleep $try - done; - return $exit_code -} - -function test_kill_daemon() { - local dir=$1 - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 || return 1 - - name_prefix=osd - for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do - # - # sending signal 0 won't kill the daemon - # waiting just for one second instead of the default schedule - # allows us to quickly verify what happens when kill fails - # to stop the daemon (i.e. it must return false) - # - ! kill_daemon $pidfile 0 1 || return 1 - # - # killing just the osd and verify the mon still is responsive - # - kill_daemon $pidfile TERM || return 1 - done - - ceph osd dump | grep "osd.0 down" || return 1 - - name_prefix=mgr - for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do - # - # kill the mgr - # - kill_daemon $pidfile TERM || return 1 - done - - name_prefix=mon - for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do - # - # kill the mon and verify it cannot be reached - # - kill_daemon $pidfile TERM || return 1 - ! timeout 5 ceph status || return 1 - done - - teardown $dir || return 1 -} - -## -# Kill all daemons for which a .pid file exists in **dir**. Each -# daemon is sent a **signal** and kill_daemons waits for it to exit -# during a few minutes. By default all daemons are killed. If a -# **name_prefix** is provided, only the daemons for which a pid -# file is found matching the prefix are killed. See run_osd and -# run_mon for more information about the name conventions for -# the pid files. -# -# Send TERM to all daemons : kill_daemons $dir -# Send KILL to all daemons : kill_daemons $dir KILL -# Send KILL to all osds : kill_daemons $dir KILL osd -# Send KILL to osd 1 : kill_daemons $dir KILL osd.1 -# -# If a daemon is sent the TERM signal and does not terminate -# within a few minutes, it will still be running even after -# kill_daemons returns. 
-# -# If all daemons are kill successfully the function returns 0 -# if at least one daemon remains, this is treated as an -# error and the function return 1. -# -# @param dir path name of the environment -# @param signal name of the first signal (defaults to TERM) -# @param name_prefix only kill match daemons (defaults to all) -# @param delays sequence of sleep times before failure -# @return 0 on success, 1 on error -# -function kill_daemons() { - local trace=$(shopt -q -o xtrace && echo true || echo false) - $trace && shopt -u -o xtrace - local dir=$1 - local signal=${2:-TERM} - local name_prefix=$3 # optional, osd, mon, osd.1 - local delays=$4 #optional timing - local status=0 - local pids="" - - for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do - run_in_background pids kill_daemon $pidfile $signal $delays - done - - wait_background pids - status=$? - - $trace && shopt -s -o xtrace - return $status -} - -function test_kill_daemons() { - local dir=$1 - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 || return 1 - # - # sending signal 0 won't kill the daemon - # waiting just for one second instead of the default schedule - # allows us to quickly verify what happens when kill fails - # to stop the daemon (i.e. it must return false) - # - ! kill_daemons $dir 0 osd 1 || return 1 - # - # killing just the osd and verify the mon still is responsive - # - kill_daemons $dir TERM osd || return 1 - ceph osd dump | grep "osd.0 down" || return 1 - # - # kill the mgr - # - kill_daemons $dir TERM mgr || return 1 - # - # kill the mon and verify it cannot be reached - # - kill_daemons $dir TERM || return 1 - ! timeout 5 ceph status || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Run a monitor by the name mon.**id** with data in **dir**/**id**. -# The logs can be found in **dir**/mon.**id**.log and the pid file -# is **dir**/mon.**id**.pid and the admin socket is -# **dir**/**id**/ceph-mon.**id**.asok. -# -# The remaining arguments are passed verbatim to ceph-mon --mkfs -# and the ceph-mon daemon. -# -# Two mandatory arguments must be provided: --fsid and --mon-host -# Instead of adding them to every call to run_mon, they can be -# set in the CEPH_ARGS environment variable to be read implicitly -# by every ceph command. -# -# The CEPH_CONF variable is expected to be set to /dev/null to -# only rely on arguments for configuration. -# -# Examples: -# -# CEPH_ARGS="--fsid=$(uuidgen) " -# CEPH_ARGS+="--mon-host=127.0.0.1:7018 " -# run_mon $dir a # spawn a mon and bind port 7018 -# run_mon $dir a --debug-filestore=20 # spawn with filestore debugging -# -# If mon_initial_members is not set, the default rbd pool is deleted -# and replaced with a replicated pool with less placement groups to -# speed up initialization. If mon_initial_members is set, no attempt -# is made to recreate the rbd pool because it would hang forever, -# waiting for other mons to join. -# -# A **dir**/ceph.conf file is created but not meant to be used by any -# function. It is convenient for debugging a failure with: -# -# ceph --conf **dir**/ceph.conf -s -# -# @param dir path name of the environment -# @param id mon identifier -# @param ... 
can be any option valid for ceph-mon -# @return 0 on success, 1 on error -# -function run_mon_no_pool() { - local dir=$1 - shift - local id=$1 - shift - local data=$dir/$id - - ceph-mon \ - --id $id \ - --mkfs \ - --mon-data=$data \ - --run-dir=$dir \ - "$@" || return 1 - - ceph-mon \ - --id $id \ - --mon-osd-full-ratio=.99 \ - --mon-data-avail-crit=1 \ - --paxos-propose-interval=0.1 \ - --osd-crush-chooseleaf-type=0 \ - --erasure-code-dir=$CEPH_LIB \ - --plugin-dir=$CEPH_LIB \ - --debug-mon 20 \ - --debug-ms 20 \ - --debug-paxos 20 \ - --chdir= \ - --mon-data=$data \ - --log-file=$dir/\$name.log \ - --admin-socket=$dir/\$cluster-\$name.asok \ - --mon-cluster-log-file=$dir/log \ - --run-dir=$dir \ - --pid-file=$dir/\$name.pid \ - --mon-allow-pool-delete \ - "$@" || return 1 - - cat > $dir/ceph.conf </dev/null | \ - jq '.acting | .[]') - # get rid of the trailing space - echo $osds -} - -function test_get_osds() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=2 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 || return 1 - run_osd $dir 1 || return 1 - wait_for_clean || return 1 - get_osds rbd GROUP | grep --quiet '^[0-1] [0-1]$' || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Wait for the monitor to form quorum (optionally, of size N) -# -# @param timeout duration (lower-bound) to wait for quorum to be formed -# @param quorumsize size of quorum to wait for -# @return 0 on success, 1 on error -# -function wait_for_quorum() { - local timeout=$1 - local quorumsize=$2 - - if [[ -z "$timeout" ]]; then - timeout=300 - fi - - if [[ -z "$quorumsize" ]]; then - timeout $timeout ceph mon_status --format=json >&/dev/null || return 1 - return 0 - fi - - no_quorum=1 - wait_until=$((`date +%s` + $timeout)) - while [[ $(date +%s) -lt $wait_until ]]; do - jqfilter='.quorum | length == '$quorumsize - jqinput="$(timeout $timeout ceph mon_status --format=json 2>/dev/null)" - res=$(echo $jqinput | jq "$jqfilter") - if [[ "$res" == "true" ]]; then - no_quorum=0 - break - fi - done - return $no_quorum -} - -####################################################################### - -## -# Return the PG of supporting the **objectname** stored in -# **poolname**, as reported by ceph osd map. -# -# @param poolname an existing pool -# @param objectname an objectname (may or may not exist) -# @param STDOUT a PG -# @return 0 on success, 1 on error -# -function get_pg() { - local poolname=$1 - local objectname=$2 - - ceph --format json osd map $poolname $objectname 2>/dev/null | jq -r '.pgid' -} - -function test_get_pg() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 || return 1 - wait_for_clean || return 1 - get_pg rbd GROUP | grep --quiet '^[0-9]\.[0-9a-f][0-9a-f]*$' || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Return the value of the **config**, obtained via the config get command -# of the admin socket of **daemon**.**id**. 
-# -# @param daemon mon or osd -# @param id mon or osd ID -# @param config the configuration variable name as found in config_opts.h -# @param STDOUT the config value -# @return 0 on success, 1 on error -# -function get_config() { - local daemon=$1 - local id=$2 - local config=$3 - - CEPH_ARGS='' \ - ceph --format json daemon $dir/ceph-$daemon.$id.asok \ - config get $config 2> /dev/null | \ - jq -r ".$config" -} - -function test_get_config() { - local dir=$1 - - # override the default config using command line arg and check it - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - test $(get_config mon a osd_pool_default_size) = 1 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 --osd_max_scrubs=3 || return 1 - test $(get_config osd 0 osd_max_scrubs) = 3 || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Set the **config** to specified **value**, via the config set command -# of the admin socket of **daemon**.**id** -# -# @param daemon mon or osd -# @param id mon or osd ID -# @param config the configuration variable name as found in config_opts.h -# @param value the config value -# @return 0 on success, 1 on error -# -function set_config() { - local daemon=$1 - local id=$2 - local config=$3 - local value=$4 - - test $(env CEPH_ARGS='' ceph --format json daemon $dir/ceph-$daemon.$id.asok \ - config set $config $value 2> /dev/null | \ - jq 'has("success")') == true -} - -function test_set_config() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - test $(get_config mon a ms_crc_header) = true || return 1 - set_config mon a ms_crc_header false || return 1 - test $(get_config mon a ms_crc_header) = false || return 1 - set_config mon a ms_crc_header true || return 1 - test $(get_config mon a ms_crc_header) = true || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Return the OSD id of the primary OSD supporting the **objectname** -# stored in **poolname**, as reported by ceph osd map. -# -# @param poolname an existing pool -# @param objectname an objectname (may or may not exist) -# @param STDOUT the primary OSD id -# @return 0 on success, 1 on error -# -function get_primary() { - local poolname=$1 - local objectname=$2 - - ceph --format json osd map $poolname $objectname 2>/dev/null | \ - jq '.acting_primary' -} - -function test_get_primary() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - local osd=0 - run_mgr $dir x || return 1 - run_osd $dir $osd || return 1 - wait_for_clean || return 1 - test $(get_primary rbd GROUP) = $osd || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Return the id of any OSD supporting the **objectname** stored in -# **poolname**, as reported by ceph osd map, except the primary. -# -# @param poolname an existing pool -# @param objectname an objectname (may or may not exist) -# @param STDOUT the OSD id -# @return 0 on success, 1 on error -# -function get_not_primary() { - local poolname=$1 - local objectname=$2 - - local primary=$(get_primary $poolname $objectname) - ceph --format json osd map $poolname $objectname 2>/dev/null | \ - jq ".acting | map(select (. 
!= $primary)) | .[0]" -} - -function test_get_not_primary() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=2 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 || return 1 - run_osd $dir 1 || return 1 - wait_for_clean || return 1 - local primary=$(get_primary rbd GROUP) - local not_primary=$(get_not_primary rbd GROUP) - test $not_primary != $primary || return 1 - test $not_primary = 0 -o $not_primary = 1 || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Run ceph-objectstore-tool against the OSD **id** using the data path -# **dir**. The OSD is killed with TERM prior to running -# ceph-objectstore-tool because access to the data path is -# exclusive. The OSD is restarted after the command completes. The -# objectstore_tool returns after all PG are active+clean again. -# -# @param dir the data path of the OSD -# @param id the OSD id -# @param ... arguments to ceph-objectstore-tool -# @param STDIN the input of ceph-objectstore-tool -# @param STDOUT the output of ceph-objectstore-tool -# @return 0 on success, 1 on error -# -# The value of $ceph_osd_args will be passed to restarted osds -# -function objectstore_tool() { - local dir=$1 - shift - local id=$1 - shift - local osd_data=$dir/$id - - local osd_type=$(cat $osd_data/type) - - kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1 - - local journal_args - if [ "$objectstore_type" == "filestore" ]; then - journal_args=" --journal-path $osd_data/journal" - fi - ceph-objectstore-tool \ - --data-path $osd_data \ - $journal_args \ - "$@" || return 1 - activate_osd $dir $id $ceph_osd_args >&2 || return 1 - wait_for_clean >&2 -} - -function test_objectstore_tool() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - local osd=0 - run_mgr $dir x || return 1 - run_osd $dir $osd || return 1 - wait_for_clean || return 1 - rados --pool rbd put GROUP /etc/group || return 1 - objectstore_tool $dir $osd GROUP get-bytes | \ - diff - /etc/group - ! objectstore_tool $dir $osd NOTEXISTS get-bytes || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Predicate checking if there is an ongoing recovery in the -# cluster. If any of the recovering_{keys,bytes,objects}_per_sec -# counters are reported by ceph status, it means recovery is in -# progress. -# -# @return 0 if recovery in progress, 1 otherwise -# -function get_is_making_recovery_progress() { - local recovery_progress - recovery_progress+=".recovering_keys_per_sec + " - recovery_progress+=".recovering_bytes_per_sec + " - recovery_progress+=".recovering_objects_per_sec" - local progress=$(ceph --format json status 2>/dev/null | \ - jq -r ".pgmap | $recovery_progress") - test "$progress" != null -} - -function test_get_is_making_recovery_progress() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a || return 1 - run_mgr $dir x || return 1 - ! get_is_making_recovery_progress || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Return the number of active PGs in the cluster. A PG is active if -# ceph pg dump pgs reports it both **active** and **clean** and that -# not **stale**. 
-# -# @param STDOUT the number of active PGs -# @return 0 on success, 1 on error -# -function get_num_active_clean() { - local expression - expression+="select(contains(\"active\") and contains(\"clean\")) | " - expression+="select(contains(\"stale\") | not)" - ceph --format json pg dump pgs 2>/dev/null | \ - jq "[.[] | .state | $expression] | length" -} - -function test_get_num_active_clean() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 || return 1 - wait_for_clean || return 1 - local num_active_clean=$(get_num_active_clean) - test "$num_active_clean" = $PG_NUM || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Return the number of PGs in the cluster, according to -# ceph pg dump pgs. -# -# @param STDOUT the number of PGs -# @return 0 on success, 1 on error -# -function get_num_pgs() { - ceph --format json status 2>/dev/null | jq '.pgmap.num_pgs' -} - -function test_get_num_pgs() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 || return 1 - wait_for_clean || return 1 - local num_pgs=$(get_num_pgs) - test "$num_pgs" -gt 0 || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Return the date and time of the last completed scrub for **pgid**, -# as reported by ceph pg dump pgs. Note that a repair also sets this -# date. -# -# @param pgid the id of the PG -# @param STDOUT the date and time of the last scrub -# @return 0 on success, 1 on error -# -function get_last_scrub_stamp() { - local pgid=$1 - local sname=${2:-last_scrub_stamp} - ceph --format json pg dump pgs 2>/dev/null | \ - jq -r ".[] | select(.pgid==\"$pgid\") | .$sname" -} - -function test_get_last_scrub_stamp() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 || return 1 - wait_for_clean || return 1 - stamp=$(get_last_scrub_stamp 2.0) - test -n "$stamp" || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Predicate checking if the cluster is clean, i.e. all of its PGs are -# in a clean state (see get_num_active_clean for a definition). -# -# @return 0 if the cluster is clean, 1 otherwise -# -function is_clean() { - num_pgs=$(get_num_pgs) - test $num_pgs != 0 || return 1 - test $(get_num_active_clean) = $num_pgs || return 1 -} - -function test_is_clean() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 || return 1 - wait_for_clean || return 1 - is_clean || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Return a list of numbers that are increasingly larger and whose -# total is **timeout** seconds. It can be used to have short sleep -# delay while waiting for an event on a fast machine. But if running -# very slowly the larger delays avoid stressing the machine even -# further or spamming the logs. 
-# -# @param timeout sum of all delays, in seconds -# @return a list of sleep delays -# -function get_timeout_delays() { - local trace=$(shopt -q -o xtrace && echo true || echo false) - $trace && shopt -u -o xtrace - local timeout=$1 - local first_step=${2:-1} - - local i - local total="0" - i=$first_step - while test "$(echo $total + $i \<= $timeout | bc -l)" = "1"; do - echo -n "$i " - total=$(echo $total + $i | bc -l) - i=$(echo $i \* 2 | bc -l) - done - if test "$(echo $total \< $timeout | bc -l)" = "1"; then - echo -n $(echo $timeout - $total | bc -l) - fi - $trace && shopt -s -o xtrace -} - -function test_get_timeout_delays() { - test "$(get_timeout_delays 1)" = "1 " || return 1 - test "$(get_timeout_delays 5)" = "1 2 2" || return 1 - test "$(get_timeout_delays 6)" = "1 2 3" || return 1 - test "$(get_timeout_delays 7)" = "1 2 4 " || return 1 - test "$(get_timeout_delays 8)" = "1 2 4 1" || return 1 - test "$(get_timeout_delays 1 .1)" = ".1 .2 .4 .3" || return 1 - test "$(get_timeout_delays 1.5 .1)" = ".1 .2 .4 .8 " || return 1 - test "$(get_timeout_delays 5 .1)" = ".1 .2 .4 .8 1.6 1.9" || return 1 - test "$(get_timeout_delays 6 .1)" = ".1 .2 .4 .8 1.6 2.9" || return 1 - test "$(get_timeout_delays 6.3 .1)" = ".1 .2 .4 .8 1.6 3.2 " || return 1 - test "$(get_timeout_delays 20 .1)" = ".1 .2 .4 .8 1.6 3.2 6.4 7.3" || return 1 -} - -####################################################################### - -## -# Wait until the cluster becomes clean or if it does not make progress -# for $TIMEOUT seconds. -# Progress is measured either via the **get_is_making_recovery_progress** -# predicate or if the number of clean PGs changes (as returned by get_num_active_clean) -# -# @return 0 if the cluster is clean, 1 otherwise -# -function wait_for_clean() { - local num_active_clean=-1 - local cur_active_clean - local -a delays=($(get_timeout_delays $TIMEOUT .1)) - local -i loop=0 - - while test $(get_num_pgs) == 0 ; do - sleep 1 - done - - while true ; do - # Comparing get_num_active_clean & get_num_pgs is used to determine - # if the cluster is clean. That's almost an inline of is_clean() to - # get more performance by avoiding multiple calls of get_num_active_clean. - cur_active_clean=$(get_num_active_clean) - test $cur_active_clean = $(get_num_pgs) && break - if test $cur_active_clean != $num_active_clean ; then - loop=0 - num_active_clean=$cur_active_clean - elif get_is_making_recovery_progress ; then - loop=0 - elif (( $loop >= ${#delays[*]} )) ; then - ceph report - return 1 - fi - sleep ${delays[$loop]} - loop+=1 - done - return 0 -} - -function test_wait_for_clean() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - run_mgr $dir x || return 1 - ! TIMEOUT=1 wait_for_clean || return 1 - run_osd $dir 0 || return 1 - wait_for_clean || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Wait until the cluster becomes HEALTH_OK again or if it does not make progress -# for $TIMEOUT seconds. -# -# @return 0 if the cluster is HEALTHY, 1 otherwise -# -function wait_for_health() { - local grepstr=$1 - local -a delays=($(get_timeout_delays $TIMEOUT .1)) - local -i loop=0 - - while ! 
ceph health detail | grep "$grepstr" ; do - if (( $loop >= ${#delays[*]} )) ; then - ceph health detail - return 1 - fi - sleep ${delays[$loop]} - loop+=1 - done -} - -function wait_for_health_ok() { - wait_for_health "HEALTH_OK" || return 1 -} - -function test_wait_for_health_ok() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 --osd_failsafe_full_ratio=.99 --mon_pg_warn_min_per_osd=0 || return 1 - run_mgr $dir x --mon_pg_warn_min_per_osd=0 || return 1 - run_osd $dir 0 || return 1 - kill_daemons $dir TERM osd || return 1 - ! TIMEOUT=1 wait_for_health_ok || return 1 - activate_osd $dir 0 || return 1 - wait_for_health_ok || return 1 - teardown $dir || return 1 -} - - -####################################################################### - -## -# Run repair on **pgid** and wait until it completes. The repair -# function will fail if repair does not complete within $TIMEOUT -# seconds. -# -# @param pgid the id of the PG -# @return 0 on success, 1 on error -# -function repair() { - local pgid=$1 - local last_scrub=$(get_last_scrub_stamp $pgid) - ceph pg repair $pgid - wait_for_scrub $pgid "$last_scrub" -} - -function test_repair() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 || return 1 - wait_for_clean || return 1 - repair 2.0 || return 1 - kill_daemons $dir KILL osd || return 1 - ! TIMEOUT=1 repair 2.0 || return 1 - teardown $dir || return 1 -} -####################################################################### - -## -# Run scrub on **pgid** and wait until it completes. The pg_scrub -# function will fail if repair does not complete within $TIMEOUT -# seconds. The pg_scrub is complete whenever the -# **get_last_scrub_stamp** function reports a timestamp different from -# the one stored before starting the scrub. -# -# @param pgid the id of the PG -# @return 0 on success, 1 on error -# -function pg_scrub() { - local pgid=$1 - local last_scrub=$(get_last_scrub_stamp $pgid) - ceph pg scrub $pgid - wait_for_scrub $pgid "$last_scrub" -} - -function pg_deep_scrub() { - local pgid=$1 - local last_scrub=$(get_last_scrub_stamp $pgid last_deep_scrub_stamp) - ceph pg deep-scrub $pgid - wait_for_scrub $pgid "$last_scrub" last_deep_scrub_stamp -} - -function test_pg_scrub() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 || return 1 - wait_for_clean || return 1 - pg_scrub 2.0 || return 1 - kill_daemons $dir KILL osd || return 1 - ! TIMEOUT=1 pg_scrub 2.0 || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Run the *command* and expect it to fail (i.e. return a non zero status). -# The output (stderr and stdout) is stored in a temporary file in *dir* -# and is expected to contain the string *expected*. -# -# Return 0 if the command failed and the string was found. Otherwise -# return 1 and cat the full output of the command on stderr for debug. -# -# @param dir temporary directory to store the output -# @param expected string to look for in the output -# @param command ... the command and its arguments -# @return 0 on success, 1 on error -# - -function expect_failure() { - local dir=$1 - shift - local expected="$1" - shift - local success - - if "$@" > $dir/out 2>&1 ; then - success=true - else - success=false - fi - - if $success || ! 
grep --quiet "$expected" $dir/out ; then - cat $dir/out >&2 - return 1 - else - return 0 - fi -} - -function test_expect_failure() { - local dir=$1 - - setup $dir || return 1 - expect_failure $dir FAIL bash -c 'echo FAIL ; exit 1' || return 1 - # the command did not fail - ! expect_failure $dir FAIL bash -c 'echo FAIL ; exit 0' > $dir/out || return 1 - grep --quiet FAIL $dir/out || return 1 - # the command failed but the output does not contain the expected string - ! expect_failure $dir FAIL bash -c 'echo UNEXPECTED ; exit 1' > $dir/out || return 1 - ! grep --quiet FAIL $dir/out || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Given the *last_scrub*, wait for scrub to happen on **pgid**. It -# will fail if scrub does not complete within $TIMEOUT seconds. The -# repair is complete whenever the **get_last_scrub_stamp** function -# reports a timestamp different from the one given in argument. -# -# @param pgid the id of the PG -# @param last_scrub timestamp of the last scrub for *pgid* -# @return 0 on success, 1 on error -# -function wait_for_scrub() { - local pgid=$1 - local last_scrub="$2" - local sname=${3:-last_scrub_stamp} - - for ((i=0; i < $TIMEOUT; i++)); do - if test "$last_scrub" != "$(get_last_scrub_stamp $pgid $sname)" ; then - return 0 - fi - sleep 1 - done - return 1 -} - -function test_wait_for_scrub() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 || return 1 - wait_for_clean || return 1 - local pgid=2.0 - ceph pg repair $pgid - local last_scrub=$(get_last_scrub_stamp $pgid) - wait_for_scrub $pgid "$last_scrub" || return 1 - kill_daemons $dir KILL osd || return 1 - last_scrub=$(get_last_scrub_stamp $pgid) - ! TIMEOUT=1 wait_for_scrub $pgid "$last_scrub" || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Return 0 if the erasure code *plugin* is available, 1 otherwise. -# -# @param plugin erasure code plugin -# @return 0 on success, 1 on error -# - -function erasure_code_plugin_exists() { - local plugin=$1 - local status - local grepstr - local s - case `uname` in - FreeBSD) grepstr="Cannot open.*$plugin" ;; - *) grepstr="$plugin.*No such file" ;; - esac - - s=$(ceph osd erasure-code-profile set TESTPROFILE plugin=$plugin 2>&1) - local status=$? - if [ $status -eq 0 ]; then - ceph osd erasure-code-profile rm TESTPROFILE - elif ! echo $s | grep --quiet "$grepstr" ; then - status=1 - # display why the string was rejected. - echo $s - fi - return $status -} - -function test_erasure_code_plugin_exists() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a || return 1 - run_mgr $dir x || return 1 - erasure_code_plugin_exists jerasure || return 1 - ! erasure_code_plugin_exists FAKE || return 1 - teardown $dir || return 1 -} - -####################################################################### - -## -# Display all log files from **dir** on stdout. 
-# -# @param dir directory in which all data is stored -# - -function display_logs() { - local dir=$1 - - find $dir -maxdepth 1 -name '*.log' | \ - while read file ; do - echo "======================= $file" - cat $file - done -} - -function test_display_logs() { - local dir=$1 - - setup $dir || return 1 - run_mon $dir a || return 1 - kill_daemons $dir || return 1 - display_logs $dir > $dir/log.out - grep --quiet mon.a.log $dir/log.out || return 1 - teardown $dir || return 1 -} - -####################################################################### -## -# Spawn a command in background and save the pid in the variable name -# passed in argument. To make the output reading easier, the output is -# prepend with the process id. -# -# Example: -# pids1="" -# run_in_background pids1 bash -c 'sleep 1; exit 1' -# -# @param pid_variable the variable name (not value) where the pids will be stored -# @param ... the command to execute -# @return only the pid_variable output should be considered and used with **wait_background** -# -function run_in_background() { - local pid_variable=$1 - shift; - # Execute the command and prepend the output with its pid - # We enforce to return the exit status of the command and not the awk one. - ("$@" |& awk '{ a[i++] = $0 }END{for (i = 0; i in a; ++i) { print "'$$': " a[i]} }'; return ${PIPESTATUS[0]}) >&2 & - eval "$pid_variable+=\" $!\"" -} - -function test_run_in_background() { - local pids - run_in_background pids sleep 1 - run_in_background pids sleep 1 - test $(echo $pids | wc -w) = 2 || return 1 - wait $pids || return 1 -} - -####################################################################### -## -# Wait for pids running in background to complete. -# This function is usually used after a **run_in_background** call -# Example: -# pids1="" -# run_in_background pids1 bash -c 'sleep 1; exit 1' -# wait_background pids1 -# -# @param pids The variable name that contains the active PIDS. Set as empty at then end of the function. -# @return returns 1 if at least one process exits in error unless returns 0 -# -function wait_background() { - # We extract the PIDS from the variable name - pids=${!1} - - return_code=0 - for pid in $pids; do - if ! wait $pid; then - # If one process failed then return 1 - return_code=1 - fi - done - - # We empty the variable reporting that all process ended - eval "$1=''" - - return $return_code -} - - -function test_wait_background() { - local pids="" - run_in_background pids bash -c "sleep 1; exit 1" - run_in_background pids bash -c "sleep 2; exit 0" - wait_background pids - if [ $? -ne 1 ]; then return 1; fi - - run_in_background pids bash -c "sleep 1; exit 0" - run_in_background pids bash -c "sleep 2; exit 0" - wait_background pids - if [ $? -ne 0 ]; then return 1; fi - - if [ ! 
-z "$pids" ]; then return 1; fi -} - -function flush_pg_stats() -{ - local timeout=${1:-$TIMEOUT} - - ids=`ceph osd ls` - seqs='' - for osd in $ids; do - seq=`ceph tell osd.$osd flush_pg_stats` - seqs="$seqs $osd-$seq" - done - - for s in $seqs; do - osd=`echo $s | cut -d - -f 1` - seq=`echo $s | cut -d - -f 2` - echo "waiting osd.$osd seq $seq" - while test $(ceph osd last-stat-seq $osd) -lt $seq; do - sleep 1 - if [ $((timeout--)) -eq 0 ]; then - return 1 - fi - done - done -} - -function test_flush_pg_stats() -{ - local dir=$1 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 || return 1 - run_mgr $dir x || return 1 - run_osd $dir 0 || return 1 - rados -p rbd put obj /etc/group - flush_pg_stats - local jq_filter='.pools | .[] | select(.name == "rbd") | .stats' - raw_bytes_used=`ceph df detail --format=json | jq "$jq_filter.raw_bytes_used"` - bytes_used=`ceph df detail --format=json | jq "$jq_filter.bytes_used"` - test $raw_bytes_used > 0 || return 1 - test $raw_bytes_used == $bytes_used || return 1 -} - -####################################################################### - -## -# Call the **run** function (which must be defined by the caller) with -# the **dir** argument followed by the caller argument list. -# -# If the **run** function returns on error, all logs found in **dir** -# are displayed for diagnostic purposes. -# -# **teardown** function is called when the **run** function returns -# (on success or on error), to cleanup leftovers. The CEPH_CONF is set -# to /dev/null and CEPH_ARGS is unset so that the tests are protected from -# external interferences. -# -# It is the responsibility of the **run** function to call the -# **setup** function to prepare the test environment (create a temporary -# directory etc.). -# -# The shell is required (via PS4) to display the function and line -# number whenever a statement is executed to help debugging. -# -# @param dir directory in which all data is stored -# @param ... 
arguments passed transparently to **run** -# @return 0 on success, 1 on error -# -function main() { - local dir=td/$1 - shift - - shopt -s -o xtrace - PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: ' - - export PATH=${CEPH_BUILD_VIRTUALENV}/ceph-disk-virtualenv/bin:${CEPH_BUILD_VIRTUALENV}/ceph-detect-init-virtualenv/bin:.:$PATH # make sure program from sources are preferred - #export PATH=$CEPH_ROOT/src/ceph-disk/virtualenv/bin:$CEPH_ROOT/src/ceph-detect-init/virtualenv/bin:.:$PATH # make sure program from sources are preferred - - export CEPH_CONF=/dev/null - unset CEPH_ARGS - - local code - if run $dir "$@" ; then - code=0 - else - display_logs $dir - code=1 - fi - teardown $dir || return 1 - return $code -} - -####################################################################### - -function run_tests() { - shopt -s -o xtrace - PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: ' - - export PATH=${CEPH_BUILD_VIRTUALENV}/ceph-disk-virtualenv/bin:${CEPH_BUILD_VIRTUALENV}/ceph-detect-init-virtualenv/bin:.:$PATH # make sure program from sources are preferred - #export PATH=$CEPH_ROOT/src/ceph-disk/virtualenv/bin:$CEPH_ROOT/src/ceph-detect-init/virtualenv/bin:.:$PATH # make sure program from sources are preferred - - export CEPH_MON="127.0.0.1:7109" # git grep '\<7109\>' : there must be only one - export CEPH_ARGS - CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " - CEPH_ARGS+="--mon-host=$CEPH_MON " - export CEPH_CONF=/dev/null - - local funcs=${@:-$(set | sed -n -e 's/^\(test_[0-9a-z_]*\) .*/\1/p')} - local dir=td/ceph-helpers - - for func in $funcs ; do - $func $dir || return 1 - done -} - -if test "$1" = TESTS ; then - shift - run_tests "$@" -fi - -# NOTE: -# jq only support --exit-status|-e from version 1.4 forwards, which makes -# returning on error waaaay prettier and straightforward. -# However, the current automated upstream build is running with v1.3, -# which has no idea what -e is. Hence the convoluted error checking we -# need. Sad. -# The next time someone changes this code, please check if v1.4 is now -# a thing, and, if so, please change these to use -e. Thanks. 
- -# jq '.all.supported | select([.[] == "foo"] | any)' -function jq_success() { - input="$1" - filter="$2" - expects="\"$3\"" - - in_escaped=$(printf %s "$input" | sed "s/'/'\\\\''/g") - filter_escaped=$(printf %s "$filter" | sed "s/'/'\\\\''/g") - - ret=$(echo "$in_escaped" | jq "$filter_escaped") - if [[ "$ret" == "true" ]]; then - return 0 - elif [[ -n "$expects" ]]; then - if [[ "$ret" == "$expects" ]]; then - return 0 - fi - fi - return 1 - input=$1 - filter=$2 - expects="$3" - - ret="$(echo $input | jq \"$filter\")" - if [[ "$ret" == "true" ]]; then - return 0 - elif [[ -n "$expects" && "$ret" == "$expects" ]]; then - return 0 - fi - return 1 -} - -# Local Variables: -# compile-command: "cd ../../src ; make -j4 && ../qa/workunits/ceph-helpers.sh TESTS # test_get_config" -# End: diff -Nru ceph-12.1.1/qa/workunits/cephtool/test.sh ceph-12.1.2/qa/workunits/cephtool/test.sh --- ceph-12.1.1/qa/workunits/cephtool/test.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/cephtool/test.sh 2017-08-01 17:55:40.000000000 +0000 @@ -2,7 +2,7 @@ # -*- mode:shell-script; tab-width:8; sh-basic-offset:2; indent-tabs-mode:t -*- # vim: ts=8 sw=8 ft=bash smarttab -source $(dirname $0)/../ceph-helpers.sh +source $(dirname $0)/../../standalone/ceph-helpers.sh set -e set -o functrace @@ -14,9 +14,9 @@ { local client=$1 - if test -n "$CEPH_OUT_DIR"; + if test -n "$CEPH_ASOK_DIR"; then - echo $CEPH_OUT_DIR/$client.asok + echo $(get_asok_dir)/$client.asok else local cluster=$(echo $CEPH_ARGS | sed -r 's/.*--cluster[[:blank:]]*([[:alnum:]]*).*/\1/') echo "/var/run/ceph/$cluster-$client.asok" @@ -163,6 +163,9 @@ if [ -n "$1" ]; then whatch_opt=--watch-$1 + if [ -n "$2" ]; then + whatch_opt+=" --watch-channel $2" + fi fi CEPH_WATCH_FILE=${TEMP_DIR}/CEPH_WATCH_$$ @@ -264,6 +267,7 @@ local slow=slow_eviction local fast=fast_eviction ceph osd pool create $slow 1 1 + ceph osd pool application enable $slow rados ceph osd pool create $fast 1 1 ceph osd tier add $slow $fast ceph osd tier cache-mode $fast writeback @@ -306,7 +310,9 @@ { # tiering ceph osd pool create slow 2 + ceph osd pool application enable slow rados ceph osd pool create slow2 2 + ceph osd pool application enable slow2 rados ceph osd pool create cache 2 ceph osd pool create cache2 2 ceph osd tier add slow cache @@ -392,6 +398,7 @@ { # make sure we can't clobber snapshot state ceph osd pool create snap_base 2 + ceph osd pool application enable snap_base rados ceph osd pool create snap_cache 2 ceph osd pool mksnap snap_cache snapname expect_false ceph osd tier add snap_base snap_cache @@ -403,6 +410,7 @@ { # make sure we can't create snapshot on tier ceph osd pool create basex 2 + ceph osd pool application enable basex rados ceph osd pool create cachex 2 ceph osd tier add basex cachex expect_false ceph osd pool mksnap cache snapname @@ -417,6 +425,7 @@ ceph osd pool create eccache 2 2 erasure expect_false ceph osd set-require-min-compat-client bobtail ceph osd pool create repbase 2 + ceph osd pool application enable repbase rados expect_false ceph osd tier add repbase eccache ceph osd pool delete repbase repbase --yes-i-really-really-mean-it ceph osd pool delete eccache eccache --yes-i-really-really-mean-it @@ -426,6 +435,7 @@ { # convenient add-cache command ceph osd pool create slow 2 + ceph osd pool application enable slow rados ceph osd pool create cache3 2 ceph osd tier add-cache slow cache3 1024000 ceph osd dump | grep cache3 | grep bloom | grep 'false_positive_probability: 0.05' | grep 'target_bytes 1024000' | grep '1200s x4' @@ 
-443,6 +453,7 @@ { # check add-cache whether work ceph osd pool create datapool 2 + ceph osd pool application enable datapool rados ceph osd pool create cachepool 2 ceph osd tier add-cache datapool cachepool 1024000 ceph osd tier cache-mode cachepool writeback @@ -460,6 +471,7 @@ { # protection against pool removal when used as tiers ceph osd pool create datapool 2 + ceph osd pool application enable datapool rados ceph osd pool create cachepool 2 ceph osd tier add-cache datapool cachepool 1024000 ceph osd pool delete cachepool cachepool --yes-i-really-really-mean-it 2> $TMPFILE || true @@ -477,6 +489,7 @@ ## check health check ceph osd set notieragent ceph osd pool create datapool 2 + ceph osd pool application enable datapool rados ceph osd pool create cache4 2 ceph osd tier add-cache datapool cache4 1024000 ceph osd tier cache-mode cache4 writeback @@ -503,7 +516,9 @@ # results in a 'pool foo is now (or already was) not a tier of bar' # ceph osd pool create basepoolA 2 + ceph osd pool application enable basepoolA rados ceph osd pool create basepoolB 2 + ceph osd pool application enable basepoolB rados poolA_id=$(ceph osd dump | grep 'pool.*basepoolA' | awk '{print $2;}') poolB_id=$(ceph osd dump | grep 'pool.*basepoolB' | awk '{print $2;}') @@ -537,6 +552,7 @@ ceph auth add client.xx -i client.xx.keyring rm -f client.xx.keyring ceph auth list | grep client.xx + ceph auth ls | grep client.xx ceph auth get client.xx | grep caps | grep mon ceph auth get client.xx | grep caps | grep osd ceph auth get-key client.xx @@ -575,9 +591,9 @@ check_response "auid = $auid" ceph --format json-pretty auth get client.TEST > $TMPFILE check_response '"auid": '$auid - ceph auth list > $TMPFILE + ceph auth ls > $TMPFILE check_response "auid: $auid" - ceph --format json-pretty auth list > $TMPFILE + ceph --format json-pretty auth ls > $TMPFILE check_response '"auid": '$auid ceph auth del client.TEST } @@ -603,7 +619,7 @@ check_response "EACCES: access denied" ceph -n client.xx-profile-ro -k client.xx.keyring osd set noout >& $TMPFILE || true check_response "EACCES: access denied" - ceph -n client.xx-profile-ro -k client.xx.keyring auth list >& $TMPFILE || true + ceph -n client.xx-profile-ro -k client.xx.keyring auth ls >& $TMPFILE || true check_response "EACCES: access denied" # read-write is allowed for all read-write commands (except auth) @@ -616,11 +632,11 @@ ceph -n client.xx-profile-rw -k client.xx.keyring osd set noout ceph -n client.xx-profile-rw -k client.xx.keyring osd unset noout # read-write gets access denied for auth commands - ceph -n client.xx-profile-rw -k client.xx.keyring auth list >& $TMPFILE || true + ceph -n client.xx-profile-rw -k client.xx.keyring auth ls >& $TMPFILE || true check_response "EACCES: access denied" # role-definer is allowed RWX 'auth' commands and read-only 'mon' commands - ceph -n client.xx-profile-rd -k client.xx.keyring auth list + ceph -n client.xx-profile-rd -k client.xx.keyring auth ls ceph -n client.xx-profile-rd -k client.xx.keyring auth export ceph -n client.xx-profile-rd -k client.xx.keyring auth add client.xx-profile-foo ceph -n client.xx-profile-rd -k client.xx.keyring status @@ -733,6 +749,12 @@ ceph mon count-metadata ceph_version ceph mon versions + ceph mgr metadata + ceph mgr versions + ceph mgr count-metadata ceph_version + + ceph versions + ceph node ls } @@ -795,7 +817,7 @@ # in the cluster at all function mds_exists() { - ceph auth list | grep "^mds" + ceph auth ls | grep "^mds" } # some of the commands are just not idempotent. 
@@ -1327,6 +1349,21 @@ } +function test_mon_config_key() +{ + key=asdfasdfqwerqwreasdfuniquesa123df + ceph config-key list | grep -c $key | grep 0 + ceph config-key get $key | grep -c bar | grep 0 + ceph config-key set $key bar + ceph config-key get $key | grep bar + ceph config-key list | grep -c $key | grep 1 + ceph config-key dump | grep $key | grep bar + ceph config-key rm $key + expect_false ceph config-key get $key + ceph config-key list | grep -c $key | grep 0 + ceph config-key dump | grep -c $key | grep 0 +} + function test_mon_osd() { # @@ -1581,6 +1618,7 @@ ceph osd ls ceph osd pool create data 10 + ceph osd pool application enable data rados ceph osd lspools | grep data ceph osd map data foo | grep 'pool.*data.*object.*foo.*pg.*up.*acting' ceph osd map data foo namespace| grep 'pool.*data.*object.*namespace/foo.*pg.*up.*acting' @@ -1595,12 +1633,16 @@ ceph osd tree down ceph osd tree in ceph osd tree out + ceph osd tree destroyed ceph osd tree up in ceph osd tree up out ceph osd tree down in ceph osd tree down out ceph osd tree out down expect_false ceph osd tree up down + expect_false ceph osd tree up destroyed + expect_false ceph osd tree down destroyed + expect_false ceph osd tree up down destroyed expect_false ceph osd tree in out expect_false ceph osd tree up foo @@ -1640,6 +1682,7 @@ # osd pool # ceph osd pool create data 10 + ceph osd pool application enable data rados ceph osd pool mksnap data datasnap rados -p data lssnap | grep datasnap ceph osd pool rmsnap data datasnap @@ -1647,6 +1690,7 @@ ceph osd pool delete data data --yes-i-really-really-mean-it ceph osd pool create data2 10 + ceph osd pool application enable data2 rados ceph osd pool rename data2 data3 ceph osd lspools | grep data3 ceph osd pool delete data3 data3 --yes-i-really-really-mean-it @@ -1655,15 +1699,17 @@ ceph osd pool create replicated 12 12 replicated ceph osd pool create replicated 12 12 # default is replicated ceph osd pool create replicated 12 # default is replicated, pgp_num = pg_num + ceph osd pool application enable replicated rados # should fail because the type is not the same expect_false ceph osd pool create replicated 12 12 erasure ceph osd lspools | grep replicated ceph osd pool create ec_test 1 1 erasure + ceph osd pool application enable ec_test rados set +e - ceph osd metadata | grep osd_objectstore_type | grep -qc bluestore - if [ $? -eq 0 ]; then + ceph osd count-metadata osd_objectstore | grep 'bluestore' + if [ $? -eq 1 ]; then # enable ec_overwrites on non-bluestore pools should fail ceph osd pool set ec_test allow_ec_overwrites true >& $TMPFILE - check_response $? 22 "pool must only be stored on bluestore for scrubbing to work" + check_response "pool must only be stored on bluestore for scrubbing to work" $? 
22 else ceph osd pool set ec_test allow_ec_overwrites true || return 1 expect_false ceph osd pool set ec_test allow_ec_overwrites false @@ -1681,6 +1727,7 @@ # create tmp pool ceph osd pool create tmp-quota-pool 36 + ceph osd pool application enable tmp-quota-pool rados # # set erroneous quotas # @@ -1857,6 +1904,7 @@ { TEST_POOL_GETSET=pool_getset ceph osd pool create $TEST_POOL_GETSET 1 + ceph osd pool application enable $TEST_POOL_GETSET rados wait_for_clean ceph osd pool get $TEST_POOL_GETSET all @@ -1871,6 +1919,7 @@ ceph osd pool set $TEST_POOL_GETSET size $old_size ceph osd pool create pool_erasure 1 1 erasure + ceph osd pool application enable pool_erasure rados wait_for_clean set +e ceph osd pool set pool_erasure size 4444 2>$TMPFILE @@ -1942,12 +1991,12 @@ ceph osd pool set $TEST_POOL_GETSET pgp_num 10 old_pgs=$(ceph osd pool get $TEST_POOL_GETSET pg_num | sed -e 's/pg_num: //') - new_pgs=$(($old_pgs+$(ceph osd stat | grep osdmap | awk '{print $3}')*32)) + new_pgs=$(($old_pgs + $(ceph osd stat --format json | jq '.num_osds') * 32)) ceph osd pool set $TEST_POOL_GETSET pg_num $new_pgs ceph osd pool set $TEST_POOL_GETSET pgp_num $new_pgs wait_for_clean old_pgs=$(ceph osd pool get $TEST_POOL_GETSET pg_num | sed -e 's/pg_num: //') - new_pgs=$(($old_pgs+$(ceph osd stat | grep osdmap | awk '{print $3}')*32+1)) + new_pgs=$(($old_pgs + $(ceph osd stat --format json | jq '.num_osds') * 32 + 1)) expect_false ceph osd pool set $TEST_POOL_GETSET pg_num $new_pgs ceph osd pool set $TEST_POOL_GETSET nosizechange 1 @@ -1964,11 +2013,6 @@ expect_false ceph osd pool set $TEST_POOL_GETSET hashpspool 1 ceph osd pool set $TEST_POOL_GETSET hashpspool 1 --yes-i-really-mean-it - ceph osd pool set $TEST_POOL_GETSET nodelete 1 - expect_false ceph osd pool delete $TEST_POOL_GETSET $TEST_POOL_GETSET --yes-i-really-really-mean-it - ceph osd pool set $TEST_POOL_GETSET nodelete 0 - ceph osd pool delete $TEST_POOL_GETSET $TEST_POOL_GETSET --yes-i-really-really-mean-it - ceph osd pool get rbd crush_rule | grep 'crush_rule: ' ceph osd pool get $TEST_POOL_GETSET compression_mode | expect_false grep '.' @@ -2004,6 +2048,12 @@ ceph osd pool set $TEST_POOL_GETSET $size 0 ceph osd pool get $TEST_POOL_GETSET $size | expect_false grep '.' 
done + + ceph osd pool set $TEST_POOL_GETSET nodelete 1 + expect_false ceph osd pool delete $TEST_POOL_GETSET $TEST_POOL_GETSET --yes-i-really-really-mean-it + ceph osd pool set $TEST_POOL_GETSET nodelete 0 + ceph osd pool delete $TEST_POOL_GETSET $TEST_POOL_GETSET --yes-i-really-really-mean-it + } function test_mon_osd_tiered_pool_set() @@ -2058,6 +2108,7 @@ # this is not a tier pool ceph osd pool create fake-tier 2 + ceph osd pool application enable fake-tier rados wait_for_clean expect_false ceph osd pool set fake-tier hit_set_type explicit_hash @@ -2246,11 +2297,11 @@ sleep 1 - ceph_watch_start debug + ceph_watch_start debug audit ceph tell mon.a version ceph_watch_wait 'mon.a \[DBG\] from.*cmd=\[{"prefix": "version"}\]: dispatch' - ceph_watch_start debug + ceph_watch_start debug audit ceph tell mon.b version ceph_watch_wait 'mon.b \[DBG\] from.*cmd=\[{"prefix": "version"}\]: dispatch' } @@ -2295,6 +2346,7 @@ # RAW USED The near raw used per pool in raw total ceph osd pool create cephdf_for_test 32 32 replicated + ceph osd pool application enable cephdf_for_test rados ceph osd pool set cephdf_for_test size 2 dd if=/dev/zero of=./cephdf_for_test bs=4k count=1 @@ -2318,6 +2370,44 @@ expect_false test $cal_raw_used_size != $raw_used_size } +function test_mon_pool_application() +{ + ceph osd pool create app_for_test 10 + + ceph osd pool application enable app_for_test rbd + expect_false ceph osd pool application enable app_for_test rgw + ceph osd pool application enable app_for_test rgw --yes-i-really-mean-it + ceph osd pool ls detail | grep "application rbd,rgw" + ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{},"rgw":{}}' + + expect_false ceph osd pool application set app_for_test cephfs key value + ceph osd pool application set app_for_test rbd key1 value1 + ceph osd pool application set app_for_test rbd key2 value2 + ceph osd pool application set app_for_test rgw key1 value1 + + ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{"key1":"value1","key2":"value2"},"rgw":{"key1":"value1"}}' + + ceph osd pool application rm app_for_test rgw key1 + ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{"key1":"value1","key2":"value2"},"rgw":{}}' + ceph osd pool application rm app_for_test rbd key2 + ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{"key1":"value1"},"rgw":{}}' + ceph osd pool application rm app_for_test rbd key1 + ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{},"rgw":{}}' + ceph osd pool application rm app_for_test rbd key1 # should be idempotent + + expect_false ceph osd pool application disable app_for_test rgw + ceph osd pool application disable app_for_test rgw --yes-i-really-mean-it + ceph osd pool application disable app_for_test rgw --yes-i-really-mean-it # should be idempotent + ceph osd pool ls detail | grep "application rbd" + ceph osd pool ls detail --format=json | grep '"application_metadata":{"rbd":{}}' + + ceph osd pool application disable app_for_test rgw --yes-i-really-mean-it + ceph osd pool ls detail | grep -v "application " + ceph osd pool ls detail --format=json | grep '"application_metadata":{}' + + ceph osd pool rm app_for_test app_for_test --yes-i-really-really-mean-it +} + function test_mon_tell_help_command() { ceph tell mon.a help @@ -2326,6 +2416,12 @@ expect_false ceph tell mon.zzz help } +function test_mon_stdin_stdout() +{ + echo foo | ceph config-key set test_key -i - + ceph config-key get test_key -o - | grep 
-c foo | grep -q 1 +} + function test_osd_tell_help_command() { ceph tell osd.1 help @@ -2335,7 +2431,7 @@ function test_osd_compact() { ceph tell osd.1 compact - ceph daemon osd.1 compact + $SUDO ceph daemon osd.1 compact } function test_mds_tell_help_command() @@ -2364,7 +2460,7 @@ function test_mgr_tell() { ceph tell mgr help - ceph tell mgr fs status + #ceph tell mgr fs status # see http://tracker.ceph.com/issues/20761 ceph tell mgr osd status } @@ -2395,6 +2491,7 @@ MON_TESTS+=" mon_misc" MON_TESTS+=" mon_mon" MON_TESTS+=" mon_osd" +MON_TESTS+=" mon_config_key" MON_TESTS+=" mon_crush" MON_TESTS+=" mon_osd_create_destroy" MON_TESTS+=" mon_osd_pool" @@ -2411,6 +2508,7 @@ MON_TESTS+=" mon_caps" MON_TESTS+=" mon_cephdf_commands" MON_TESTS+=" mon_tell_help_command" +MON_TESTS+=" mon_stdin_stdout" OSD_TESTS+=" osd_bench" OSD_TESTS+=" osd_negative_filestore_merge_threshold" diff -Nru ceph-12.1.1/qa/workunits/erasure-code/encode-decode-non-regression.sh ceph-12.1.2/qa/workunits/erasure-code/encode-decode-non-regression.sh --- ceph-12.1.1/qa/workunits/erasure-code/encode-decode-non-regression.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/erasure-code/encode-decode-non-regression.sh 2017-08-01 17:55:40.000000000 +0000 @@ -14,7 +14,6 @@ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Library Public License for more details. # -source $(dirname $0)/../../../src/test/detect-build-env-vars.sh : ${CORPUS:=https://github.com/ceph/ceph-erasure-code-corpus.git} : ${DIRECTORY:=$CEPH_ROOT/ceph-erasure-code-corpus} diff -Nru ceph-12.1.1/qa/workunits/mon/auth_caps.sh ceph-12.1.2/qa/workunits/mon/auth_caps.sh --- ceph-12.1.1/qa/workunits/mon/auth_caps.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/mon/auth_caps.sh 2017-08-01 17:55:40.000000000 +0000 @@ -68,7 +68,7 @@ expect $ret ceph auth get-key client.admin $args expect $ret ceph auth export $args expect $ret ceph auth export client.admin $args - expect $ret ceph auth list $args + expect $ret ceph auth ls $args expect $ret ceph auth print-key client.admin $args expect $ret ceph auth print_key client.admin $args } diff -Nru ceph-12.1.1/qa/workunits/mon/caps.py ceph-12.1.2/qa/workunits/mon/caps.py --- ceph-12.1.1/qa/workunits/mon/caps.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/mon/caps.py 2017-08-01 17:55:40.000000000 +0000 @@ -220,7 +220,7 @@ 'auth':[ { 'pre':'', - 'cmd':('auth list', '', 'r'), + 'cmd':('auth ls', '', 'r'), 'post':'' }, { @@ -265,11 +265,11 @@ ], 'config-key':[ { - 'pre':'config-key put foo bar', + 'pre':'config-key set foo bar', 'cmd':('config-key get', 'key=foo', 'r') }, { - 'pre':'config-key put foo bar', + 'pre':'config-key set foo bar', 'cmd':('config-key del', 'key=foo', 'rw') } ] diff -Nru ceph-12.1.1/qa/workunits/mon/caps.sh ceph-12.1.2/qa/workunits/mon/caps.sh --- ceph-12.1.1/qa/workunits/mon/caps.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/mon/caps.sh 2017-08-01 17:55:40.000000000 +0000 @@ -28,10 +28,10 @@ expect "ceph -k $tmp.bazar.keyring --user bazar mon_status" 13 ceph auth del client.bazar -c="'allow command \"auth list\", allow command mon_status'" +c="'allow command \"auth ls\", allow command mon_status'" expect "ceph auth get-or-create client.foo mon $c > $tmp.foo.keyring" 0 expect "ceph -k $tmp.foo.keyring --user foo mon_status" 0 -expect "ceph -k $tmp.foo.keyring --user foo auth list" 0 +expect "ceph -k $tmp.foo.keyring --user foo auth ls" 0 expect "ceph -k $tmp.foo.keyring --user foo auth export" 13 expect 
"ceph -k $tmp.foo.keyring --user foo auth del client.bazar" 13 expect "ceph -k $tmp.foo.keyring --user foo osd dump" 13 @@ -42,7 +42,7 @@ c="'allow command service with prefix=list, allow command mon_status'" expect "ceph auth get-or-create client.bar mon $c > $tmp.bar.keyring" 0 expect "ceph -k $tmp.bar.keyring --user bar mon_status" 0 -expect "ceph -k $tmp.bar.keyring --user bar auth list" 13 +expect "ceph -k $tmp.bar.keyring --user bar auth ls" 13 expect "ceph -k $tmp.bar.keyring --user bar auth export" 13 expect "ceph -k $tmp.bar.keyring --user bar auth del client.foo" 13 expect "ceph -k $tmp.bar.keyring --user bar osd dump" 13 @@ -52,4 +52,4 @@ rm $tmp.bazar.keyring $tmp.foo.keyring $tmp.bar.keyring -echo OK \ No newline at end of file +echo OK diff -Nru ceph-12.1.1/qa/workunits/mon/crush_ops.sh ceph-12.1.2/qa/workunits/mon/crush_ops.sh --- ceph-12.1.1/qa/workunits/mon/crush_ops.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/mon/crush_ops.sh 2017-08-01 17:55:40.000000000 +0000 @@ -21,8 +21,6 @@ # make sure we're at luminous+ before using crush device classes ceph osd require-osd-release luminous -ceph osd crush class create ssd -ceph osd crush class create hdd ceph osd crush set-device-class ssd osd.0 ceph osd crush set-device-class hdd osd.1 ceph osd crush rule create-replicated foo-ssd default host ssd @@ -39,7 +37,9 @@ ceph osd crush rule rm bar # can't delete in-use rules, tho: +ceph osd pool create pinning_pool 1 expect_false ceph osd crush rule rm replicated_rule +ceph osd pool rm pinning_pool pinning_pool --yes-i-really-really-mean-it # build a simple map expect_false ceph osd crush add-bucket foo osd @@ -125,4 +125,33 @@ ceph osd rm osd.$o4 ceph osd rm osd.$o5 +# weight sets +# make sure we require luminous before testing weight-sets +ceph osd set-require-min-compat-client luminous +ceph osd crush weight-set dump +ceph osd crush weight-set ls +expect_false ceph osd crush weight-set reweight fooset osd.0 .9 +ceph osd pool create fooset 8 +ceph osd pool create barset 8 +ceph osd pool set barset size 3 +expect_false ceph osd crush weight-set reweight fooset osd.0 .9 +ceph osd crush weight-set create fooset flat +ceph osd crush weight-set create barset positional +ceph osd crush weight-set ls | grep fooset +ceph osd crush weight-set ls | grep barset +ceph osd crush weight-set dump +ceph osd crush weight-set reweight fooset osd.0 .9 +expect_false ceph osd crush weight-set reweight fooset osd.0 .9 .9 +expect_false ceph osd crush weight-set reweight barset osd.0 .9 +ceph osd crush weight-set reweight barset osd.0 .9 .9 .9 +ceph osd crush weight-set ls | grep -c fooset | grep -q 1 +ceph osd crush weight-set rm fooset +ceph osd crush weight-set ls | grep -c fooset | grep -q 0 +ceph osd crush weight-set ls | grep barset +ceph osd crush weight-set rm barset +ceph osd crush weight-set ls | grep -c barset | grep -q 0 +ceph osd crush weight-set create-compat +ceph osd crush weight-set ls | grep '(compat)' +ceph osd crush weight-set rm-compat + echo OK diff -Nru ceph-12.1.1/qa/workunits/mon/rbd_snaps_ops.sh ceph-12.1.2/qa/workunits/mon/rbd_snaps_ops.sh --- ceph-12.1.1/qa/workunits/mon/rbd_snaps_ops.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/mon/rbd_snaps_ops.sh 2017-08-01 17:55:40.000000000 +0000 @@ -23,11 +23,13 @@ expect 'ceph osd pool mksnap test snapshot' 0 expect 'ceph osd pool rmsnap test snapshot' 0 +expect 'rbd --pool=test pool init' 0 expect 'rbd --pool=test --rbd_validate_pool=false create --size=102400 image' 0 expect 'rbd --pool=test 
snap create image@snapshot' 22 expect 'ceph osd pool delete test test --yes-i-really-really-mean-it' 0 expect 'ceph osd pool create test 256 256' 0 +expect 'rbd --pool=test pool init' 0 expect 'rbd --pool=test create --size=102400 image' 0 expect 'rbd --pool=test snap create image@snapshot' 0 expect 'rbd --pool=test snap ls image' 0 diff -Nru ceph-12.1.1/qa/workunits/mon/workloadgen.sh ceph-12.1.2/qa/workunits/mon/workloadgen.sh --- ceph-12.1.1/qa/workunits/mon/workloadgen.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/mon/workloadgen.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,169 +0,0 @@ -#!/bin/bash -x -# vim: ts=8 sw=2 smarttab -# -# $0.sh - run mon workload generator - -d() { - [[ "$VERBOSE" != "" && $VERBOSE -eq 1 ]] && echo "## DEBUG ## $*" -} - -d "check for required binaries" - -required_bins="ceph crushtool ceph_test_mon_workloadgen" -for b in $required_bins; do - which $b >& /dev/null - if [[ $? -ne 0 ]]; then - echo "Unable to find '$b' in PATH" - exit 1 - fi -done - -d "Start workunit" - -crush_map_fn=test.crush.map -create_crush=0 -clobber_crush=0 -new_cluster=0 -do_run=0 -num_osds=0 - -# Assume the test is in PATH -bin_test=ceph_test_mon_workloadgen - -num_osds=10 -if [[ "$LOADGEN_NUM_OSDS" != "" ]]; then - num_osds=$LOADGEN_NUM_OSDS -fi - -duration=300 -[ ! -z $DURATION ] && duration=$DURATION - -d "checking osd tree" - -crush_testing_root="`ceph osd tree | grep 'root[ \t]\+testing'`" - -d "$crush_testing_root" - -if [[ "$crush_testing_root" == "" ]]; then - d "set create_crush" - create_crush=1 -fi - -d "generate run_id (create_crush = $create_crush)" - -run_id=`uuidgen` - -d "run_id = $run_id ; create_crush = $create_crush" - -if [[ $create_crush -eq 1 ]]; then - tmp_crush_fn="/tmp/ceph.$run_id.crush" - ceph osd getcrushmap -o $tmp_crush_fn - crushtool -d $tmp_crush_fn -o $tmp_crush_fn.plain - - highest_root_id=0 - root_ids_raw="`cat $tmp_crush_fn.plain | grep id`" - ifs=$IFS - IFS=$'\n' - for l in $root_ids_raw; do - root_id=`echo $l | sed 's/.*-\([[:digit:]]\+\).*/\1/'` - d "root id = $root_id ; highest = $highest_root_id" - if [[ $root_id -gt $highest_root_id ]]; then - highest_root_id=$root_id - fi - done - our_root_id=$(($highest_root_id+1)) - IFS=$ifs - - cat << EOF >> $tmp_crush_fn.plain -root testing { - id -$our_root_id - alg straw - hash 0 # rjenkins1 -} -rule testingdata { - ruleset 0 - type replicated - min_size 1 - max_size 10 - step take testing - step choose firstn 0 type osd - step emit -} -rule testingmetadata { - ruleset 1 - type replicated - min_size 1 - max_size 10 - step take testing - step choose firstn 0 type osd - step emit -} -rule testingrbd { - ruleset 2 - type replicated - min_size 1 - max_size 10 - step take testing - step choose firstn 0 type osd - step emit -} -EOF - - if [[ $VERBOSE -eq 1 ]]; then - cat $tmp_crush_fn.plain - fi - - crushtool -c $tmp_crush_fn.plain -o $tmp_crush_fn - if [[ $? 
-eq 1 ]]; then - echo "Error compiling test crush map; probably need newer crushtool" - echo "NOK" - exit 1 - fi - - d "created crush" - - ceph osd setcrushmap -i $tmp_crush_fn -fi - -keyring="/tmp/ceph.$run_id.keyring" - -ceph auth get-or-create-key osd.admin mon 'allow rwx' osd 'allow *' -ceph auth export | grep -v "export" > $keyring - -osd_ids="" - -for osd in `seq 1 $num_osds`; do - id=`ceph osd create` - osd_ids="$osd_ids $id" - d "osd.$id" - ceph osd crush set $id osd.$id 1.0 host=testhost rack=testrack root=testing -done - -d "osds: $osd_ids" - -stub_id_args="" -f= -l= -for i in $osd_ids; do - d "i: $i" - if [[ $stub_id_args == "" ]]; then - stub_id_args="--stub-id $i" - f=$i - fi - if [[ $l != "" ]]; then - if [[ $i -gt $(($l+1)) ]]; then - stub_id_args="$stub_id_args..$l --stub-id $i" - f=$i - fi - fi - l=$i -done -if [[ $l -gt $f ]]; then - stub_id_args="$stub_id_args..$l" -fi - -args="$EXTRA_ARGS --duration $duration $stub_id_args" - -d "running: $args" - -$bin_test --keyring $keyring $args diff -Nru ceph-12.1.1/qa/workunits/rados/test_alloc_hint.sh ceph-12.1.2/qa/workunits/rados/test_alloc_hint.sh --- ceph-12.1.1/qa/workunits/rados/test_alloc_hint.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rados/test_alloc_hint.sh 2017-08-01 17:55:40.000000000 +0000 @@ -109,6 +109,7 @@ POOL="alloc_hint-rep" ceph osd pool create "${POOL}" "${NUM_PG}" ceph osd pool set "${POOL}" size "${NUM_OSDS}" +ceph osd pool application enable "${POOL}" rados OBJ="foo" setup_pgid "${POOL}" "${OBJ}" @@ -156,6 +157,7 @@ ceph osd erasure-code-profile set "${PROFILE}" k=2 m=1 crush-failure-domain=osd ceph osd erasure-code-profile get "${PROFILE}" # just so it's logged ceph osd pool create "${POOL}" "${NUM_PG}" "${NUM_PGP}" erasure "${PROFILE}" +ceph osd pool application enable "${POOL}" rados OBJ="baz" setup_pgid "${POOL}" "${OBJ}" diff -Nru ceph-12.1.1/qa/workunits/rados/test_cache_pool.sh ceph-12.1.2/qa/workunits/rados/test_cache_pool.sh --- ceph-12.1.1/qa/workunits/rados/test_cache_pool.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rados/test_cache_pool.sh 2017-08-01 17:55:40.000000000 +0000 @@ -10,6 +10,7 @@ # create pools, set up tier relationship ceph osd pool create base_pool 2 +ceph osd pool application enable base_pool rados ceph osd pool create partial_wrong 2 ceph osd pool create wrong_cache 2 ceph osd tier add base_pool partial_wrong @@ -89,6 +90,7 @@ ## set of base, cache ceph osd pool create base 8 +ceph osd pool application enable base rados ceph osd pool create cache 8 ceph osd tier add base cache diff -Nru ceph-12.1.1/qa/workunits/rados/test_health_warnings.sh ceph-12.1.2/qa/workunits/rados/test_health_warnings.sh --- ceph-12.1.1/qa/workunits/rados/test_health_warnings.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rados/test_health_warnings.sh 2017-08-01 17:55:40.000000000 +0000 @@ -55,8 +55,6 @@ test_mark_two_osds_same_host_down_with_classes() { ceph osd set noup - ceph osd crush class create ssd - ceph osd crush class create hdd ceph osd crush set-device-class ssd osd.0 osd.2 osd.4 osd.6 osd.8 ceph osd crush set-device-class hdd osd.1 osd.3 osd.5 osd.7 osd.9 ceph osd down osd.0 osd.1 diff -Nru ceph-12.1.1/qa/workunits/rados/test_pool_quota.sh ceph-12.1.2/qa/workunits/rados/test_pool_quota.sh --- ceph-12.1.1/qa/workunits/rados/test_pool_quota.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rados/test_pool_quota.sh 2017-08-01 17:55:40.000000000 +0000 @@ -5,6 +5,7 @@ # objects ceph osd pool create $p 12 ceph 
osd pool set-quota $p max_objects 10 +ceph osd pool application enable $p rados for f in `seq 1 10` ; do rados -p $p put obj$f /etc/passwd @@ -41,6 +42,7 @@ pp=`uuidgen` ceph osd pool create $pp 12 +ceph osd pool application enable $pp rados # set objects quota ceph osd pool set-quota $pp max_objects 10 diff -Nru ceph-12.1.1/qa/workunits/rbd/cli_generic.sh ceph-12.1.2/qa/workunits/rbd/cli_generic.sh --- ceph-12.1.1/qa/workunits/rbd/cli_generic.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rbd/cli_generic.sh 2017-08-01 17:55:40.000000000 +0000 @@ -290,6 +290,7 @@ ceph osd pool delete test test --yes-i-really-really-mean-it || true ceph osd pool create test 100 + rbd pool init test truncate -s 1 /tmp/empty /tmp/empty@snap rbd ls | wc -l | grep 0 diff -Nru ceph-12.1.1/qa/workunits/rbd/journal.sh ceph-12.1.2/qa/workunits/rbd/journal.sh --- ceph-12.1.1/qa/workunits/rbd/journal.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rbd/journal.sh 2017-08-01 17:55:40.000000000 +0000 @@ -1,6 +1,6 @@ #!/bin/bash -e -. $(dirname $0)/../ceph-helpers.sh +. $(dirname $0)/../../standalone/ceph-helpers.sh function list_tests() { diff -Nru ceph-12.1.1/qa/workunits/rbd/krbd_data_pool.sh ceph-12.1.2/qa/workunits/rbd/krbd_data_pool.sh --- ceph-12.1.1/qa/workunits/rbd/krbd_data_pool.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rbd/krbd_data_pool.sh 2017-08-01 17:55:40.000000000 +0000 @@ -99,11 +99,15 @@ } ceph osd pool create repdata 24 24 +rbd pool init repdata ceph osd erasure-code-profile set teuthologyprofile crush-failure-domain=osd m=1 k=2 ceph osd pool create ecdata 24 24 erasure teuthologyprofile +rbd pool init ecdata ceph osd pool set ecdata allow_ec_overwrites true ceph osd pool create rbdnonzero 24 24 +rbd pool init rbdnonzero ceph osd pool create clonesonly 24 24 +rbd pool init clonesonly for pool in rbd rbdnonzero; do rbd create --size 200 --image-format 1 $pool/img0 diff -Nru ceph-12.1.1/qa/workunits/rbd/permissions.sh ceph-12.1.2/qa/workunits/rbd/permissions.sh --- ceph-12.1.1/qa/workunits/rbd/permissions.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rbd/permissions.sh 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,9 @@ create_pools() { ceph osd pool create images 100 + rbd pool init images ceph osd pool create volumes 100 + rbd pool init volumes } delete_pools() { diff -Nru ceph-12.1.1/qa/workunits/rbd/rbd_mirror_helpers.sh ceph-12.1.2/qa/workunits/rbd/rbd_mirror_helpers.sh --- ceph-12.1.1/qa/workunits/rbd/rbd_mirror_helpers.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rbd/rbd_mirror_helpers.sh 2017-08-01 17:55:40.000000000 +0000 @@ -69,6 +69,8 @@ POOL=mirror PARENT_POOL=mirror_parent TEMPDIR= +USER_ID=mirror +export CEPH_ARGS="--id ${USER_ID}" CEPH_ROOT=$(readlink -f $(dirname $0)/../../../src) CEPH_BIN=. 
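The hunk above switches the rbd-mirror helpers from the implicit client.admin identity to a dedicated `mirror` cephx user, and the hunks that follow create that user with restricted `profile rbd` capabilities and pass `--id mirror` to the daemons. A rough sketch of the same pattern outside the test harness (the keyring path here is illustrative, not taken from this patch):

ceph auth get-or-create client.mirror mon 'profile rbd' osd 'profile rbd' \
    -o /etc/ceph/ceph.client.mirror.keyring         # restricted key for the mirror daemon
rbd --id mirror mirror pool status rbd              # run rbd commands as client.mirror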
@@ -190,8 +192,15 @@ if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then cd ${CEPH_ROOT} - ${CEPH_SRC}/mstart.sh ${CLUSTER1} -n ${RBD_MIRROR_VARGS} - ${CEPH_SRC}/mstart.sh ${CLUSTER2} -n ${RBD_MIRROR_VARGS} + CEPH_ARGS='' ${CEPH_SRC}/mstart.sh ${CLUSTER1} -n ${RBD_MIRROR_VARGS} + CEPH_ARGS='' ${CEPH_SRC}/mstart.sh ${CLUSTER2} -n ${RBD_MIRROR_VARGS} + + CEPH_ARGS='' ceph --conf run/${CLUSTER1}/ceph.conf \ + auth get-or-create client.${USER_ID} mon 'profile rbd' osd 'profile rbd' >> \ + run/${CLUSTER1}/keyring + CEPH_ARGS='' ceph --conf run/${CLUSTER2}/ceph.conf \ + auth get-or-create client.${USER_ID} mon 'profile rbd' osd 'profile rbd' >> \ + run/${CLUSTER2}/keyring rm -f ${TEMPDIR}/${CLUSTER1}.conf ln -s $(readlink -f run/${CLUSTER1}/ceph.conf) \ @@ -203,10 +212,10 @@ cd ${TEMPDIR} fi - ceph --cluster ${CLUSTER1} osd pool create ${POOL} 64 64 - ceph --cluster ${CLUSTER1} osd pool create ${PARENT_POOL} 64 64 - ceph --cluster ${CLUSTER2} osd pool create ${PARENT_POOL} 64 64 - ceph --cluster ${CLUSTER2} osd pool create ${POOL} 64 64 + CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool create ${POOL} 64 64 + CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool create ${PARENT_POOL} 64 64 + CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool create ${PARENT_POOL} 64 64 + CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool create ${POOL} 64 64 rbd --cluster ${CLUSTER1} mirror pool enable ${POOL} pool rbd --cluster ${CLUSTER2} mirror pool enable ${POOL} pool @@ -234,13 +243,13 @@ if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then cd ${CEPH_ROOT} - ${CEPH_SRC}/mstop.sh ${CLUSTER1} - ${CEPH_SRC}/mstop.sh ${CLUSTER2} + CEPH_ARGS='' ${CEPH_SRC}/mstop.sh ${CLUSTER1} + CEPH_ARGS='' ${CEPH_SRC}/mstop.sh ${CLUSTER2} else - ceph --cluster ${CLUSTER1} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it - ceph --cluster ${CLUSTER2} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it - ceph --cluster ${CLUSTER1} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it - ceph --cluster ${CLUSTER2} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it + CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it + CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it + CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it + CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it fi test "${RBD_MIRROR_TEMDIR}" = "${TEMPDIR}" || rm -Rf ${TEMPDIR} @@ -257,6 +266,7 @@ rbd-mirror \ --cluster ${cluster} \ + --id mirror \ --pid-file=$(daemon_pid_file "${cluster}:${instance}") \ --log-file=${TEMPDIR}/rbd-mirror.${cluster}_daemon.${instance}.log \ --admin-socket=${TEMPDIR}/rbd-mirror.${cluster}_daemon.${instance}.\$cluster.asok \ diff -Nru ceph-12.1.1/qa/workunits/rbd/rbd_mirror.sh ceph-12.1.2/qa/workunits/rbd/rbd_mirror.sh --- ceph-12.1.1/qa/workunits/rbd/rbd_mirror.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rbd/rbd_mirror.sh 2017-08-01 17:55:40.000000000 +0000 @@ -117,8 +117,8 @@ # demote and promote same cluster demote_image ${CLUSTER2} ${POOL} ${image} wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image} -wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped' -wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped' +wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown' +wait_for_status_in_pool_dir ${CLUSTER2} 
${POOL} ${image} 'up+unknown' promote_image ${CLUSTER2} ${POOL} ${image} wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} write_image ${CLUSTER2} ${POOL} ${image} 100 @@ -130,15 +130,16 @@ # failover (unmodified) demote_image ${CLUSTER2} ${POOL} ${image} wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image} -wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped' -wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped' +wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown' +wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' promote_image ${CLUSTER1} ${POOL} ${image} wait_for_image_replay_started ${CLUSTER2} ${POOL} ${image} # failback (unmodified) demote_image ${CLUSTER1} ${POOL} ${image} wait_for_image_replay_stopped ${CLUSTER2} ${POOL} ${image} -wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped' +wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown' +wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' promote_image ${CLUSTER2} ${POOL} ${image} wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} @@ -149,8 +150,8 @@ # failover demote_image ${CLUSTER2} ${POOL} ${image} wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image} -wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped' -wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped' +wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown' +wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' promote_image ${CLUSTER1} ${POOL} ${image} wait_for_image_replay_started ${CLUSTER2} ${POOL} ${image} write_image ${CLUSTER1} ${POOL} ${image} 100 @@ -162,7 +163,8 @@ # failback demote_image ${CLUSTER1} ${POOL} ${image} wait_for_image_replay_stopped ${CLUSTER2} ${POOL} ${image} -wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped' +wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown' +wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' promote_image ${CLUSTER2} ${POOL} ${image} wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} write_image ${CLUSTER2} ${POOL} ${image} 100 @@ -378,7 +380,7 @@ create_image ${CLUSTER2} ${POOL} ${image} wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position' demote_image ${CLUSTER2} ${POOL} ${image} -wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped' +wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown' promote_image ${CLUSTER1} ${POOL} ${image} write_image ${CLUSTER1} ${POOL} ${image} 10 demote_image ${CLUSTER1} ${POOL} ${image} @@ -388,7 +390,7 @@ wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position' testlog "TEST: no blacklists" -ceph --cluster ${CLUSTER1} osd blacklist ls 2>&1 | grep -q "listed 0 entries" -ceph --cluster ${CLUSTER2} osd blacklist ls 2>&1 | grep -q "listed 0 entries" +CEPH_ARGS='--id admin' ceph --cluster ${CLUSTER1} osd blacklist ls 2>&1 | grep -q "listed 0 entries" +CEPH_ARGS='--id admin' ceph --cluster ${CLUSTER2} osd blacklist ls 2>&1 | grep -q "listed 0 entries" echo OK diff -Nru ceph-12.1.1/qa/workunits/rbd/rbd-nbd.sh ceph-12.1.2/qa/workunits/rbd/rbd-nbd.sh --- ceph-12.1.1/qa/workunits/rbd/rbd-nbd.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rbd/rbd-nbd.sh 2017-08-01 17:55:40.000000000 +0000 @@ -1,6 +1,6 @@ #!/bin/bash -ex -. 
$(dirname $0)/../ceph-helpers.sh +. $(dirname $0)/../../standalone/ceph-helpers.sh POOL=rbd IMAGE=testrbdnbd$$ diff -Nru ceph-12.1.1/qa/workunits/rbd/run_devstack_tempest.sh ceph-12.1.2/qa/workunits/rbd/run_devstack_tempest.sh --- ceph-12.1.1/qa/workunits/rbd/run_devstack_tempest.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rbd/run_devstack_tempest.sh 2017-08-01 17:55:40.000000000 +0000 @@ -108,6 +108,11 @@ chmod 0755 ${STACK_HOME_PATH}/start.sh sudo -H -u ${STACK_USER} ${STACK_HOME_PATH}/start.sh +# switch to rbd profile caps +ceph auth caps client.cinder mon 'profile rbd' osd 'profile rbd pool=volumes, profile rbd pool=vms, profile rbd pool=images' +ceph auth caps client.cinder-bak mon 'profile rbd' osd 'profile rbd pool=backups, profile rbd pool=volumes' +ceph auth caps client.glance mon 'profile rbd' osd 'profile rbd pool=images' + # execute tempest chown -R ${TEMPEST_USER}:${STACK_GROUP} ${STACK_OPT_PATH}/tempest chown -R ${TEMPEST_USER}:${STACK_GROUP} ${STACK_OPT_PATH}/data/tempest diff -Nru ceph-12.1.1/qa/workunits/rbd/test_admin_socket.sh ceph-12.1.2/qa/workunits/rbd/test_admin_socket.sh --- ceph-12.1.1/qa/workunits/rbd/test_admin_socket.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rbd/test_admin_socket.sh 2017-08-01 17:55:40.000000000 +0000 @@ -4,7 +4,7 @@ mkdir $TMPDIR trap "rm -fr $TMPDIR" 0 -. $(dirname $0)/../ceph-helpers.sh +. $(dirname $0)/../../standalone/ceph-helpers.sh function expect_false() { diff -Nru ceph-12.1.1/qa/workunits/rbd/verify_pool.sh ceph-12.1.2/qa/workunits/rbd/verify_pool.sh --- ceph-12.1.1/qa/workunits/rbd/verify_pool.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/qa/workunits/rbd/verify_pool.sh 2017-08-01 17:55:40.000000000 +0000 @@ -11,6 +11,7 @@ tear_down ceph osd pool create $POOL_NAME $PG_NUM ceph osd pool mksnap $POOL_NAME snap + rbd pool init $POOL_NAME } trap tear_down EXIT HUP INT diff -Nru ceph-12.1.1/README ceph-12.1.2/README --- ceph-12.1.1/README 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/README 2017-08-01 17:55:40.000000000 +0000 @@ -48,7 +48,7 @@ Note that these instructions are meant for developers who are compiling the code for development and testing. To build binaries suitable for installation we recommend you build deb or rpm packages, -or refer to the ceph.spec.in or debian/rules to see which +or refer to the `ceph.spec.in` or `debian/rules` to see which configuration options are specified for production builds. Prerequisite: CMake 2.8.11 @@ -60,7 +60,7 @@ make This assumes you make your build dir a subdirectory of the ceph.git -checkout. If you put it elsewhere, just replace .. in do_cmake.sh with a +checkout. If you put it elsewhere, just replace `..` in do_cmake.sh with a correct path to the checkout. To build only certain targets use: @@ -175,7 +175,7 @@ ctest -V -R [regex matching test name(s)] To run an tests manually and run the jobs in parallel, run `ctest` with -the -j flag: +the `-j` flag: ctest -j [number of jobs] @@ -190,14 +190,14 @@ ### Prerequisites The list of package dependencies for building the documentation can be -found in doc_deps.deb.txt: +found in `doc_deps.deb.txt`: sudo apt-get install `cat doc_deps.deb.txt` ### Building the Documentation To build the documentation, ensure that you are in the top-level -`/ceph directory, and execute the build script. For example: +`/ceph` directory, and execute the build script. 
For example: admin/build-doc diff -Nru ceph-12.1.1/README.md ceph-12.1.2/README.md --- ceph-12.1.1/README.md 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/README.md 2017-08-01 17:55:40.000000000 +0000 @@ -48,7 +48,7 @@ Note that these instructions are meant for developers who are compiling the code for development and testing. To build binaries suitable for installation we recommend you build deb or rpm packages, -or refer to the ceph.spec.in or debian/rules to see which +or refer to the `ceph.spec.in` or `debian/rules` to see which configuration options are specified for production builds. Prerequisite: CMake 2.8.11 @@ -60,7 +60,7 @@ make This assumes you make your build dir a subdirectory of the ceph.git -checkout. If you put it elsewhere, just replace .. in do_cmake.sh with a +checkout. If you put it elsewhere, just replace `..` in do_cmake.sh with a correct path to the checkout. To build only certain targets use: @@ -175,7 +175,7 @@ ctest -V -R [regex matching test name(s)] To run an tests manually and run the jobs in parallel, run `ctest` with -the -j flag: +the `-j` flag: ctest -j [number of jobs] @@ -190,14 +190,14 @@ ### Prerequisites The list of package dependencies for building the documentation can be -found in doc_deps.deb.txt: +found in `doc_deps.deb.txt`: sudo apt-get install `cat doc_deps.deb.txt` ### Building the Documentation To build the documentation, ensure that you are in the top-level -`/ceph directory, and execute the build script. For example: +`/ceph` directory, and execute the build script. For example: admin/build-doc diff -Nru ceph-12.1.1/run-make-check.sh ceph-12.1.2/run-make-check.sh --- ceph-12.1.1/run-make-check.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/run-make-check.sh 2017-08-01 17:55:40.000000000 +0000 @@ -71,7 +71,10 @@ $DRY_RUN ./do_cmake.sh $@ || return 1 $DRY_RUN cd build $DRY_RUN make $BUILD_MAKEOPTS tests || return 1 - $DRY_RUN ctest $CHECK_MAKEOPTS --output-on-failure || return 1 + if ! $DRY_RUN ctest $CHECK_MAKEOPTS --output-on-failure; then + rm -f ${TMPDIR:-/tmp}/ceph-asok.* + return 1 + fi } function main() { diff -Nru ceph-12.1.1/src/auth/AuthSessionHandler.cc ceph-12.1.2/src/auth/AuthSessionHandler.cc --- ceph-12.1.1/src/auth/AuthSessionHandler.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/auth/AuthSessionHandler.cc 2017-08-01 17:55:40.000000000 +0000 @@ -38,14 +38,3 @@ } return NULL; } - - -void AuthSessionHandler::print_auth_session_handler_stats() { - ldout(cct,10) << "Auth Session Handler Stats " << this << dendl; - ldout(cct,10) << " Messages Signed = " << messages_signed << dendl; - ldout(cct,10) << " Signatures Checked = " << signatures_checked << dendl; - ldout(cct,10) << " Signatures Matched = " << signatures_matched << dendl; - ldout(cct,10) << " Signatures Did Not Match = " << signatures_failed << dendl; - ldout(cct,10) << " Messages Encrypted = " << messages_encrypted << dendl; - ldout(cct,10) << " Messages Decrypted = " << messages_decrypted << dendl; -} diff -Nru ceph-12.1.1/src/auth/AuthSessionHandler.h ceph-12.1.2/src/auth/AuthSessionHandler.h --- ceph-12.1.1/src/auth/AuthSessionHandler.h 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/auth/AuthSessionHandler.h 2017-08-01 17:55:40.000000000 +0000 @@ -34,26 +34,12 @@ CryptoKey key; public: - // Keep stats on how many messages were signed, how many messages were encrypted, how many - // signatures were properly checked, and how many messages were decrypted. 
PLR - int messages_signed; - int signatures_checked; - int signatures_matched; - int signatures_failed; - int messages_encrypted; - int messages_decrypted; - - explicit AuthSessionHandler(CephContext *cct_) : cct(cct_), protocol(CEPH_AUTH_UNKNOWN), messages_signed(0), - signatures_checked(0), signatures_matched(0), signatures_failed(0), messages_encrypted(0), - messages_decrypted(0) {} + explicit AuthSessionHandler(CephContext *cct_) : cct(cct_), protocol(CEPH_AUTH_UNKNOWN) {} AuthSessionHandler(CephContext *cct_, int protocol_, CryptoKey key_) : cct(cct_), - protocol(protocol_), key(key_), messages_signed(0), signatures_checked(0), signatures_matched(0), - signatures_failed(0), messages_encrypted(0), messages_decrypted(0) {} + protocol(protocol_), key(key_) {} virtual ~AuthSessionHandler() { } - void print_auth_session_handler_stats() ; - virtual bool no_security() = 0; virtual int sign_message(Message *message) = 0; virtual int check_message_signature(Message *message) = 0; diff -Nru ceph-12.1.1/src/auth/cephx/CephxSessionHandler.cc ceph-12.1.2/src/auth/cephx/CephxSessionHandler.cc --- ceph-12.1.1/src/auth/cephx/CephxSessionHandler.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/auth/cephx/CephxSessionHandler.cc 2017-08-01 17:55:40.000000000 +0000 @@ -81,7 +81,6 @@ ceph_msg_footer& f = m->get_footer(); f.sig = sig; f.flags = (unsigned)f.flags | CEPH_MSG_FOOTER_SIGNED; - messages_signed++; ldout(cct, 20) << "Putting signature in client message(seq # " << m->get_seq() << "): sig = " << sig << dendl; return 0; @@ -103,8 +102,6 @@ if (r < 0) return r; - signatures_checked++; - if (sig != m->get_footer().sig) { // Should have been signed, but signature check failed. PLR if (!(m->get_footer().flags & CEPH_MSG_FOOTER_SIGNED)) { @@ -122,14 +119,10 @@ // security failure, particularly when there are large numbers of // them, since the latter is a potential sign of an attack. PLR - signatures_failed++; ldout(cct, 0) << "Signature failed." << dendl; return (SESSION_SIGNATURE_FAILURE); } - // If we get here, the signature checked. PLR - signatures_matched++; - return 0; } diff -Nru ceph-12.1.1/src/brag/client/ceph-brag ceph-12.1.2/src/brag/client/ceph-brag --- ceph-12.1.1/src/brag/client/ceph-brag 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/brag/client/ceph-brag 2017-08-01 17:55:40.000000000 +0000 @@ -226,10 +226,10 @@ if rc: #uuid is not yet set. 
uid = str(uuid.uuid4()) - (rc, o, e) = run_command(['ceph', 'config-key', 'put', + (rc, o, e) = run_command(['ceph', 'config-key', 'set', CLUSTER_UUID_NAME, uid]) if rc: - raise RuntimeError("\'ceph config-key put\' failed -" + e) + raise RuntimeError("\'ceph config-key set\' failed -" + e) return uid diff -Nru ceph-12.1.1/src/ceph-disk/ceph_disk/main.py ceph-12.1.2/src/ceph-disk/ceph_disk/main.py --- ceph-12.1.1/src/ceph-disk/ceph_disk/main.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/ceph-disk/ceph_disk/main.py 2017-08-01 17:55:40.000000000 +0000 @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# Copyright (C) 2015, 2016 Red Hat +# Copyright (C) 2015, 2016, 2017 Red Hat # Copyright (C) 2014 Inktank # Copyright (C) 2014 Cloudwatt # Copyright (C) 2014 Catalyst.net Ltd @@ -36,6 +36,7 @@ import uuid import time import shlex +import shutil import pwd import grp import textwrap @@ -214,11 +215,13 @@ PROCDIR = '/compat/linux/proc' # FreeBSD does not have blockdevices any more BLOCKDIR = '/dev' + ROOTGROUP = 'wheel' else: FREEBSD = False DEFAULT_FS_TYPE = 'xfs' PROCDIR = '/proc' BLOCKDIR = '/sys/block' + ROOTGROUP = 'root' """ OSD STATUS Definition @@ -478,6 +481,26 @@ return _bytes2str(out), _bytes2str(err), process.returncode +def command_with_stdin(arguments, stdin): + LOG.info("Running command with stdin: " + " ".join(arguments)) + process = subprocess.Popen( + arguments, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = process.communicate(stdin) + LOG.debug(out) + if process.returncode != 0: + LOG.error(err) + raise SystemExit( + "'{cmd}' failed with status code {returncode}".format( + cmd=arguments, + returncode=process.returncode, + ) + ) + return out + + def _bytes2str(string): return string.decode('utf-8') if isinstance(string, bytes) else string @@ -1034,30 +1057,50 @@ cluster, fsid, keyring, + path, ): """ - Accocates an OSD id on the given cluster. + Allocates an OSD id on the given cluster. :raises: Error if the call to allocate the OSD id fails. :return: The allocated OSD id. 
""" + lockbox_path = os.path.join(STATEDIR, 'osd-lockbox', fsid) + lockbox_osd_id = read_one_line(lockbox_path, 'whoami') + osd_keyring = os.path.join(path, 'keyring') + if lockbox_osd_id: + LOG.debug('Getting OSD id from Lockbox...') + osd_id = lockbox_osd_id + shutil.move(os.path.join(lockbox_path, 'osd_keyring'), + osd_keyring) + path_set_context(osd_keyring) + os.unlink(os.path.join(lockbox_path, 'whoami')) + return osd_id LOG.debug('Allocating OSD id...') + secrets = Secrets() try: - osd_id = _check_output( - args=[ + wanttobe = read_one_line(path, 'wanttobe') + if os.path.exists(os.path.join(path, 'wanttobe')): + os.unlink(os.path.join(path, 'wanttobe')) + id_arg = wanttobe and [wanttobe] or [] + osd_id = command_with_stdin( + [ 'ceph', '--cluster', cluster, '--name', 'client.bootstrap-osd', '--keyring', keyring, - 'osd', 'create', '--concise', + '-i', '-', + 'osd', 'new', fsid, - ], + ] + id_arg, + secrets.get_json() ) except subprocess.CalledProcessError as e: raise Error('ceph osd create failed', e, e.output) osd_id = must_be_one_line(osd_id) check_osd_id(osd_id) + secrets.write_osd_keyring(osd_keyring, osd_id) return osd_id @@ -1311,27 +1354,15 @@ rawdev, ] + cryptsetup_parameters - def run(args, stdin): - LOG.info(" ".join(args)) - process = subprocess.Popen( - args, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - out, err = process.communicate(stdin) - LOG.debug(out) - LOG.error(err) - assert process.returncode == 0 - try: if luks: if format_dev: - run(luksFormat_args, key) - run(luksOpen_args, key) + command_with_stdin(luksFormat_args, key) + command_with_stdin(luksOpen_args, key) else: # Plain mode has no format function, nor any validation # that the key is correct. - run(create_args, key) + command_with_stdin(create_args, key) # set proper ownership of mapped device command_check_call(['chown', 'ceph:ceph', dev]) return dev @@ -1944,6 +1975,11 @@ help='unique OSD uuid to assign this disk to', ) parser.add_argument( + '--osd-id', + metavar='ID', + help='unique OSD id to assign this disk to', + ) + parser.add_argument( '--crush-device-class', help='crush device class to assign this disk to', ) @@ -2590,6 +2626,60 @@ return None +class Secrets(object): + + def __init__(self): + secret, stderr, ret = command(['ceph-authtool', '--gen-print-key']) + LOG.debug("stderr " + stderr) + assert ret == 0 + self.keys = { + 'cephx_secret': secret.strip(), + } + + def write_osd_keyring(self, keyring, osd_id): + command_check_call( + [ + 'ceph-authtool', keyring, + '--create-keyring', + '--name', 'osd.' + str(osd_id), + '--add-key', self.keys['cephx_secret'], + ]) + path_set_context(keyring) + + def get_json(self): + return bytearray(json.dumps(self.keys), 'ascii') + + +class LockboxSecrets(Secrets): + + def __init__(self, args): + super(LockboxSecrets, self).__init__() + + key_size = CryptHelpers.get_dmcrypt_keysize(args) + key = open('/dev/urandom', 'rb').read(key_size / 8) + base64_key = base64.b64encode(key).decode('ascii') + + secret, stderr, ret = command(['ceph-authtool', '--gen-print-key']) + LOG.debug("stderr " + stderr) + assert ret == 0 + + self.keys.update({ + 'dmcrypt_key': base64_key, + 'cephx_lockbox_secret': secret.strip(), + }) + + def write_lockbox_keyring(self, path, osd_uuid): + keyring = os.path.join(path, 'keyring') + command_check_call( + [ + 'ceph-authtool', keyring, + '--create-keyring', + '--name', 'client.osd-lockbox.' 
+ osd_uuid, + '--add-key', self.keys['cephx_lockbox_secret'], + ]) + path_set_context(keyring) + + class Lockbox(object): def __init__(self, args): @@ -2619,7 +2709,7 @@ def create_partition(self): self.device = Device.factory(self.args.lockbox, argparse.Namespace()) - partition_number = 3 + partition_number = 5 self.device.create_partition(uuid=self.args.lockbox_uuid, name='lockbox', num=partition_number, @@ -2643,42 +2733,28 @@ self.partition = self.create_partition() def create_key(self): - key_size = CryptHelpers.get_dmcrypt_keysize(self.args) - key = open('/dev/urandom', 'rb').read(key_size / 8) - base64_key = base64.b64encode(key) cluster = self.args.cluster bootstrap = self.args.prepare_key_template.format(cluster=cluster, statedir=STATEDIR) - command_check_call( - [ - 'ceph', - '--cluster', cluster, - '--name', 'client.bootstrap-osd', - '--keyring', bootstrap, - 'config-key', - 'put', - 'dm-crypt/osd/' + self.args.osd_uuid + '/luks', - base64_key, - ], - ) - keyring, stderr, ret = command( + path = self.get_mount_point() + secrets = LockboxSecrets(self.args) + id_arg = self.args.osd_id and [self.args.osd_id] or [] + osd_id = command_with_stdin( [ 'ceph', '--cluster', cluster, '--name', 'client.bootstrap-osd', '--keyring', bootstrap, - 'auth', - 'get-or-create', - 'client.osd-lockbox.' + self.args.osd_uuid, - 'mon', - ('allow command "config-key get" with key="dm-crypt/osd/' + - self.args.osd_uuid + '/luks"'), - ], + '-i', '-', + 'osd', 'new', self.args.osd_uuid, + ] + id_arg, + secrets.get_json() ) - LOG.debug("stderr " + stderr) - assert ret == 0 - path = self.get_mount_point() - open(os.path.join(path, 'keyring'), 'w').write(keyring) + secrets.write_lockbox_keyring(path, self.args.osd_uuid) + osd_id = must_be_one_line(osd_id) + check_osd_id(osd_id) + write_one_line(path, 'whoami', osd_id) + secrets.write_osd_keyring(os.path.join(path, 'osd_keyring'), osd_id) write_one_line(path, 'key-management-mode', KEY_MANAGEMENT_MODE_V1) def symlink_spaces(self, path): @@ -2839,6 +2915,8 @@ write_one_line(path, 'ceph_fsid', self.args.cluster_uuid) write_one_line(path, 'fsid', self.args.osd_uuid) + if self.args.osd_id: + write_one_line(path, 'wanttobe', self.args.osd_id) if self.args.crush_device_class: write_one_line(path, 'crush_device_class', self.args.crush_device_class) @@ -3034,36 +3112,6 @@ write_one_line(path, 'type', 'bluestore') -# -# Temporary workaround: if ceph-osd --mkfs does not -# complete within 5 minutes, assume it is blocked -# because of http://tracker.ceph.com/issues/13522 -# and retry a few times. 
-# -# Remove this function calls with command_check_call -# when http://tracker.ceph.com/issues/13522 is fixed -# -def ceph_osd_mkfs(arguments): - timeout = _get_command_executable(['timeout']) - mkfs_ok = False - error = 'unknown error' - for delay in os.environ.get('CEPH_OSD_MKFS_DELAYS', - '300 300 300 300 300').split(): - try: - _check_output(timeout + [delay] + arguments) - mkfs_ok = True - break - except subprocess.CalledProcessError as e: - error = e.output - if e.returncode == 124: # timeout fired, retry - LOG.debug('%s timed out : %s (retry)' - % (str(arguments), error)) - else: - break - if not mkfs_ok: - raise Error('%s failed : %s' % (str(arguments), error)) - - def mkfs( path, cluster, @@ -3085,34 +3133,30 @@ osd_type = read_one_line(path, 'type') if osd_type == 'bluestore': - ceph_osd_mkfs( + command_check_call( [ 'ceph-osd', '--cluster', cluster, '--mkfs', - '--mkkey', '-i', osd_id, '--monmap', monmap, '--osd-data', path, '--osd-uuid', fsid, - '--keyring', os.path.join(path, 'keyring'), '--setuser', get_ceph_user(), '--setgroup', get_ceph_group(), ], ) elif osd_type == 'filestore': - ceph_osd_mkfs( + command_check_call( [ 'ceph-osd', '--cluster', cluster, '--mkfs', - '--mkkey', '-i', osd_id, '--monmap', monmap, '--osd-data', path, '--osd-journal', os.path.join(path, 'journal'), '--osd-uuid', fsid, - '--keyring', os.path.join(path, 'keyring'), '--setuser', get_ceph_user(), '--setgroup', get_ceph_group(), ], @@ -3121,45 +3165,6 @@ raise Error('unrecognized objectstore type %s' % osd_type) -def auth_key( - path, - cluster, - osd_id, - keyring, -): - try: - # try dumpling+ cap scheme - command_check_call( - [ - 'ceph', - '--cluster', cluster, - '--name', 'client.bootstrap-osd', - '--keyring', keyring, - 'auth', 'add', 'osd.{osd_id}'.format(osd_id=osd_id), - '-i', os.path.join(path, 'keyring'), - 'osd', 'allow *', - 'mon', 'allow profile osd', - ], - ) - except subprocess.CalledProcessError as err: - if err.returncode == errno.EINVAL: - # try old cap scheme - command_check_call( - [ - 'ceph', - '--cluster', cluster, - '--name', 'client.bootstrap-osd', - '--keyring', keyring, - 'auth', 'add', 'osd.{osd_id}'.format(osd_id=osd_id), - '-i', os.path.join(path, 'keyring'), - 'osd', 'allow *', - 'mon', 'allow rwx', - ], - ) - else: - raise - - def get_mount_point(cluster, osd_id): parent = STATEDIR + '/osd' return os.path.join( @@ -3673,6 +3678,7 @@ cluster=cluster, fsid=fsid, keyring=keyring, + path=path, ) write_one_line(path, 'whoami', osd_id) LOG.debug('OSD id is %s', osd_id) @@ -3713,13 +3719,6 @@ pass if not os.path.exists(os.path.join(path, 'active')): - LOG.debug('Authorizing OSD key...') - auth_key( - path=path, - cluster=cluster, - osd_id=osd_id, - keyring=keyring, - ) write_one_line(path, 'active', 'ok') LOG.debug('%s osd.%s data dir is ready at %s', cluster, osd_id, path) return (osd_id, cluster) @@ -3958,52 +3957,7 @@ ########################### -def _remove_from_crush_map(cluster, osd_id): - LOG.info("Prepare to remove osd.%s from crush map..." % osd_id) - command([ - 'ceph', - 'osd', - 'crush', - 'remove', - 'osd.%s' % osd_id, - ]) - - -def _delete_osd_auth_key(cluster, osd_id): - LOG.info("Prepare to delete osd.%s cephx key..." % osd_id) - command([ - 'ceph', - 'auth', - 'del', - 'osd.%s' % osd_id, - ]) - - -def _deallocate_osd_id(cluster, osd_id): - LOG.info("Prepare to deallocate the osd-id: %s..." 
% osd_id) - command([ - 'ceph', - 'osd', - 'rm', - '%s' % osd_id, - ]) - - def _remove_lockbox(uuid, cluster): - command([ - 'ceph', - '--cluster', cluster, - 'auth', - 'del', - 'client.osd-lockbox.' + uuid, - ]) - command([ - 'ceph', - '--cluster', cluster, - 'config-key', - 'del', - 'dm-crypt/osd/' + uuid + '/luks', - ]) lockbox = os.path.join(STATEDIR, 'osd-lockbox') if not os.path.exists(lockbox): return @@ -4083,14 +4037,18 @@ raise Error("Could not destroy the active osd. (osd-id: %s)" % osd_id) - # Remove OSD from crush map - _remove_from_crush_map(args.cluster, osd_id) - - # Remove OSD cephx key - _delete_osd_auth_key(args.cluster, osd_id) - - # Deallocate OSD ID - _deallocate_osd_id(args.cluster, osd_id) + if args.purge: + action = 'purge' + else: + action = 'destroy' + LOG.info("Prepare to %s osd.%s" % (action, osd_id)) + command([ + 'ceph', + 'osd', + action, + 'osd.%s' % osd_id, + '--yes-i-really-mean-it', + ]) # we remove the crypt map and device mapper (if dmcrypt is True) if dmcrypt: @@ -4111,7 +4069,7 @@ if not os.path.exists(path): raise Error('%s does not exist' % path) - if path_is_diskdevice(path): + if not path_is_diskdevice(path): raise Error('%s is not a block device' % path) if (is_partition(path) and @@ -4144,6 +4102,10 @@ if not os.path.exists(args.dev): raise Error('%s does not exist' % args.dev) + if is_suppressed(args.dev): + LOG.info('suppressed activate request on space %s', args.dev) + return + cluster = None osd_id = None osd_uuid = None @@ -4715,7 +4677,7 @@ disk = os.path.realpath(path) if not os.path.exists(disk): raise Error('does not exist', path) - if ldev_is_diskdevice(path): + if not ldev_is_diskdevice(path): raise Error('not a block device', path) base = get_dev_name(disk) @@ -4897,11 +4859,11 @@ def main_fix(args): # A hash table containing 'path': ('uid', 'gid', blocking, recursive) fix_table = [ - ('/usr/bin/ceph-mon', 'root', 'root', True, False), - ('/usr/bin/ceph-mds', 'root', 'root', True, False), - ('/usr/bin/ceph-osd', 'root', 'root', True, False), - ('/usr/bin/radosgw', 'root', 'root', True, False), - ('/etc/ceph', 'root', 'root', True, True), + ('/usr/bin/ceph-mon', 'root', ROOTGROUP, True, False), + ('/usr/bin/ceph-mds', 'root', ROOTGROUP, True, False), + ('/usr/bin/ceph-osd', 'root', ROOTGROUP, True, False), + ('/usr/bin/radosgw', 'root', ROOTGROUP, True, False), + ('/etc/ceph', 'root', ROOTGROUP, True, True), ('/var/run/ceph', 'ceph', 'ceph', True, True), ('/var/log/ceph', 'ceph', 'ceph', True, True), ('/var/log/radosgw', 'ceph', 'ceph', True, True), @@ -5594,12 +5556,13 @@ destroy_parser = subparsers.add_parser( 'destroy', formatter_class=argparse.RawDescriptionHelpFormatter, - description=textwrap.fill(textwrap.dedent("""\ - Destroy the OSD located at PATH. - It removes the OSD from the cluster, the crushmap and - deallocates the OSD id. An OSD must be down before it - can be destroyed. - """)), + description=textwrap.fill(textwrap.dedent("""\ Destroy the OSD located at PATH. It removes the OSD from the + cluster and marks it destroyed. An OSD must be down before it + can be destroyed. Once it is destroyed, a new OSD can be created + in its place, reusing the same OSD id and position (e.g. after + a failed HDD or SSD is replaced). 
Alternatively, if the + --purge option is also specified, the OSD is removed from the + CRUSH map and the OSD id is deallocated.""")), help='Destroy a Ceph OSD') destroy_parser.add_argument( '--cluster', @@ -5631,6 +5594,11 @@ action='store_true', default=False, help='option to erase data and partition', ) + destroy_parser.add_argument( + '--purge', + action='store_true', default=False, + help='option to remove OSD from CRUSH map and deallocate the id', + ) destroy_parser.set_defaults( func=main_destroy, ) diff -Nru ceph-12.1.1/src/ceph-disk/tests/ceph-disk.sh ceph-12.1.2/src/ceph-disk/tests/ceph-disk.sh --- ceph-12.1.1/src/ceph-disk/tests/ceph-disk.sh 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/ceph-disk/tests/ceph-disk.sh 2017-08-01 17:55:40.000000000 +0000 @@ -1,7 +1,7 @@ #!/bin/bash # # Copyright (C) 2014 Cloudwatt -# Copyright (C) 2014, 2015, 2016 Red Hat +# Copyright (C) 2014, 2015, 2016, 2017 Red Hat # # Author: Loic Dachary # @@ -25,7 +25,7 @@ CEPH_BIN=$CEPH_ROOT CEPH_LIB=$CEPH_ROOT/.libs fi -source $CEPH_ROOT/qa/workunits/ceph-helpers.sh +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh set -x @@ -37,7 +37,7 @@ CEPH_DISK_ARGS= CEPH_DISK_ARGS+=" --verbose" CEPH_DISK_ARGS+=" --prepend-to-path=" -TIMEOUT=360 +: ${CEPH_DISK_TIMEOUT:=360} if [ `uname` != FreeBSD ]; then PROCDIR="" else @@ -45,16 +45,22 @@ fi cat=$(which cat) -timeout=$(which timeout) diff=$(which diff) mkdir=$(which mkdir) rm=$(which rm) uuidgen=$(which uuidgen) +if [ `uname` = FreeBSD ]; then + # for unknown reasons FreeBSD timeout does not return sometimes + timeout="" +else + timeout="$(which timeout) $CEPH_DISK_TIMEOUT" +fi function setup() { local dir=$1 teardown $dir mkdir -p $dir/osd + mkdir -p $(get_asok_dir) touch $dir/ceph.conf # so ceph-disk think ceph is the cluster } @@ -73,6 +79,7 @@ umount $mounted done rm -fr $dir + rm -rf $(get_asok_dir) } function command_fixture() { @@ -164,6 +171,7 @@ shift run_mon $dir a + create_rbd_pool local osd_data=$dir/dir $mkdir -p $osd_data @@ -189,7 +197,7 @@ else expected=systemd fi - $timeout $TIMEOUT ${CEPH_DISK} $CEPH_DISK_ARGS \ + $timeout ${CEPH_DISK} $CEPH_DISK_ARGS \ --verbose \ activate \ --mark-init=$expected \ @@ -242,61 +250,105 @@ grep --quiet $uuid $osd_data/ceph_fsid || return 1 } -function test_pool_read_write() { - local osd_uuid=$1 - local TEST_POOL=rbd +function read_write() { + local dir=$1 + local file=${2:-$(uuidgen)} + local pool=rbd + + echo FOO > $dir/$file + $timeout rados --pool $pool put $file $dir/$file || return 1 + $timeout rados --pool $pool get $file $dir/$file.copy || return 1 + $diff $dir/$file $dir/$file.copy || return 1 +} - $timeout $TIMEOUT ceph osd pool set $TEST_POOL size 1 || return 1 +function test_pool_read_write() { + local dir=$1 + local pool=rbd - local id=$(ceph osd create $osd_uuid) - local weight=1 - ceph osd crush add osd.$id $weight root=default host=localhost || return 1 - echo FOO > $dir/BAR - $timeout $TIMEOUT rados --pool $TEST_POOL put BAR $dir/BAR || return 1 - $timeout $TIMEOUT rados --pool $TEST_POOL get BAR $dir/BAR.copy || return 1 - $diff $dir/BAR $dir/BAR.copy || return 1 + $timeout ceph osd pool set $pool size 1 || return 1 + read_write $dir || return 1 } function test_activate() { - local to_prepare=$1 - local to_activate=$2 - local osd_uuid=$($uuidgen) - local timeoutcmd + local dir=$1 + shift + local osd_data=$1 + shift - if [ `uname` = FreeBSD ]; then - # for unknown reasons FreeBSD timeout does not return here - # So we run without timeout - timeoutcmd="" - else - timeoutcmd="${timeout} 
$TIMEOUT" - fi + mkdir -p $osd_data ${CEPH_DISK} $CEPH_DISK_ARGS \ - prepare --filestore --osd-uuid $osd_uuid $to_prepare || return 1 + prepare --filestore "$@" $osd_data || return 1 - $timeoutcmd ${CEPH_DISK} $CEPH_DISK_ARGS \ + $timeout ${CEPH_DISK} $CEPH_DISK_ARGS \ activate \ --mark-init=none \ - $to_activate || return 1 + $osd_data || return 1 - test_pool_read_write $osd_uuid || return 1 + test_pool_read_write $dir || return 1 +} + +function test_reuse_osd_id() { + local dir=$1 + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + create_rbd_pool + + test_activate $dir $dir/dir1 --osd-uuid $(uuidgen) || return 1 + + # + # add a new OSD with a given OSD id (13) + # + local osd_uuid=$($uuidgen) + local osd_id=13 + test_activate $dir $dir/dir2 --osd-id $osd_id --osd-uuid $osd_uuid || return 1 + test $osd_id = $(ceph osd new $osd_uuid) || return 1 + + # + # make sure the OSD is in use by the PGs + # + wait_osd_id_used_by_pgs $osd_id $PG_NUM || return 1 + read_write $dir SOMETHING || return 1 + + # + # set the OSD out and verify it is no longer used by the PGs + # + ceph osd out osd.$osd_id || return 1 + wait_osd_id_used_by_pgs $osd_id 0 || return 1 + + # + # kill the OSD and destroy it (do not purge, retain its place in the crushmap) + # + kill_daemons $dir TERM osd.$osd_id || return 1 + ceph osd destroy osd.$osd_id --yes-i-really-mean-it || return 1 + + # + # add a new OSD with the same id as the destroyed OSD + # + osd_uuid=$($uuidgen) + test_activate $dir $dir/dir3 --osd-id $osd_id --osd-uuid $osd_uuid || return 1 + test $osd_id = $(ceph osd new $osd_uuid) || return 1 } function test_activate_dir() { local dir=$1 shift - run_mon $dir a + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + create_rbd_pool $@ - local osd_data=$dir/dir - $mkdir -p $osd_data - test_activate $osd_data $osd_data || return 1 + test_activate $dir $dir/dir || return 1 } function test_activate_dir_bluestore() { local dir=$1 - run_mon $dir a + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + create_rbd_pool local osd_data=$dir/dir $mkdir -p $osd_data @@ -309,11 +361,12 @@ prepare --bluestore --block-file --osd-uuid $osd_uuid $to_prepare || return 1 CEPH_ARGS=" --osd-objectstore=bluestore --bluestore-fsck-on-mount=true --bluestore-block-db-size=67108864 --bluestore-block-wal-size=134217728 --bluestore-block-size=10737418240 $CEPH_ARGS" \ - $timeout $TIMEOUT ${CEPH_DISK} $CEPH_DISK_ARGS \ + $timeout ${CEPH_DISK} $CEPH_DISK_ARGS \ activate \ --mark-init=none \ $to_activate || return 1 - test_pool_read_write $osd_uuid || return 1 + + test_pool_read_write $dir || return 1 } function test_find_cluster_by_uuid() { @@ -335,38 +388,12 @@ grep --quiet "keyring $dir/bootstrap-osd/ceph.keyring" $dir/test_keyring || return 1 } -# http://tracker.ceph.com/issues/13522 -function ceph_osd_fail_once_fixture() { - local dir=$1 - local command=ceph-osd - local fpath=`readlink -f $(which $command)` - [ "$fpath" = `readlink -f $CEPH_BIN/$command` ] || [ "$fpath" = `readlink -f $(pwd)/$command` ] || return 1 - - cat > $dir/$command < # @@ -1264,9 +1264,6 @@ list_devices=list_devices_return, get_partition_base=lambda dev_path: '/dev/sdY', _check_osd_status=lambda cluster, osd_id: 0, - _remove_from_crush_map=lambda cluster, osd_id: True, - _delete_osd_auth_key=lambda cluster, osd_id: True, - _deallocate_osd_id=lambda cluster, osd_id: True, zap=lambda dev: True ): main.main_destroy(args) @@ -1287,36 +1284,6 @@ self.assertRaises(Exception, main.main_destroy, args) shutil.rmtree(data) - def 
test_remove_from_crush_map_fail(self): - cluster = 'ceph' - osd_id = '5566' - with patch.multiple( - main, - command=raise_command_error - ): - self.assertRaises(Exception, main._remove_from_crush_map, - cluster, osd_id) - - def test_delete_osd_auth_key_fail(self): - cluster = 'ceph' - osd_id = '5566' - with patch.multiple( - main, - command=raise_command_error - ): - self.assertRaises(Exception, main._delete_osd_auth_key, - cluster, osd_id) - - def test_deallocate_osd_id_fail(self): - cluster = 'ceph' - osd_id = '5566' - with patch.multiple( - main, - command=raise_command_error - ): - self.assertRaises(Exception, main._deallocate_osd_id, - cluster, osd_id) - def test_main_fix(self): if platform.system() == "FreeBSD": return diff -Nru ceph-12.1.1/src/ceph-disk/tests/test_prepare.py ceph-12.1.2/src/ceph-disk/tests/test_prepare.py --- ceph-12.1.1/src/ceph-disk/tests/test_prepare.py 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/ceph-disk/tests/test_prepare.py 2017-08-01 17:55:40.000000000 +0000 @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env python # # Copyright (C) 2015, 2016 Red Hat # @@ -423,3 +423,37 @@ set_type=set_type): data = main.PrepareData(args) assert data.args.cluster_uuid == cluster_uuid + + +class TestSecrets(Base): + + @mock.patch('ceph_disk.main.command') + def test_secrets(self, m_command): + key = "KEY" + m_command.side_effect = lambda cmd: (key + "\n", '', 0) + s = main.Secrets() + assert {"cephx_secret": key} == s.keys + assert '{"cephx_secret": "' + key + '"}' == s.get_json() + + @mock.patch('ceph_disk.main.open') + @mock.patch('ceph_disk.main.CryptHelpers.get_dmcrypt_keysize') + @mock.patch('ceph_disk.main.command') + def test_lockbox_secrets(self, + m_command, + m_get_dmcrypt_keysize, + m_open): + key = "KEY" + m_command.side_effect = lambda cmd: (key + "\n", '', 0) + m_get_dmcrypt_keysize.side_effect = lambda args: 32 + + class File: + def read(self, size): + return b'O' * size + + m_open.side_effect = lambda path, mode: File() + s = main.LockboxSecrets({}) + assert { + "dmcrypt_key": 'T09PTw==', + "cephx_secret": key, + "cephx_lockbox_secret": key, + } == s.keys diff -Nru ceph-12.1.1/src/ceph_fuse.cc ceph-12.1.2/src/ceph_fuse.cc --- ceph-12.1.1/src/ceph_fuse.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/ceph_fuse.cc 2017-08-01 17:55:40.000000000 +0000 @@ -62,9 +62,9 @@ void usage() { cout << -"usage: ceph-fuse [-m mon-ip-addr:mon-port] [OPTIONS]\n" -" --client_mountpoint/-r \n" -" use root_directory as the mounted root, rather than the full Ceph tree.\n" +"usage: ceph-fuse [-n client.username] [-m mon-ip-addr:mon-port] [OPTIONS]\n" +" --client_mountpoint/-r \n" +" use sub_directory as the mounted root, rather than the full Ceph tree.\n" "\n"; fuse_usage(); generic_client_usage(); diff -Nru ceph-12.1.1/src/ceph.in ceph-12.1.2/src/ceph.in --- ceph-12.1.1/src/ceph.in 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/ceph.in 2017-08-01 17:55:40.000000000 +0000 @@ -264,9 +264,9 @@ parser.add_argument('-c', '--conf', dest='cephconf', help='ceph configuration file') parser.add_argument('-i', '--in-file', dest='input_file', - help='input file') + help='input file, or "-" for stdin') parser.add_argument('-o', '--out-file', dest='output_file', - help='output file') + help='output file, or "-" for stdout') parser.add_argument('--id', '--user', dest='client_id', help='client id for authentication') @@ -276,8 +276,6 @@ parser.add_argument('--admin-daemon', dest='admin_socket', help='submit admin-socket commands (\"help\" for help') - 
parser.add_argument('--admin-socket', dest='admin_socket_nope', - help='you probably mean --admin-daemon') parser.add_argument('-s', '--status', action='store_true', help='show cluster status') @@ -781,7 +779,8 @@ i = sys.argv.index("injectargs") sys.argv = sys.argv[:i] + ceph_args.split() + sys.argv[i:] else: - sys.argv.extend(ceph_args.split()) + sys.argv.extend([arg for arg in ceph_args.split() + if '--admin-socket' not in arg]) parser, parsed_args, childargs = parse_cmdargs() if parsed_args.version: @@ -798,11 +797,6 @@ if verbose: print("parsed_args: {0}, childargs: {1}".format(parsed_args, childargs), file=sys.stderr) - if parsed_args.admin_socket_nope: - print('--admin-socket is used by daemons; ' - 'you probably mean --admin-daemon/daemon', file=sys.stderr) - return 1 - # pass on --id, --name, --conf name = 'client.admin' if parsed_args.client_id: @@ -971,8 +965,11 @@ inbuf = b'' if parsed_args.input_file: try: - with open(parsed_args.input_file, 'rb') as f: - inbuf = f.read() + if parsed_args.input_file == '-': + inbuf = sys.stdin.read() + else: + with open(parsed_args.input_file, 'rb') as f: + inbuf = f.read() except Exception as e: print('Can\'t open input file {0}: {1}'.format(parsed_args.input_file, e), file=sys.stderr) return 1 @@ -980,7 +977,10 @@ # prepare output file, if any if parsed_args.output_file: try: - outf = open(parsed_args.output_file, 'wb') + if parsed_args.output_file == '-': + outf = sys.stdout + else: + outf = open(parsed_args.output_file, 'wb') except Exception as e: print('Can\'t open output file {0}: {1}'.format(parsed_args.output_file, e), file=sys.stderr) return 1 @@ -1106,7 +1106,7 @@ sys.stdout.flush() - if parsed_args.output_file: + if parsed_args.output_file and parsed_args.output_file != '-': outf.close() if final_ret: diff -Nru ceph-12.1.1/src/ceph_mgr.cc ceph-12.1.2/src/ceph_mgr.cc --- ceph-12.1.1/src/ceph_mgr.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/ceph_mgr.cc 2017-08-01 17:55:40.000000000 +0000 @@ -45,7 +45,7 @@ CODE_ENVIRONMENT_DAEMON, 0, "mgr_data"); // For consumption by KeyRing::from_ceph_context in MonClient - g_conf->set_val("keyring", "$mgr_data/keyring", false); + g_conf->set_val_or_die("keyring", "$mgr_data/keyring"); // Handle --help if ((args.size() == 1 && (std::string(args[0]) == "--help" || std::string(args[0]) == "-h"))) { diff -Nru ceph-12.1.1/src/ceph_mon.cc ceph-12.1.2/src/ceph_mon.cc --- ceph-12.1.1/src/ceph_mon.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/ceph_mon.cc 2017-08-01 17:55:40.000000000 +0000 @@ -203,10 +203,10 @@ // We need to specify some default values that may be overridden by the // user, that are specific to the monitor. The options we are overriding // are also used on the OSD (or in any other component that uses leveldb), - // so changing them directly in common/config_opts.h is not an option. + // so changing the global defaults is not an option. // This is not the prettiest way of doing this, especially since it has us - // having a different place than common/config_opts.h defining default - // values, but it's not horribly wrong enough to prevent us from doing it :) + // having a different place defining default values, but it's not horribly + // wrong enough to prevent us from doing it :) // // NOTE: user-defined options will take precedence over ours. 
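The ceph_mgr.cc hunk above also swaps set_val(), whose return code was being silently dropped, for set_val_or_die(), which aborts startup if the built-in default cannot be applied. A fragment-level sketch of that seeding step, assuming g_conf is the usual global config pointer (the header that declares it is not shown in the hunk):

    #include "common/config.h"   // md_config_t; g_conf itself is assumed to come
                                 // from the usual global context header

    static void seed_mgr_defaults() {
      // keyring location later consumed by KeyRing::from_ceph_context in MonClient;
      // an unparseable default now fails fast instead of being ignored
      g_conf->set_val_or_die("keyring", "$mgr_data/keyring");
    }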
// diff -Nru ceph-12.1.1/src/ceph_syn.cc ceph-12.1.2/src/ceph_syn.cc --- ceph-12.1.1/src/ceph_syn.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/ceph_syn.cc 2017-08-01 17:55:40.000000000 +0000 @@ -57,11 +57,11 @@ list clients; list synclients; - Messenger* messengers[g_conf->num_client]; - MonClient* mclients[g_conf->num_client]; + Messenger* messengers[num_client]; + MonClient* mclients[num_client]; - cout << "ceph-syn: starting " << g_conf->num_client << " syn client(s)" << std::endl; - for (int i=0; inum_client; i++) { + cout << "ceph-syn: starting " << num_client << " syn client(s)" << std::endl; + for (int i=0; ibind(g_conf->public_addr); @@ -91,7 +91,7 @@ delete client; } - for (int i = 0; i < g_conf->num_client; ++i) { + for (int i = 0; i < num_client; ++i) { // wait for messenger to finish delete mclients[i]; messengers[i]->shutdown(); diff -Nru ceph-12.1.1/src/client/Client.cc ceph-12.1.2/src/client/Client.cc --- ceph-12.1.1/src/client/Client.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/client/Client.cc 2017-08-01 17:55:40.000000000 +0000 @@ -3178,13 +3178,12 @@ if (!waitfor_caps && !waitfor_commit) { if ((have & need) == need) { - int butnot = want & ~(have & need); int revoking = implemented & ~have; ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have) << " need " << ccap_string(need) << " want " << ccap_string(want) - << " but not " << ccap_string(butnot) << " revoking " << ccap_string(revoking) + << " revoking " << ccap_string(revoking) << dendl; - if ((revoking & butnot) == 0) { + if ((revoking & want) == 0) { *phave = need | (have & want); in->get_cap_ref(need); return 0; @@ -6466,8 +6465,13 @@ } //make new dir r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next); + //check proper creation/existence - if (r < 0) return r; + if(-EEXIST == r && i < path.depth() - 1) { + r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms); + } + if (r < 0) + return r; //move to new dir and continue cur.swap(next); ldout(cct, 20) << "mkdirs: successfully created directory " @@ -8564,17 +8568,22 @@ if (in->inline_version == 0) { int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true); - if (r < 0) + if (r < 0) { + if (movepos) + unlock_fh_pos(f); return r; + } assert(in->inline_version > 0); } retry: int have; int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1); - if (r < 0) + if (r < 0) { + if (movepos) + unlock_fh_pos(f); return r; - + } if (f->flags & O_DIRECT) have &= ~CEPH_CAP_FILE_CACHE; @@ -8676,7 +8685,12 @@ if (have) put_cap_ref(in, CEPH_CAP_FILE_RD); - return r < 0 ? r : bl->length(); + if (r < 0) { + if (movepos) + unlock_fh_pos(f); + return r; + } else + return bl->length(); } Client::C_Readahead::C_Readahead(Client *c, Fh *f) : diff -Nru ceph-12.1.1/src/client/SyntheticClient.cc ceph-12.1.2/src/client/SyntheticClient.cc --- ceph-12.1.1/src/client/SyntheticClient.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/client/SyntheticClient.cc 2017-08-01 17:55:40.000000000 +0000 @@ -48,7 +48,7 @@ //void trace_include(SyntheticClient *syn, Client *cl, string& prefix); //void trace_openssh(SyntheticClient *syn, Client *cl, string& prefix); - +int num_client = 1; list syn_modes; list syn_iargs; list syn_sargs; @@ -59,6 +59,10 @@ vector nargs; for (unsigned i=0; icct->_conf->num_client ? client->cct->_conf->num_client : 1; + int numc = num_client ? 
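Client::mkdirs() above now treats -EEXIST on an intermediate path component as "another client created it first" and falls back to _lookup() instead of failing the whole call. The same create-or-reuse idiom expressed against plain POSIX calls, as a hedged standalone analogue rather than the libcephfs code path:

    #include <sys/stat.h>
    #include <sys/types.h>
    #include <cerrno>
    #include <string>

    // create an intermediate directory, tolerating a concurrent creator
    static int ensure_dir(const std::string &path, mode_t mode) {
      if (mkdir(path.c_str(), mode) == 0)
        return 0;
      if (errno != EEXIST)
        return -errno;                    // a real error
      struct stat st;
      if (stat(path.c_str(), &st) != 0)   // EEXIST: look it up instead of failing
        return -errno;
      return S_ISDIR(st.st_mode) ? 0 : -ENOTDIR;
    }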
num_client : 1; int start, inc, end; @@ -3243,7 +3247,7 @@ if (sp < 0) dirnum++; //dout(0) << "leading dir " << filename << " " << dirnum << dendl; - if (dirnum % client->cct->_conf->num_client != client->get_nodeid()) { + if (dirnum % num_client != client->get_nodeid()) { dout(20) << "skipping leading dir " << dirnum << " " << filename << dendl; continue; } diff -Nru ceph-12.1.1/src/client/SyntheticClient.h ceph-12.1.2/src/client/SyntheticClient.h --- ceph-12.1.1/src/client/SyntheticClient.h 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/client/SyntheticClient.h 2017-08-01 17:55:40.000000000 +0000 @@ -95,6 +95,7 @@ void parse_syn_options(vector& args); +extern int num_client; class SyntheticClient { StandaloneClient *client; diff -Nru ceph-12.1.1/src/cls/journal/cls_journal.cc ceph-12.1.2/src/cls/journal/cls_journal.cc --- ceph-12.1.1/src/cls/journal/cls_journal.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/journal/cls_journal.cc 2017-08-01 17:55:40.000000000 +0000 @@ -101,13 +101,13 @@ skip_client_key = key_from_client_id(*skip_client_id); } - int r; uint64_t minimum_tag_tid = std::numeric_limits::max(); std::string last_read = HEADER_KEY_CLIENT_PREFIX; + bool more; do { std::map vals; - r = cls_cxx_map_get_vals(hctx, last_read, HEADER_KEY_CLIENT_PREFIX, - MAX_KEYS_READ, &vals); + int r = cls_cxx_map_get_vals(hctx, last_read, HEADER_KEY_CLIENT_PREFIX, + MAX_KEYS_READ, &vals, &more); if (r < 0 && r != -ENOENT) { CLS_ERR("failed to retrieve registered clients: %s", cpp_strerror(r).c_str()); @@ -137,7 +137,7 @@ if (!vals.empty()) { last_read = vals.rbegin()->first; } - } while (r == MAX_KEYS_READ); + } while (more); // cannot expire tags if a client hasn't committed yet if (minimum_tag_tid == std::numeric_limits::max()) { @@ -153,8 +153,8 @@ last_read = HEADER_KEY_TAG_PREFIX; do { std::map vals; - r = cls_cxx_map_get_vals(hctx, last_read, HEADER_KEY_TAG_PREFIX, - MAX_KEYS_READ, &vals); + int r = cls_cxx_map_get_vals(hctx, last_read, HEADER_KEY_TAG_PREFIX, + MAX_KEYS_READ, &vals, &more); if (r < 0 && r != -ENOENT) { CLS_ERR("failed to retrieve tags: %s", cpp_strerror(r).c_str()); return r; @@ -192,7 +192,7 @@ } } - if (tag_pass != TAG_PASS_DONE && vals.size() < MAX_KEYS_READ) { + if (tag_pass != TAG_PASS_DONE && !more) { last_read = HEADER_KEY_TAG_PREFIX; ++tag_pass; } else if (!vals.empty()) { @@ -211,8 +211,9 @@ } std::map vals; + bool more; int r = cls_cxx_map_get_vals(hctx, last_read, HEADER_KEY_CLIENT_PREFIX, - max_return, &vals); + max_return, &vals, &more); if (r < 0) { CLS_ERR("failed to retrieve omap values: %s", cpp_strerror(r).c_str()); return r; @@ -1022,8 +1023,9 @@ std::string last_read = HEADER_KEY_TAG_PREFIX; do { std::map vals; + bool more; r = cls_cxx_map_get_vals(hctx, last_read, HEADER_KEY_TAG_PREFIX, - MAX_KEYS_READ, &vals); + MAX_KEYS_READ, &vals, &more); if (r < 0 && r != -ENOENT) { CLS_ERR("failed to retrieve tags: %s", cpp_strerror(r).c_str()); return r; @@ -1062,7 +1064,7 @@ } } - if (tag_pass != TAG_PASS_DONE && vals.size() < MAX_KEYS_READ) { + if (tag_pass != TAG_PASS_DONE && !more) { last_read = HEADER_KEY_TAG_PREFIX; ++tag_pass; } else if (!vals.empty()) { diff -Nru ceph-12.1.1/src/cls/log/cls_log.cc ceph-12.1.2/src/cls/log/cls_log.cc --- ceph-12.1.1/src/cls/log/cls_log.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/log/cls_log.cc 2017-08-01 17:55:40.000000000 +0000 @@ -170,24 +170,22 @@ if (!max_entries || max_entries > MAX_ENTRIES) max_entries = MAX_ENTRIES; - int rc = cls_cxx_map_get_vals(hctx, from_index, 
log_index_prefix, max_entries + 1, &keys); + cls_log_list_ret ret; + + int rc = cls_cxx_map_get_vals(hctx, from_index, log_index_prefix, max_entries, &keys, &ret.truncated); if (rc < 0) return rc; - cls_log_list_ret ret; - list& entries = ret.entries; map::iterator iter = keys.begin(); - bool done = false; string marker; - size_t i; - for (i = 0; i < max_entries && iter != keys.end(); ++i, ++iter) { + for (; iter != keys.end(); ++iter) { const string& index = iter->first; marker = index; if (use_time_boundary && index.compare(0, to_index.size(), to_index) >= 0) { - done = true; + ret.truncated = false; break; } @@ -202,11 +200,9 @@ } } - if (iter == keys.end()) - done = true; - - ret.marker = marker; - ret.truncated = !done; + if (ret.truncated) { + ret.marker = marker; + } ::encode(ret, *out); @@ -244,16 +240,16 @@ #define MAX_TRIM_ENTRIES 1000 size_t max_entries = MAX_TRIM_ENTRIES; + bool more; - int rc = cls_cxx_map_get_vals(hctx, from_index, log_index_prefix, max_entries, &keys); + int rc = cls_cxx_map_get_vals(hctx, from_index, log_index_prefix, max_entries, &keys, &more); if (rc < 0) return rc; map::iterator iter = keys.begin(); - size_t i; bool removed = false; - for (i = 0; i < max_entries && iter != keys.end(); ++i, ++iter) { + for (; iter != keys.end(); ++iter) { const string& index = iter->first; CLS_LOG(20, "index=%s to_index=%s", index.c_str(), to_index.c_str()); diff -Nru ceph-12.1.1/src/cls/lua/cls_lua.cc ceph-12.1.2/src/cls/lua/cls_lua.cc --- ceph-12.1.1/src/cls/lua/cls_lua.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/lua/cls_lua.cc 2017-08-01 17:55:40.000000000 +0000 @@ -429,7 +429,8 @@ int max_to_get = luaL_checkinteger(L, 2); std::set keys; - int ret = cls_cxx_map_get_keys(hctx, start_after, max_to_get, &keys); + bool more; + int ret = cls_cxx_map_get_keys(hctx, start_after, max_to_get, &keys, &more); if (ret < 0) return clslua_opresult(L, 0, ret, 0); @@ -456,8 +457,9 @@ int max_to_get = luaL_checkinteger(L, 3); map kvpairs; + bool more; int ret = cls_cxx_map_get_vals(hctx, start_after, filter_prefix, - max_to_get, &kvpairs); + max_to_get, &kvpairs, &more); if (ret < 0) return clslua_opresult(L, 0, ret, 0); diff -Nru ceph-12.1.1/src/cls/rbd/cls_rbd.cc ceph-12.1.2/src/cls/rbd/cls_rbd.cc --- ceph-12.1.1/src/cls/rbd/cls_rbd.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/rbd/cls_rbd.cc 2017-08-01 17:55:40.000000000 +0000 @@ -1087,10 +1087,11 @@ int max_read = RBD_MAX_KEYS_READ; vector snap_ids; string last_read = RBD_SNAP_KEY_PREFIX; + bool more; do { set keys; - r = cls_cxx_map_get_keys(hctx, last_read, max_read, &keys); + r = cls_cxx_map_get_keys(hctx, last_read, max_read, &keys, &more); if (r < 0) { return r; } @@ -1125,7 +1126,7 @@ if (!keys.empty()) { last_read = *(keys.rbegin()); } - } while (r == max_read); + } while (more); } cls_rbd_parent parent; @@ -1370,10 +1371,11 @@ int max_read = RBD_MAX_KEYS_READ; vector snap_ids; string last_read = RBD_SNAP_KEY_PREFIX; + bool more; do { set keys; - r = cls_cxx_map_get_keys(hctx, last_read, max_read, &keys); + r = cls_cxx_map_get_keys(hctx, last_read, max_read, &keys, &more); if (r < 0) return r; @@ -1386,7 +1388,7 @@ } if (!keys.empty()) last_read = *(keys.rbegin()); - } while (r == max_read); + } while (more); uint64_t snap_seq; r = read_key(hctx, "snap_seq", &snap_seq); @@ -1631,10 +1633,11 @@ int max_read = RBD_MAX_KEYS_READ; uint64_t total_read = 0; string last_read = RBD_SNAP_KEY_PREFIX; + bool more; do { map vals; r = cls_cxx_map_get_vals(hctx, last_read, RBD_SNAP_KEY_PREFIX, - 
max_read, &vals); + max_read, &vals, &more); if (r < 0) return r; @@ -1668,7 +1671,7 @@ if (!vals.empty()) last_read = vals.rbegin()->first; - } while (r == RBD_MAX_KEYS_READ); + } while (more); // snapshot inherits parent, if any cls_rbd_parent parent; @@ -1729,10 +1732,11 @@ int max_read = RBD_MAX_KEYS_READ; string last_read = RBD_SNAP_KEY_PREFIX; + bool more; do { map vals; r = cls_cxx_map_get_vals(hctx, last_read, RBD_SNAP_KEY_PREFIX, - max_read, &vals); + max_read, &vals, &more); if (r < 0) return r; @@ -1754,7 +1758,7 @@ } if (!vals.empty()) last_read = vals.rbegin()->first; - } while (r == RBD_MAX_KEYS_READ); + } while (more); key_from_snap_id(src_snap_id, &src_snap_key); r = read_key(hctx, src_snap_key, &snap_meta); @@ -2164,15 +2168,15 @@ } int max_read = RBD_MAX_KEYS_READ; - int r = max_read; map images; string last_read = dir_key_for_name(start_after); + bool more = true; - while (r == max_read && images.size() < max_return) { + while (more && images.size() < max_return) { map vals; CLS_LOG(20, "last_read = '%s'", last_read.c_str()); - r = cls_cxx_map_get_vals(hctx, last_read, RBD_DIR_NAME_KEY_PREFIX, - max_read, &vals); + int r = cls_cxx_map_get_vals(hctx, last_read, RBD_DIR_NAME_KEY_PREFIX, + max_read, &vals, &more); if (r < 0) { CLS_ERR("error reading directory by name: %s", cpp_strerror(r).c_str()); return r; @@ -2648,11 +2652,12 @@ map data; string last_read = metadata_key_for_name(start_after); int max_read = max_return ? MIN(RBD_MAX_KEYS_READ, max_return) : RBD_MAX_KEYS_READ; + bool more; do { map raw_data; int r = cls_cxx_map_get_vals(hctx, last_read, RBD_METADATA_KEY_PREFIX, - max_read, &raw_data); + max_read, &raw_data, &more); if (r < 0) { CLS_ERR("failed to read the vals off of disk: %s", cpp_strerror(r).c_str()); return r; @@ -2664,13 +2669,13 @@ for (; it != raw_data.end(); ++it) data[metadata_name_from_key(it->first)].swap(it->second); - if (r < max_read) + if (!more) break; last_read = raw_data.rbegin()->first; if (max_return) max_read = MIN(RBD_MAX_KEYS_READ, max_return - data.size()); - } while (max_read); + } while (more); ::encode(data, *out); return 0; @@ -3166,11 +3171,11 @@ std::vector *peers) { std::string last_read = PEER_KEY_PREFIX; int max_read = RBD_MAX_KEYS_READ; - int r = max_read; - while (r == max_read) { + bool more = true; + while (more) { std::map vals; - r = cls_cxx_map_get_vals(hctx, last_read, PEER_KEY_PREFIX.c_str(), - max_read, &vals); + int r = cls_cxx_map_get_vals(hctx, last_read, PEER_KEY_PREFIX.c_str(), + max_read, &vals, &more); if (r < 0) { CLS_ERR("error reading peers: %s", cpp_strerror(r).c_str()); return r; @@ -3462,13 +3467,13 @@ map *mirror_statuses) { std::string last_read = image_key(start_after); int max_read = RBD_MAX_KEYS_READ; - int r = max_read; + bool more = true; - while (r == max_read && mirror_images->size() < max_return) { + while (more && mirror_images->size() < max_return) { std::map vals; CLS_LOG(20, "last_read = '%s'", last_read.c_str()); - r = cls_cxx_map_get_vals(hctx, last_read, IMAGE_KEY_PREFIX, max_read, - &vals); + int r = cls_cxx_map_get_vals(hctx, last_read, IMAGE_KEY_PREFIX, max_read, + &vals, &more); if (r < 0) { CLS_ERR("error reading mirror image directory by name: %s", cpp_strerror(r).c_str()); @@ -3526,11 +3531,11 @@ string last_read = IMAGE_KEY_PREFIX; int max_read = RBD_MAX_KEYS_READ; - r = max_read; - while (r == max_read) { + bool more = true; + while (more) { map vals; r = cls_cxx_map_get_vals(hctx, last_read, IMAGE_KEY_PREFIX, - max_read, &vals); + max_read, &vals, &more); if (r < 0) { 
CLS_ERR("error reading mirrored images: %s", cpp_strerror(r).c_str()); return r; @@ -3586,11 +3591,11 @@ string last_read = STATUS_GLOBAL_KEY_PREFIX; int max_read = RBD_MAX_KEYS_READ; - r = max_read; - while (r == max_read) { + bool more = true; + while (more) { map vals; r = cls_cxx_map_get_vals(hctx, last_read, STATUS_GLOBAL_KEY_PREFIX, - max_read, &vals); + max_read, &vals, &more); if (r < 0) { CLS_ERR("error reading mirrored images: %s", cpp_strerror(r).c_str()); return r; @@ -3638,11 +3643,11 @@ std::vector *instance_ids) { std::string last_read = INSTANCE_KEY_PREFIX; int max_read = RBD_MAX_KEYS_READ; - int r = max_read; - while (r == max_read) { + bool more = true; + while (more) { std::map vals; - r = cls_cxx_map_get_vals(hctx, last_read, INSTANCE_KEY_PREFIX.c_str(), - max_read, &vals); + int r = cls_cxx_map_get_vals(hctx, last_read, INSTANCE_KEY_PREFIX.c_str(), + max_read, &vals, &more); if (r < 0) { if (r != -ENOENT) { CLS_ERR("error reading mirror instances: %s", cpp_strerror(r).c_str()); @@ -4049,15 +4054,15 @@ } int max_read = RBD_MAX_KEYS_READ; - int r = max_read; + bool more = true; std::map mirror_images; std::string last_read = mirror::image_key(start_after); - while (r == max_read && mirror_images.size() < max_return) { + while (more && mirror_images.size() < max_return) { std::map vals; CLS_LOG(20, "last_read = '%s'", last_read.c_str()); - r = cls_cxx_map_get_vals(hctx, last_read, mirror::IMAGE_KEY_PREFIX, - max_read, &vals); + int r = cls_cxx_map_get_vals(hctx, last_read, mirror::IMAGE_KEY_PREFIX, + max_read, &vals, &more); if (r < 0) { CLS_ERR("error reading mirror image directory by name: %s", cpp_strerror(r).c_str()); @@ -4469,15 +4474,15 @@ } int max_read = RBD_MAX_KEYS_READ; - int r = max_read; + bool more = true; map groups; string last_read = dir_key_for_name(start_after); - while (r == max_read && groups.size() < max_return) { + while (more && groups.size() < max_return) { map vals; CLS_LOG(20, "last_read = '%s'", last_read.c_str()); - r = cls_cxx_map_get_vals(hctx, last_read, RBD_DIR_NAME_KEY_PREFIX, - max_read, &vals); + int r = cls_cxx_map_get_vals(hctx, last_read, RBD_DIR_NAME_KEY_PREFIX, + max_read, &vals, &more); if (r < 0) { CLS_ERR("error reading directory by name: %s", cpp_strerror(r).c_str()); return r; @@ -4723,12 +4728,12 @@ std::map vals; string last_read = start_after.image_key(); std::vector res; - int keys_read; + bool more; do { - keys_read = cls_cxx_map_get_vals(hctx, last_read,cls::rbd::RBD_GROUP_IMAGE_KEY_PREFIX, - max_read, &vals); - if (keys_read < 0) - return keys_read; + int r = cls_cxx_map_get_vals(hctx, last_read,cls::rbd::RBD_GROUP_IMAGE_KEY_PREFIX, + max_read, &vals, &more); + if (r < 0) + return r; for (map::iterator it = vals.begin(); it != vals.end() && res.size() < max_return; ++it) { @@ -4755,7 +4760,7 @@ last_read = res.rbegin()->spec.image_key(); } - } while ((keys_read == RBD_MAX_KEYS_READ) && (res.size() < max_return)); + } while (more && (res.size() < max_return)); ::encode(res, *out); return 0; @@ -5016,6 +5021,11 @@ * Returns the list of trash spec entries registered in the rbd_trash * object. 
* + * Input: + * @param start_after which name to begin listing after + * (use the empty string to start at the beginning) + * @param max_return the maximum number of names to list + * * Output: * @param data the map between image id and trash spec info * @@ -5023,18 +5033,31 @@ */ int trash_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out) { + string start_after; + uint64_t max_return; + + try { + bufferlist::iterator iter = in->begin(); + ::decode(start_after, iter); + ::decode(max_return, iter); + } catch (const buffer::error &err) { + return -EINVAL; + } + map data; - string last_read = trash::image_key(""); - int max_read = RBD_MAX_KEYS_READ; + string last_read = trash::image_key(start_after); + bool more = true; CLS_LOG(20, "trash_get_images"); - - do { + while (data.size() < max_return) { map raw_data; + int max_read = std::min(RBD_MAX_KEYS_READ, + max_return - data.size()); int r = cls_cxx_map_get_vals(hctx, last_read, trash::IMAGE_KEY_PREFIX, - max_read, &raw_data); + max_read, &raw_data, &more); if (r < 0) { - CLS_ERR("failed to read the vals off of disk: %s", cpp_strerror(r).c_str()); + CLS_ERR("failed to read the vals off of disk: %s", + cpp_strerror(r).c_str()); return r; } if (raw_data.empty()) { @@ -5046,15 +5069,14 @@ ::decode(data[trash::image_id_from_key(it->first)], it->second); } - if (r < max_read) { + if (!more) { break; } last_read = raw_data.rbegin()->first; - } while (max_read); + } ::encode(data, *out); - return 0; } diff -Nru ceph-12.1.1/src/cls/rbd/cls_rbd_client.cc ceph-12.1.2/src/cls/rbd/cls_rbd_client.cc --- ceph-12.1.1/src/cls/rbd/cls_rbd_client.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/rbd/cls_rbd_client.cc 2017-08-01 17:55:40.000000000 +0000 @@ -2046,9 +2046,12 @@ return ioctx->operate(RBD_TRASH, &op); } - void trash_list_start(librados::ObjectReadOperation *op) + void trash_list_start(librados::ObjectReadOperation *op, + const std::string &start, uint64_t max_return) { bufferlist bl; + ::encode(start, bl); + ::encode(max_return, bl); op->exec("rbd", "trash_list", bl); } @@ -2067,10 +2070,11 @@ } int trash_list(librados::IoCtx *ioctx, + const std::string &start, uint64_t max_return, map *entries) { librados::ObjectReadOperation op; - trash_list_start(&op); + trash_list_start(&op, start, max_return); bufferlist out_bl; int r = ioctx->operate(RBD_TRASH, &op, &out_bl); diff -Nru ceph-12.1.1/src/cls/rbd/cls_rbd_client.h ceph-12.1.2/src/cls/rbd/cls_rbd_client.h --- ceph-12.1.1/src/cls/rbd/cls_rbd_client.h 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/rbd/cls_rbd_client.h 2017-08-01 17:55:40.000000000 +0000 @@ -418,10 +418,12 @@ void trash_remove(librados::ObjectWriteOperation *op, const std::string &id); int trash_remove(librados::IoCtx *ioctx, const std::string &id); - void trash_list_start(librados::ObjectReadOperation *op); + void trash_list_start(librados::ObjectReadOperation *op, + const std::string &start, uint64_t max_return); int trash_list_finish(bufferlist::iterator *it, map *entries); int trash_list(librados::IoCtx *ioctx, + const std::string &start, uint64_t max_return, map *entries); void trash_get_start(librados::ObjectReadOperation *op, const std::string &id); diff -Nru ceph-12.1.1/src/cls/refcount/cls_refcount.cc ceph-12.1.2/src/cls/refcount/cls_refcount.cc --- ceph-12.1.1/src/cls/refcount/cls_refcount.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/refcount/cls_refcount.cc 2017-08-01 17:55:40.000000000 +0000 @@ -16,18 +16,23 @@ struct obj_refcount { map refs; + set retired_refs; 
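On the client side, trash_list_start()/trash_list() above now encode a start_after key and a max_return count, so callers are expected to page. A sketch of the paging caller, assuming the librbd::cls_client namespace and the cls::rbd::TrashImageSpec value type (neither is visible in the hunk) and an io_ctx already opened on the pool that holds RBD_TRASH:

    std::string start;                     // empty string starts at the beginning
    for (;;) {
      std::map<std::string, cls::rbd::TrashImageSpec> page;
      int r = librbd::cls_client::trash_list(&io_ctx, start, 512, &page);
      if (r < 0)
        return r;
      if (page.empty())
        break;                             // nothing left in the trash directory
      for (auto &it : page) {
        // it.first is the image id, it.second the trash metadata
      }
      start = page.rbegin()->first;        // start_after is exclusive, so resume here
    }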
obj_refcount() {} void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(refs, bl); + ::encode(retired_refs, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator& bl) { - DECODE_START(1, bl); + DECODE_START(2, bl); ::decode(refs, bl); + if (struct_v >= 2) { + ::decode(retired_refs, bl); + } DECODE_FINISH(bl); } }; @@ -60,12 +65,9 @@ return 0; } -static int set_refcount(cls_method_context_t hctx, map& refs) +static int set_refcount(cls_method_context_t hctx, const struct obj_refcount& objr) { bufferlist bl; - struct obj_refcount objr; - - objr.refs = refs; ::encode(objr, bl); @@ -97,7 +99,7 @@ objr.refs[op.tag] = true; - ret = set_refcount(hctx, objr.refs); + ret = set_refcount(hctx, objr); if (ret < 0) return ret; @@ -139,16 +141,18 @@ } } - if (!found) + if (!found || + objr.retired_refs.find(op.tag) != objr.retired_refs.end()) return 0; + objr.retired_refs.insert(op.tag); objr.refs.erase(iter); if (objr.refs.empty()) { return cls_cxx_remove(hctx); } - ret = set_refcount(hctx, objr.refs); + ret = set_refcount(hctx, objr); if (ret < 0) return ret; @@ -177,7 +181,7 @@ objr.refs[*iter] = true; } - int ret = set_refcount(hctx, objr.refs); + int ret = set_refcount(hctx, objr); if (ret < 0) return ret; diff -Nru ceph-12.1.1/src/cls/rgw/cls_rgw.cc ceph-12.1.2/src/cls/rgw/cls_rgw.cc --- ceph-12.1.1/src/cls/rgw/cls_rgw.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/rgw/cls_rgw.cc 2017-08-01 17:55:40.000000000 +0000 @@ -145,9 +145,9 @@ * read list of objects, skips objects in the ugly namespace */ static int get_obj_vals(cls_method_context_t hctx, const string& start, const string& filter_prefix, - int num_entries, map *pkeys) + int num_entries, map *pkeys, bool *pmore) { - int ret = cls_cxx_map_get_vals(hctx, start, filter_prefix, num_entries, pkeys); + int ret = cls_cxx_map_get_vals(hctx, start, filter_prefix, num_entries, pkeys, pmore); if (ret < 0) return ret; @@ -183,7 +183,7 @@ string new_start = c; /* now get some more keys */ - ret = cls_cxx_map_get_vals(hctx, new_start, filter_prefix, num_entries - pkeys->size(), &new_keys); + ret = cls_cxx_map_get_vals(hctx, new_start, filter_prefix, num_entries - pkeys->size(), &new_keys, pmore); if (ret < 0) return ret; @@ -405,10 +405,11 @@ string start_key; encode_list_index_key(hctx, op.start_obj, &start_key); bool done = false; - uint32_t left_to_read = op.num_entries + 1; + uint32_t left_to_read = op.num_entries; + bool more; do { - rc = get_obj_vals(hctx, start_key, op.filter_prefix, left_to_read, &keys); + rc = get_obj_vals(hctx, start_key, op.filter_prefix, left_to_read, &keys, &more); if (rc < 0) return rc; @@ -458,8 +459,7 @@ } } while (left_to_read > 0 && !done); - ret.is_truncated = (left_to_read == 0) && /* we found more entries than we were requested, meaning response is truncated */ - !done; + ret.is_truncated = more && !done; ::encode(ret, *out); return 0; @@ -482,9 +482,10 @@ #define CHECK_CHUNK_SIZE 1000 bool done = false; + bool more; do { - rc = get_obj_vals(hctx, start_obj, filter_prefix, CHECK_CHUNK_SIZE, &keys); + rc = get_obj_vals(hctx, start_obj, filter_prefix, CHECK_CHUNK_SIZE, &keys, &more); if (rc < 0) return rc; @@ -689,7 +690,7 @@ return rc; } - if (op.log_op) { + if (op.log_op && !header.syncstopped) { rc = log_index_operation(hctx, op.key, op.op, op.tag, entry.meta.mtime, entry.ver, info.state, header.ver, header.max_marker, op.bilog_flags, NULL, NULL, &op.zones_trace); if (rc < 0) @@ -846,7 +847,7 @@ bufferlist op_bl; if (cancel) { - if (op.log_op) { + if 
(op.log_op && !header.syncstopped) { rc = log_index_operation(hctx, op.key, op.op, op.tag, entry.meta.mtime, entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, op.bilog_flags, NULL, NULL, &op.zones_trace); if (rc < 0) @@ -908,7 +909,7 @@ break; } - if (op.log_op) { + if (op.log_op && !header.syncstopped) { rc = log_index_operation(hctx, op.key, op.op, op.tag, entry.meta.mtime, entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, op.bilog_flags, NULL, NULL, &op.zones_trace); if (rc < 0) @@ -933,7 +934,7 @@ remove_entry.key.name.c_str(), remove_entry.key.instance.c_str(), remove_entry.meta.category); unaccount_entry(header, remove_entry); - if (op.log_op) { + if (op.log_op && !header.syncstopped) { rc = log_index_operation(hctx, remove_key, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime, remove_entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, op.bilog_flags, NULL, NULL, &op.zones_trace); if (rc < 0) @@ -1157,9 +1158,10 @@ get_list_index_key(instance_entry, &list_idx); /* this is the current head, need to update! */ map keys; + bool more; string filter = key.name; /* list key starts with key name, filter it to avoid a case where we cross to different namespace */ - int ret = cls_cxx_map_get_vals(hctx, list_idx, filter, 1, &keys); + int ret = cls_cxx_map_get_vals(hctx, list_idx, filter, 1, &keys, &more); if (ret < 0) { return ret; } @@ -1520,7 +1522,7 @@ return ret; } - if (op.log_op) { + if (op.log_op && !header.syncstopped) { rgw_bucket_dir_entry& entry = obj.get_dir_entry(); rgw_bucket_entry_ver ver; @@ -1676,7 +1678,7 @@ return ret; } - if (op.log_op) { + if (op.log_op && !header.syncstopped) { rgw_bucket_entry_ver ver; ver.epoch = (op.olh_epoch ? op.olh_epoch : olh.get_epoch()); @@ -1944,7 +1946,7 @@ ret = cls_cxx_map_remove_key(hctx, cur_change_key); if (ret < 0) return ret; - if (log_op && cur_disk.exists) { + if (log_op && cur_disk.exists && !header.syncstopped) { ret = log_index_operation(hctx, cur_disk.key, CLS_RGW_OP_DEL, cur_disk.tag, cur_disk.meta.mtime, cur_disk.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, 0, NULL, NULL, NULL); if (ret < 0) { @@ -1967,7 +1969,7 @@ ret = cls_cxx_map_set_val(hctx, cur_change_key, &cur_state_bl); if (ret < 0) return ret; - if (log_op) { + if (log_op && !header.syncstopped) { ret = log_index_operation(hctx, cur_change.key, CLS_RGW_OP_ADD, cur_change.tag, cur_change.meta.mtime, cur_change.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, 0, NULL, NULL, NULL); if (ret < 0) { @@ -2260,7 +2262,7 @@ } static int list_plain_entries(cls_method_context_t hctx, const string& name, const string& marker, uint32_t max, - list *entries) + list *entries, bool *pmore) { string filter = name; string start_key = marker; @@ -2270,59 +2272,52 @@ int count = 0; map keys; - do { - if (count >= (int)max) { + int ret = cls_cxx_map_get_vals(hctx, start_key, filter, max, &keys, pmore); + if (ret < 0) { + return ret; + } + + map::iterator iter; + for (iter = keys.begin(); iter != keys.end(); ++iter) { + if (iter->first >= end_key) { + /* past the end of plain namespace */ return count; } - keys.clear(); -#define BI_GET_NUM_KEYS 128 - int ret = cls_cxx_map_get_vals(hctx, start_key, filter, BI_GET_NUM_KEYS, &keys); - if (ret < 0) { - return ret; - } - - map::iterator iter; - for (iter = keys.begin(); iter != keys.end(); ++iter) { - if (iter->first >= end_key) { - /* past the end of plain namespace */ - return count; - } - rgw_cls_bi_entry entry; - entry.type = PlainIdx; - entry.idx = iter->first; - 
entry.data = iter->second; + rgw_cls_bi_entry entry; + entry.type = PlainIdx; + entry.idx = iter->first; + entry.data = iter->second; - bufferlist::iterator biter = entry.data.begin(); + bufferlist::iterator biter = entry.data.begin(); - rgw_bucket_dir_entry e; - try { - ::decode(e, biter); - } catch (buffer::error& err) { - CLS_LOG(0, "ERROR: %s(): failed to decode buffer", __func__); - return -EIO; - } + rgw_bucket_dir_entry e; + try { + ::decode(e, biter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: %s(): failed to decode buffer", __func__); + return -EIO; + } - CLS_LOG(20, "%s(): entry.idx=%s e.key.name=%s", __func__, escape_str(entry.idx).c_str(), escape_str(e.key.name).c_str()); + CLS_LOG(20, "%s(): entry.idx=%s e.key.name=%s", __func__, escape_str(entry.idx).c_str(), escape_str(e.key.name).c_str()); - if (!name.empty() && e.key.name != name) { - return count; - } + if (!name.empty() && e.key.name != name) { + return count; + } - entries->push_back(entry); - count++; - if (count >= (int)max) { - return count; - } - start_key = entry.idx; + entries->push_back(entry); + count++; + if (count >= (int)max) { + return count; } - } while (!keys.empty()); + start_key = entry.idx; + } return count; } static int list_instance_entries(cls_method_context_t hctx, const string& name, const string& marker, uint32_t max, - list *entries) + list *entries, bool *pmore) { cls_rgw_obj_key key(name); string first_instance_idx; @@ -2341,66 +2336,63 @@ } int count = 0; map keys; - bool started = true; - do { - if (count >= (int)max) { - return count; - } - keys.clear(); -#define BI_GET_NUM_KEYS 128 - int ret; - if (started) { - ret = cls_cxx_map_get_val(hctx, start_key, &keys[start_key]); - if (ret == -ENOENT) { - ret = cls_cxx_map_get_vals(hctx, start_key, string(), BI_GET_NUM_KEYS, &keys); - } - started = false; - } else { - ret = cls_cxx_map_get_vals(hctx, start_key, string(), BI_GET_NUM_KEYS, &keys); - } + bufferlist k; + int ret = cls_cxx_map_get_val(hctx, start_key, &k); + if (ret < 0 && ret != -ENOENT) { + return ret; + } + bool found_first = (ret == 0); + if (found_first) { + --max; + } + if (max > 0) { + ret = cls_cxx_map_get_vals(hctx, start_key, string(), max, &keys, pmore); CLS_LOG(20, "%s(): start_key=%s first_instance_idx=%s keys.size()=%d", __func__, escape_str(start_key).c_str(), escape_str(first_instance_idx).c_str(), (int)keys.size()); if (ret < 0) { return ret; } + } + if (found_first) { + keys[start_key].claim(k); + } - map::iterator iter; - for (iter = keys.begin(); iter != keys.end(); ++iter) { - rgw_cls_bi_entry entry; - entry.type = InstanceIdx; - entry.idx = iter->first; - entry.data = iter->second; - - if (!filter.empty() && entry.idx.compare(0, filter.size(), filter) != 0) { - return count; - } + map::iterator iter; + for (iter = keys.begin(); iter != keys.end(); ++iter) { + rgw_cls_bi_entry entry; + entry.type = InstanceIdx; + entry.idx = iter->first; + entry.data = iter->second; - CLS_LOG(20, "%s(): entry.idx=%s", __func__, escape_str(entry.idx).c_str()); + if (!filter.empty() && entry.idx.compare(0, filter.size(), filter) != 0) { + return count; + } - bufferlist::iterator biter = entry.data.begin(); + CLS_LOG(20, "%s(): entry.idx=%s", __func__, escape_str(entry.idx).c_str()); - rgw_bucket_dir_entry e; - try { - ::decode(e, biter); - } catch (buffer::error& err) { - CLS_LOG(0, "ERROR: %s(): failed to decode buffer (size=%d)", __func__, entry.data.length()); - return -EIO; - } + bufferlist::iterator biter = entry.data.begin(); - if (!name.empty() && e.key.name != name) 
{ - return count; - } + rgw_bucket_dir_entry e; + try { + ::decode(e, biter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: %s(): failed to decode buffer (size=%d)", __func__, entry.data.length()); + return -EIO; + } - entries->push_back(entry); - count++; - start_key = entry.idx; + if (!name.empty() && e.key.name != name) { + return count; } - } while (!keys.empty()); + + entries->push_back(entry); + count++; + start_key = entry.idx; + } return count; } static int list_olh_entries(cls_method_context_t hctx, const string& name, const string& marker, uint32_t max, - list *entries) + list *entries, bool *pmore) { cls_rgw_obj_key key(name); string first_instance_idx; @@ -2419,60 +2411,59 @@ } int count = 0; map keys; - bool started = true; - do { - if (count >= (int)max) { - return count; - } - keys.clear(); -#define BI_GET_NUM_KEYS 128 - int ret; - if (started) { - ret = cls_cxx_map_get_val(hctx, start_key, &keys[start_key]); - if (ret == -ENOENT) { - ret = cls_cxx_map_get_vals(hctx, start_key, string(), BI_GET_NUM_KEYS, &keys); - } - started = false; - } else { - ret = cls_cxx_map_get_vals(hctx, start_key, string(), BI_GET_NUM_KEYS, &keys); - } + int ret; + bufferlist k; + ret = cls_cxx_map_get_val(hctx, start_key, &k); + if (ret < 0 && ret != -ENOENT) { + return ret; + } + bool found_first = (ret == 0); + if (found_first) { + --max; + } + if (max > 0) { + ret = cls_cxx_map_get_vals(hctx, start_key, string(), max, &keys, pmore); CLS_LOG(20, "%s(): start_key=%s first_instance_idx=%s keys.size()=%d", __func__, escape_str(start_key).c_str(), escape_str(first_instance_idx).c_str(), (int)keys.size()); if (ret < 0) { return ret; } + } - map::iterator iter; - for (iter = keys.begin(); iter != keys.end(); ++iter) { - rgw_cls_bi_entry entry; - entry.type = OLHIdx; - entry.idx = iter->first; - entry.data = iter->second; + if (found_first) { + keys[start_key].claim(k); + } - if (!filter.empty() && entry.idx.compare(0, filter.size(), filter) != 0) { - return count; - } + map::iterator iter; + for (iter = keys.begin(); iter != keys.end(); ++iter) { + rgw_cls_bi_entry entry; + entry.type = OLHIdx; + entry.idx = iter->first; + entry.data = iter->second; - CLS_LOG(20, "%s(): entry.idx=%s", __func__, escape_str(entry.idx).c_str()); + if (!filter.empty() && entry.idx.compare(0, filter.size(), filter) != 0) { + return count; + } - bufferlist::iterator biter = entry.data.begin(); + CLS_LOG(20, "%s(): entry.idx=%s", __func__, escape_str(entry.idx).c_str()); - rgw_bucket_olh_entry e; - try { - ::decode(e, biter); - } catch (buffer::error& err) { - CLS_LOG(0, "ERROR: %s(): failed to decode buffer (size=%d)", __func__, entry.data.length()); - return -EIO; - } + bufferlist::iterator biter = entry.data.begin(); - if (!name.empty() && e.key.name != name) { - return count; - } + rgw_bucket_olh_entry e; + try { + ::decode(e, biter); + } catch (buffer::error& err) { + CLS_LOG(0, "ERROR: %s(): failed to decode buffer (size=%d)", __func__, entry.data.length()); + return -EIO; + } - entries->push_back(entry); - count++; - start_key = entry.idx; + if (!name.empty() && e.key.name != name) { + return count; } - } while (!keys.empty()); + + entries->push_back(entry); + count++; + start_key = entry.idx; + } return count; } @@ -2493,9 +2484,10 @@ string filter = op.name; #define MAX_BI_LIST_ENTRIES 1000 - int32_t max = (op.max < MAX_BI_LIST_ENTRIES ? op.max : MAX_BI_LIST_ENTRIES) + 1; /* one extra entry for identifying truncation */ + int32_t max = (op.max < MAX_BI_LIST_ENTRIES ? 
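list_instance_entries() and list_olh_entries() above replace the old restart-prone loop with a single pass: probe the exact start key first (the marker may itself be a live key), reserve one slot for it, page the remainder, then splice the probed value back in with bufferlist::claim() so nothing is copied. Condensed to its shape, with the same objclass calls and placeholder locals:

    bufferlist first_val;
    int r = cls_cxx_map_get_val(hctx, start_key, &first_val);
    if (r < 0 && r != -ENOENT)
      return r;
    bool found_first = (r == 0);
    if (found_first)
      --max;                                   // reserve a slot for the exact match
    std::map<std::string, bufferlist> keys;
    bool more = false;
    if (max > 0) {
      r = cls_cxx_map_get_vals(hctx, start_key, std::string(), max, &keys, &more);
      if (r < 0)
        return r;
    }
    if (found_first)
      keys[start_key].claim(first_val);        // claim() moves the buffer, no copy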
op.max : MAX_BI_LIST_ENTRIES); string start_key = op.marker; - int ret = list_plain_entries(hctx, op.name, op.marker, max, &op_ret.entries); + bool more; + int ret = list_plain_entries(hctx, op.name, op.marker, max, &op_ret.entries, &more); if (ret < 0) { CLS_LOG(0, "ERROR: %s(): list_plain_entries retured ret=%d", __func__, ret); return ret; @@ -2504,23 +2496,27 @@ CLS_LOG(20, "found %d plain entries", count); - ret = list_instance_entries(hctx, op.name, op.marker, max - count, &op_ret.entries); - if (ret < 0) { - CLS_LOG(0, "ERROR: %s(): list_instance_entries retured ret=%d", __func__, ret); - return ret; + if (!more) { + ret = list_instance_entries(hctx, op.name, op.marker, max - count, &op_ret.entries, &more); + if (ret < 0) { + CLS_LOG(0, "ERROR: %s(): list_instance_entries retured ret=%d", __func__, ret); + return ret; + } + + count += ret; } - count += ret; + if (!more) { + ret = list_olh_entries(hctx, op.name, op.marker, max - count, &op_ret.entries, &more); + if (ret < 0) { + CLS_LOG(0, "ERROR: %s(): list_olh_entries retured ret=%d", __func__, ret); + return ret; + } - ret = list_olh_entries(hctx, op.name, op.marker, max - count, &op_ret.entries); - if (ret < 0) { - CLS_LOG(0, "ERROR: %s(): list_olh_entries retured ret=%d", __func__, ret); - return ret; + count += ret; } - count += ret; - - op_ret.is_truncated = (count >= max); + op_ret.is_truncated = (count >= max) || more; while (count >= max) { op_ret.entries.pop_back(); count--; @@ -2582,45 +2578,40 @@ string filter; - do { -#define BI_NUM_KEYS 128 - int ret = cls_cxx_map_get_vals(hctx, start_key, filter, BI_NUM_KEYS, &keys); - if (ret < 0) - return ret; + int ret = cls_cxx_map_get_vals(hctx, start_key, filter, max_entries, &keys, truncated); + if (ret < 0) + return ret; - map::iterator iter = keys.begin(); - if (iter == keys.end()) - break; + map::iterator iter = keys.begin(); + if (iter == keys.end()) + return 0; - for (; iter != keys.end(); ++iter) { - const string& key = iter->first; - rgw_bi_log_entry e; + uint32_t num_keys = keys.size(); - CLS_LOG(0, "bi_log_iterate_entries key=%s bl.length=%d\n", key.c_str(), (int)iter->second.length()); + for (; iter != keys.end(); ++iter,++i) { + const string& key = iter->first; + rgw_bi_log_entry e; - if (key.compare(end_key) > 0) - return 0; + CLS_LOG(0, "bi_log_iterate_entries key=%s bl.length=%d\n", key.c_str(), (int)iter->second.length()); - ret = bi_log_record_decode(iter->second, e); - if (ret < 0) - return ret; + if (key.compare(end_key) > 0) { + key_iter = key; + return 0; + } - if (max_entries && (i >= max_entries)) { - if (truncated) - *truncated = true; - key_iter = key; - return 0; - } + ret = bi_log_record_decode(iter->second, e); + if (ret < 0) + return ret; - ret = cb(hctx, key, e, param); - if (ret < 0) - return ret; - i++; + ret = cb(hctx, key, e, param); + if (ret < 0) + return ret; + if (i == num_keys - 1) { + key_iter = key; } - --iter; - start_key = iter->first; - } while (true); + } + return 0; } @@ -2728,6 +2719,74 @@ return 0; } +static int rgw_bi_log_resync(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + struct rgw_bucket_dir_header header; + int rc = read_bucket_header(hctx, &header); + if (rc < 0) { + CLS_LOG(1, "ERROR: rgw_bucket_complete_op(): failed to read header\n"); + return rc; + } + + bufferlist bl; + + struct rgw_bi_log_entry entry; + + entry.timestamp = real_clock::now(); + entry.op = RGWModifyOp::CLS_RGW_OP_RESYNC; + entry.state = RGWPendingState::CLS_RGW_STATE_COMPLETE; + + string key; + bi_log_index_key(hctx, key, entry.id, 
header.ver); + + ::encode(entry, bl); + + if (entry.id > header.max_marker) + header.max_marker = entry.id; + + header.syncstopped = false; + + rc = cls_cxx_map_set_val(hctx, key, &bl); + if (rc < 0) + return rc; + + return write_bucket_header(hctx, &header); +} + +static int rgw_bi_log_stop(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + struct rgw_bucket_dir_header header; + int rc = read_bucket_header(hctx, &header); + if (rc < 0) { + CLS_LOG(1, "ERROR: rgw_bucket_complete_op(): failed to read header\n"); + return rc; + } + + bufferlist bl; + + struct rgw_bi_log_entry entry; + + entry.timestamp = real_clock::now(); + entry.op = RGWModifyOp::CLS_RGW_OP_SYNCSTOP; + entry.state = RGWPendingState::CLS_RGW_STATE_COMPLETE; + + string key; + bi_log_index_key(hctx, key, entry.id, header.ver); + + ::encode(entry, bl); + + if (entry.id > header.max_marker) + header.max_marker = entry.id; + header.syncstopped = true; + + rc = cls_cxx_map_set_val(hctx, key, &bl); + if (rc < 0) + return rc; + + return write_bucket_header(hctx, &header); +} + + static void usage_record_prefix_by_time(uint64_t epoch, string& key) { char buf[32]; @@ -2862,58 +2921,53 @@ start_key = key_iter; } - do { - CLS_LOG(20, "usage_iterate_range start_key=%s", start_key.c_str()); - int ret = cls_cxx_map_get_vals(hctx, start_key, filter_prefix, NUM_KEYS, &keys); - if (ret < 0) - return ret; + CLS_LOG(20, "usage_iterate_range start_key=%s", start_key.c_str()); + int ret = cls_cxx_map_get_vals(hctx, start_key, filter_prefix, max_entries, &keys, truncated); + if (ret < 0) + return ret; - map::iterator iter = keys.begin(); - if (iter == keys.end()) - break; + map::iterator iter = keys.begin(); + if (iter == keys.end()) + return 0; - for (; iter != keys.end(); ++iter) { - const string& key = iter->first; - rgw_usage_log_entry e; + uint32_t num_keys = keys.size(); - if (!by_user && key.compare(end_key) >= 0) { - CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str()); - return 0; - } + for (; iter != keys.end(); ++iter,++i) { + const string& key = iter->first; + rgw_usage_log_entry e; - if (by_user && key.compare(0, user_key.size(), user_key) != 0) { - CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str()); - return 0; - } + if (!by_user && key.compare(end_key) >= 0) { + CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str()); + return 0; + } - ret = usage_record_decode(iter->second, e); - if (ret < 0) - return ret; + if (by_user && key.compare(0, user_key.size(), user_key) != 0) { + CLS_LOG(20, "usage_iterate_range reached key=%s, done", key.c_str()); + return 0; + } - if (e.epoch < start) - continue; + ret = usage_record_decode(iter->second, e); + if (ret < 0) + return ret; - /* keys are sorted by epoch, so once we're past end we're done */ - if (e.epoch >= end) - return 0; + if (e.epoch < start) + continue; - ret = cb(hctx, key, e, param); - if (ret < 0) - return ret; + /* keys are sorted by epoch, so once we're past end we're done */ + if (e.epoch >= end) + return 0; + ret = cb(hctx, key, e, param); + if (ret < 0) + return ret; - i++; - if (max_entries && (i > max_entries)) { - CLS_LOG(20, "usage_iterate_range reached max_entries (%d), done", max_entries); - *truncated = true; - key_iter = key; - return 0; - } + + if (i == num_keys - 1) { + key_iter = key; + return 0; } - --iter; - start_key = iter->first; - } while (true); + } return 0; } @@ -2999,7 +3053,9 @@ } string iter; - ret = usage_iterate_range(hctx, op.start_epoch, op.end_epoch, op.user, iter, 0, NULL, 
usage_log_trim_cb, NULL); + bool more; +#define MAX_USAGE_TRIM_ENTRIES 128 + ret = usage_iterate_range(hctx, op.start_epoch, op.end_epoch, op.user, iter, MAX_USAGE_TRIM_ENTRIES, &more, usage_log_trim_cb, NULL); if (ret < 0) return ret; @@ -3197,50 +3253,42 @@ string filter; - do { -#define GC_NUM_KEYS 32 - int ret = cls_cxx_map_get_vals(hctx, start_key, filter, GC_NUM_KEYS, &keys); - if (ret < 0) - return ret; + int ret = cls_cxx_map_get_vals(hctx, start_key, filter, max_entries, &keys, truncated); + if (ret < 0) + return ret; - map::iterator iter = keys.begin(); - if (iter == keys.end()) - break; + map::iterator iter = keys.begin(); + if (iter == keys.end()) + return 0; - for (; iter != keys.end(); ++iter) { - const string& key = iter->first; - cls_rgw_gc_obj_info e; + uint32_t num_keys = keys.size(); - CLS_LOG(10, "gc_iterate_entries key=%s\n", key.c_str()); + for (; iter != keys.end(); ++iter, ++i) { + const string& key = iter->first; + cls_rgw_gc_obj_info e; - if (!end_key.empty() && key.compare(end_key) >= 0) - return 0; + CLS_LOG(10, "gc_iterate_entries key=%s\n", key.c_str()); - if (!key_in_index(key, GC_OBJ_TIME_INDEX)) - return 0; + if (!end_key.empty() && key.compare(end_key) >= 0) + return 0; - ret = gc_record_decode(iter->second, e); - if (ret < 0) - return ret; + if (!key_in_index(key, GC_OBJ_TIME_INDEX)) + return 0; - if (max_entries && (i >= max_entries)) { - if (truncated) - *truncated = true; - --iter; - key_iter = iter->first; - return 0; - } + ret = gc_record_decode(iter->second, e); + if (ret < 0) + return ret; - ret = cb(hctx, key, e, param); - if (ret < 0) - return ret; - i++; + ret = cb(hctx, key, e, param); + if (ret < 0) + return ret; + if (i == num_keys - 1) { + key_iter = key; } - --iter; - start_key = iter->first; - } while (true); + } + return 0; } @@ -3274,7 +3322,8 @@ } cls_rgw_gc_list_ret op_ret; - int ret = gc_list_entries(hctx, op.marker, op.max, op.expired_only, +#define GC_LIST_ENTRIES_DEFAULT 128 + int ret = gc_list_entries(hctx, op.marker, (op.max ? op.max : GC_LIST_ENTRIES_DEFAULT), op.expired_only, op_ret.entries, &op_ret.truncated, op_ret.next_marker); if (ret < 0) return ret; @@ -3384,7 +3433,8 @@ map vals; string filter_prefix; - int ret = cls_cxx_map_get_vals(hctx, op.marker, filter_prefix, 1, &vals); + bool more; + int ret = cls_cxx_map_get_vals(hctx, op.marker, filter_prefix, 1, &vals, &more); if (ret < 0) return ret; map::iterator it; @@ -3419,7 +3469,7 @@ bufferlist::iterator iter; map vals; string filter_prefix; - int ret = cls_cxx_map_get_vals(hctx, op.marker, filter_prefix, op.max_entries, &vals); + int ret = cls_cxx_map_get_vals(hctx, op.marker, filter_prefix, op.max_entries, &vals, &op_ret.is_truncated); if (ret < 0) return ret; map::iterator it; @@ -3524,8 +3574,8 @@ string filter_prefix; #define MAX_RESHARD_LIST_ENTRIES 1000 /* one extra entry for identifying truncation */ - int32_t max = (op.max < MAX_RESHARD_LIST_ENTRIES ? op.max : MAX_RESHARD_LIST_ENTRIES) + 1; - int ret = cls_cxx_map_get_vals(hctx, op.marker, filter_prefix, max, &vals); + int32_t max = (op.max && (op.max < MAX_RESHARD_LIST_ENTRIES) ? 
op.max : MAX_RESHARD_LIST_ENTRIES); + int ret = cls_cxx_map_get_vals(hctx, op.marker, filter_prefix, max, &vals, &op_ret.is_truncated); if (ret < 0) return ret; map::iterator it; @@ -3541,7 +3591,6 @@ } op_ret.entries.push_back(entry); } - op_ret.is_truncated = op.max && (vals.size() > op.max); ::encode(op_ret, *out); return 0; } @@ -3747,6 +3796,8 @@ cls_method_handle_t h_rgw_bi_put_op; cls_method_handle_t h_rgw_bi_list_op; cls_method_handle_t h_rgw_bi_log_list_op; + cls_method_handle_t h_rgw_bi_log_resync_op; + cls_method_handle_t h_rgw_bi_log_stop_op; cls_method_handle_t h_rgw_dir_suggest_changes; cls_method_handle_t h_rgw_user_usage_log_add; cls_method_handle_t h_rgw_user_usage_log_read; @@ -3800,6 +3851,9 @@ cls_register_cxx_method(h_class, RGW_BI_LOG_TRIM, CLS_METHOD_RD | CLS_METHOD_WR, rgw_bi_log_trim, &h_rgw_bi_log_list_op); cls_register_cxx_method(h_class, RGW_DIR_SUGGEST_CHANGES, CLS_METHOD_RD | CLS_METHOD_WR, rgw_dir_suggest_changes, &h_rgw_dir_suggest_changes); + cls_register_cxx_method(h_class, "bi_log_resync", CLS_METHOD_RD | CLS_METHOD_WR, rgw_bi_log_resync, &h_rgw_bi_log_resync_op); + cls_register_cxx_method(h_class, "bi_log_stop", CLS_METHOD_RD | CLS_METHOD_WR, rgw_bi_log_stop, &h_rgw_bi_log_stop_op); + /* usage logging */ cls_register_cxx_method(h_class, RGW_USER_USAGE_LOG_ADD, CLS_METHOD_RD | CLS_METHOD_WR, rgw_user_usage_log_add, &h_rgw_user_usage_log_add); cls_register_cxx_method(h_class, RGW_USER_USAGE_LOG_READ, CLS_METHOD_RD, rgw_user_usage_log_read, &h_rgw_user_usage_log_read); diff -Nru ceph-12.1.1/src/cls/rgw/cls_rgw_client.cc ceph-12.1.2/src/cls/rgw/cls_rgw_client.cc --- ceph-12.1.1/src/cls/rgw/cls_rgw_client.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/rgw/cls_rgw_client.cc 2017-08-01 17:55:40.000000000 +0000 @@ -524,6 +524,32 @@ return issue_bucket_list_op(io_ctx, oid, nokey, "", 0, false, &manager, &result[shard_id]); } +static bool issue_resync_bi_log(librados::IoCtx& io_ctx, const string& oid, BucketIndexAioManager *manager) +{ + bufferlist in; + librados::ObjectWriteOperation op; + op.exec("rgw", "bi_log_resync", in); + return manager->aio_operate(io_ctx, oid, &op); +} + +int CLSRGWIssueResyncBucketBILog::issue_op(int shard_id, const string& oid) +{ + return issue_resync_bi_log(io_ctx, oid, &manager); +} + +static bool issue_bi_log_stop(librados::IoCtx& io_ctx, const string& oid, BucketIndexAioManager *manager) +{ + bufferlist in; + librados::ObjectWriteOperation op; + op.exec("rgw", "bi_log_stop", in); + return manager->aio_operate(io_ctx, oid, &op); +} + +int CLSRGWIssueBucketBILogStop::issue_op(int shard_id, const string& oid) +{ + return issue_bi_log_stop(io_ctx, oid, &manager); +} + class GetDirHeaderCompletion : public ObjectOperationCompletion { RGWGetDirHeader_CB *ret_ctx; public: diff -Nru ceph-12.1.1/src/cls/rgw/cls_rgw_client.h ceph-12.1.2/src/cls/rgw/cls_rgw_client.h --- ceph-12.1.1/src/cls/rgw/cls_rgw_client.h 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/rgw/cls_rgw_client.h 2017-08-01 17:55:40.000000000 +0000 @@ -465,6 +465,22 @@ uint32_t _max_aio) : CLSRGWConcurrentIO(ioc, _bucket_objs, _max_aio), entry(_entry) {} }; +class CLSRGWIssueResyncBucketBILog : public CLSRGWConcurrentIO { +protected: + int issue_op(int shard_id, const string& oid); +public: + CLSRGWIssueResyncBucketBILog(librados::IoCtx& io_ctx, map& _bucket_objs, uint32_t max_aio) : + CLSRGWConcurrentIO(io_ctx, _bucket_objs, max_aio) {} +}; + +class CLSRGWIssueBucketBILogStop : public CLSRGWConcurrentIO { +protected: + int issue_op(int shard_id, 
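The bi_log_resync/bi_log_stop methods registered above are invoked from the client as ordinary cls calls, one per bucket-index shard object, with an empty input payload, exactly as the CLSRGWIssueResyncBucketBILog/CLSRGWIssueBucketBILogStop helpers do. A standalone single-shard sketch (the shard oid is a placeholder the caller must already know):

    #include <rados/librados.hpp>

    // flip one bucket-index shard into the "sync stopped" state
    int stop_bilog_on_shard(librados::IoCtx &io_ctx, const std::string &shard_oid) {
      librados::bufferlist in;               // the method takes no input payload
      librados::ObjectWriteOperation op;
      op.exec("rgw", "bi_log_stop", in);     // class "rgw", method name as registered above
      return io_ctx.operate(shard_oid, &op);
    }

Running this against every shard of a bucket index is what the concurrent-IO helper classes above automate.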
const string& oid); +public: + CLSRGWIssueBucketBILogStop(librados::IoCtx& io_ctx, map& _bucket_objs, uint32_t max_aio) : + CLSRGWConcurrentIO(io_ctx, _bucket_objs, max_aio) {} +}; + int cls_rgw_get_dir_header_async(librados::IoCtx& io_ctx, string& oid, RGWGetDirHeader_CB *ctx); void cls_rgw_encode_suggestion(char op, rgw_bucket_dir_entry& dirent, bufferlist& updates); diff -Nru ceph-12.1.1/src/cls/rgw/cls_rgw_ops.h ceph-12.1.2/src/cls/rgw/cls_rgw_ops.h --- ceph-12.1.1/src/cls/rgw/cls_rgw_ops.h 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/rgw/cls_rgw_ops.h 2017-08-01 17:55:40.000000000 +0000 @@ -1123,18 +1123,23 @@ struct cls_rgw_lc_list_entries_ret { map entries; + bool is_truncated{false}; cls_rgw_lc_list_entries_ret() {} void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(entries, bl); + ::encode(is_truncated, bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator& bl) { - DECODE_START(1, bl); + DECODE_START(2, bl); ::decode(entries, bl); + if (struct_v >= 2) { + ::decode(is_truncated, bl); + } DECODE_FINISH(bl); } diff -Nru ceph-12.1.1/src/cls/rgw/cls_rgw_types.cc ceph-12.1.2/src/cls/rgw/cls_rgw_types.cc --- ceph-12.1.1/src/cls/rgw/cls_rgw_types.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/rgw/cls_rgw_types.cc 2017-08-01 17:55:40.000000000 +0000 @@ -366,6 +366,10 @@ op = CLS_RGW_OP_LINK_OLH_DM; } else if (op_str == "unlink_instance") { op = CLS_RGW_OP_UNLINK_INSTANCE; + } else if (op_str == "syncstop") { + op = CLS_RGW_OP_SYNCSTOP; + } else if (op_str == "resync") { + op = CLS_RGW_OP_RESYNC; } else { op = CLS_RGW_OP_UNKNOWN; } @@ -419,6 +423,12 @@ case CLS_RGW_OP_UNLINK_INSTANCE: f->dump_string("op", "unlink_instance"); break; + case CLS_RGW_OP_SYNCSTOP: + f->dump_string("op", "syncstop"); + break; + case CLS_RGW_OP_RESYNC: + f->dump_string("op", "resync"); + break; default: f->dump_string("op", "invalid"); break; diff -Nru ceph-12.1.1/src/cls/rgw/cls_rgw_types.h ceph-12.1.2/src/cls/rgw/cls_rgw_types.h --- ceph-12.1.1/src/cls/rgw/cls_rgw_types.h 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/rgw/cls_rgw_types.h 2017-08-01 17:55:40.000000000 +0000 @@ -34,6 +34,8 @@ CLS_RGW_OP_LINK_OLH = 4, CLS_RGW_OP_LINK_OLH_DM = 5, /* creation of delete marker */ CLS_RGW_OP_UNLINK_INSTANCE = 6, + CLS_RGW_OP_SYNCSTOP = 7, + CLS_RGW_OP_RESYNC = 8, }; enum RGWBILogFlags { @@ -660,17 +662,19 @@ uint64_t master_ver; string max_marker; cls_rgw_bucket_instance_entry new_instance; + bool syncstopped; - rgw_bucket_dir_header() : tag_timeout(0), ver(0), master_ver(0) {} + rgw_bucket_dir_header() : tag_timeout(0), ver(0), master_ver(0), syncstopped(false) {} void encode(bufferlist &bl) const { - ENCODE_START(6, 2, bl); + ENCODE_START(7, 2, bl); ::encode(stats, bl); ::encode(tag_timeout, bl); ::encode(ver, bl); ::encode(master_ver, bl); ::encode(max_marker, bl); ::encode(new_instance, bl); + ::encode(syncstopped,bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator &bl) { @@ -695,6 +699,9 @@ } else { new_instance = cls_rgw_bucket_instance_entry(); } + if (struct_v >= 7) { + ::decode(syncstopped,bl); + } DECODE_FINISH(bl); } void dump(Formatter *f) const; diff -Nru ceph-12.1.1/src/cls/statelog/cls_statelog.cc ceph-12.1.2/src/cls/statelog/cls_statelog.cc --- ceph-12.1.1/src/cls/statelog/cls_statelog.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/statelog/cls_statelog.cc 2017-08-01 17:55:40.000000000 +0000 @@ -168,21 +168,20 @@ if (!max_entries || max_entries > MAX_ENTRIES) max_entries = MAX_ENTRIES; - 
int rc = cls_cxx_map_get_vals(hctx, from_index, match_prefix, max_entries + 1, &keys); + cls_statelog_list_ret ret; + + int rc = cls_cxx_map_get_vals(hctx, from_index, match_prefix, max_entries, &keys, &ret.truncated); if (rc < 0) return rc; CLS_LOG(20, "from_index=%s match_prefix=%s", from_index.c_str(), match_prefix.c_str()); - cls_statelog_list_ret ret; list& entries = ret.entries; map::iterator iter = keys.begin(); - bool done = false; string marker; - size_t i; - for (i = 0; i < max_entries && iter != keys.end(); ++i, ++iter) { + for (; iter != keys.end(); ++iter) { const string& index = iter->first; marker = index; @@ -197,12 +196,9 @@ } } - if (iter == keys.end()) - done = true; - else + if (ret.truncated) { ret.marker = marker; - - ret.truncated = !done; + } ::encode(ret, *out); diff -Nru ceph-12.1.1/src/cls/timeindex/cls_timeindex.cc ceph-12.1.2/src/cls/timeindex/cls_timeindex.cc --- ceph-12.1.1/src/cls/timeindex/cls_timeindex.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/timeindex/cls_timeindex.cc 2017-08-01 17:55:40.000000000 +0000 @@ -120,21 +120,20 @@ max_entries = MAX_LIST_ENTRIES; } + cls_timeindex_list_ret ret; + int rc = cls_cxx_map_get_vals(hctx, from_index, TIMEINDEX_PREFIX, - max_entries + 1, &keys); + max_entries, &keys, &ret.truncated); if (rc < 0) { return rc; } - cls_timeindex_list_ret ret; - list& entries = ret.entries; map::iterator iter = keys.begin(); - bool done = false; string marker; - for (size_t i = 0; i < max_entries && iter != keys.end(); ++i, ++iter) { + for (; iter != keys.end(); ++iter) { const string& index = iter->first; bufferlist& bl = iter->second; @@ -142,7 +141,7 @@ if (use_time_boundary && index.compare(0, to_index.size(), to_index) >= 0) { CLS_LOG(20, "DEBUG: cls_timeindex_list: finishing on to_index=%s", to_index.c_str()); - done = true; + ret.truncated = false; break; } @@ -159,12 +158,7 @@ } } - if (iter == keys.end()) { - done = true; - } - ret.marker = marker; - ret.truncated = !done; ::encode(ret, *out); @@ -203,8 +197,10 @@ to_index = op.to_marker; } + bool more; + int rc = cls_cxx_map_get_vals(hctx, from_index, TIMEINDEX_PREFIX, - MAX_TRIM_ENTRIES, &keys); + MAX_TRIM_ENTRIES, &keys, &more); if (rc < 0) { return rc; } @@ -212,7 +208,7 @@ map::iterator iter = keys.begin(); bool removed = false; - for (size_t i = 0; i < MAX_TRIM_ENTRIES && iter != keys.end(); ++i, ++iter) { + for (; iter != keys.end(); ++iter) { const string& index = iter->first; CLS_LOG(20, "index=%s to_index=%s", index.c_str(), to_index.c_str()); diff -Nru ceph-12.1.1/src/cls/user/cls_user.cc ceph-12.1.2/src/cls/user/cls_user.cc --- ceph-12.1.1/src/cls/user/cls_user.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/cls/user/cls_user.cc 2017-08-01 17:55:40.000000000 +0000 @@ -292,8 +292,9 @@ max_entries = MAX_ENTRIES; string match_prefix; + cls_user_list_buckets_ret ret; - int rc = cls_cxx_map_get_vals(hctx, from_index, match_prefix, max_entries + 1, &keys); + int rc = cls_cxx_map_get_vals(hctx, from_index, match_prefix, max_entries, &keys, &ret.truncated); if (rc < 0) return rc; @@ -301,21 +302,20 @@ from_index.c_str(), to_index.c_str(), match_prefix.c_str()); - cls_user_list_buckets_ret ret; list& entries = ret.entries; map::iterator iter = keys.begin(); - bool done = false; string marker; - size_t i; - for (i = 0; i < max_entries && iter != keys.end(); ++i, ++iter) { + for (; iter != keys.end(); ++iter) { const string& index = iter->first; marker = index; - if (to_index_valid && to_index.compare(index) <= 0) + if (to_index_valid && 
to_index.compare(index) <= 0) { + ret.truncated = false; break; + } bufferlist& bl = iter->second; bufferlist::iterator biter = bl.begin(); @@ -328,12 +328,9 @@ } } - if (iter == keys.end()) - done = true; - else + if (ret.truncated) { ret.marker = marker; - - ret.truncated = !done; + } ::encode(ret, *out); diff -Nru ceph-12.1.1/src/CMakeLists.txt ceph-12.1.2/src/CMakeLists.txt --- ceph-12.1.1/src/CMakeLists.txt 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/CMakeLists.txt 2017-08-01 17:55:40.000000000 +0000 @@ -508,8 +508,8 @@ common/ceph_hash.cc common/ceph_strings.cc common/ceph_frag.cc + common/options.cc common/config.cc - common/config_validators.cc common/utf8.c common/mime.c common/strtol.cc @@ -540,6 +540,11 @@ ${auth_files} ${mds_files}) +if(HAS_VTA) + set_source_files_properties(common/config.cc + PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments) +endif() + if(FREEBSD) list(APPEND libcommon_files common/freebsd_errno.cc) elseif(DARWIN) @@ -691,7 +696,8 @@ mgr/PyState.cc mgr/MgrPyModule.cc mgr/MgrStandby.cc - mgr/Mgr.cc) + mgr/Mgr.cc + mgr/mgr_commands.cc) add_executable(ceph-mgr ${mgr_srcs} $) target_include_directories(ceph-mgr PRIVATE "${PYTHON_INCLUDE_DIRS}") @@ -992,7 +998,7 @@ install(TARGETS cephfs DESTINATION ${CMAKE_INSTALL_LIBDIR}) install(DIRECTORY "${CMAKE_SOURCE_DIR}/src/include/cephfs" - DESTINATION include) + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) set(ceph_syn_srcs ceph_syn.cc client/SyntheticClient.cc) diff -Nru ceph-12.1.1/src/common/buffer.cc ceph-12.1.2/src/common/buffer.cc --- ceph-12.1.1/src/common/buffer.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/common/buffer.cc 2017-08-01 17:55:40.000000000 +0000 @@ -117,9 +117,9 @@ return buffer_c_str_accesses; } +#ifdef CEPH_HAVE_SETPIPE_SZ static std::atomic buffer_max_pipe_size { 0 }; int update_max_pipe_size() { -#ifdef CEPH_HAVE_SETPIPE_SZ char buf[32]; int r; std::string err; @@ -135,21 +135,22 @@ if (!err.empty()) return -EIO; buffer_max_pipe_size = size; -#endif return 0; } size_t get_max_pipe_size() { -#ifdef CEPH_HAVE_SETPIPE_SZ size_t size = buffer_max_pipe_size; if (size) return size; if (update_max_pipe_size() == 0) return buffer_max_pipe_size; -#endif // this is the max size hardcoded in linux before 2.6.35 return 65536; } +#else + size_t get_max_pipe_size() { return 65536; } +#endif + const char * buffer::error::what() const throw () { return "buffer::exception"; diff -Nru ceph-12.1.1/src/common/ceph_context.cc ceph-12.1.2/src/common/ceph_context.cc --- ceph-12.1.1/src/common/ceph_context.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/common/ceph_context.cc 2017-08-01 17:55:40.000000000 +0000 @@ -448,6 +448,27 @@ f->dump_string(var.c_str(), buf); } } + } else if (command == "config help") { + std::string var; + if (cmd_getval(this, cmdmap, "var", var)) { + // Output a single one + std::string key = ConfFile::normalize_key_name(var); + const auto &i = _conf->schema.find(key); + if (i == _conf->schema.end()) { + std::ostringstream msg; + msg << "Setting not found: '" << key << "'"; + f->dump_string("error", msg.str()); + } else { + i->second.dump(f); + } + } else { + // Output all + f->open_array_section("options"); + for (const auto &option : ceph_options) { + option.dump(f); + } + f->close_section(); + } } else if (command == "config diff") { md_config_t def_conf; def_conf.set_val("cluster", _conf->cluster); @@ -529,10 +550,11 @@ << "result is " << out->length() << " bytes" << dendl; } - -CephContext::CephContext(uint32_t module_type_, int init_flags_) 
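The cls_statelog, cls_timeindex and cls_user hunks above all converge on the same listing pattern: ask cls_cxx_map_get_vals for exactly max_entries keys and take an explicit "more keys remain" flag from the call itself, instead of fetching max_entries + 1 and inferring truncation from the leftover entry. A minimal, self-contained sketch of that pattern; the fake key store, the get_vals stand-in and the Page type are illustrative, only the two-output shape of the lookup mirrors the objclass call shown in the diff.

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    struct Page {
      std::vector<std::string> entries;
      bool truncated = false;
      std::string marker;          // only meaningful when truncated
    };

    // Stand-in for cls_cxx_map_get_vals(): returns up to max_entries keys after
    // `from`, and sets *more when further keys exist beyond what was returned.
    static int get_vals(const std::map<std::string, int>& store,
                        const std::string& from, size_t max_entries,
                        std::map<std::string, int>* out, bool* more)
    {
      out->clear();
      auto it = store.upper_bound(from);
      while (it != store.end() && out->size() < max_entries) {
        out->insert(*it);
        ++it;
      }
      *more = (it != store.end());
      return 0;
    }

    static Page list_page(const std::map<std::string, int>& store,
                          const std::string& from, size_t max_entries)
    {
      Page page;
      std::map<std::string, int> keys;
      if (get_vals(store, from, max_entries, &keys, &page.truncated) < 0)
        return page;
      std::string marker;
      for (const auto& kv : keys) {     // no "i < max_entries" bound needed any more
        marker = kv.first;
        page.entries.push_back(kv.first);
      }
      if (page.truncated)
        page.marker = marker;           // resume point, published only when truncated
      return page;
    }

    int main() {
      std::map<std::string, int> store = {{"a",1},{"b",2},{"c",3},{"d",4},{"e",5}};
      Page p = list_page(store, "", 2);
      std::cout << "got " << p.entries.size() << " entries, truncated="
                << p.truncated << " marker=" << p.marker << "\n";
    }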
+CephContext::CephContext(uint32_t module_type_, + enum code_environment_t code_env, + int init_flags_) : nref(1), - _conf(new md_config_t()), + _conf(new md_config_t(code_env == CODE_ENVIRONMENT_DAEMON)), _log(NULL), _module_type(module_type_), _init_flags(init_flags_), @@ -590,6 +612,7 @@ _admin_socket->register_command("perf histogram schema", "perf histogram schema", _admin_hook, "dump perf histogram schema"); _admin_socket->register_command("perf reset", "perf reset name=var,type=CephString", _admin_hook, "perf reset : perf reset all or one perfcounter name"); _admin_socket->register_command("config show", "config show", _admin_hook, "dump current config settings"); + _admin_socket->register_command("config help", "config help name=var,type=CephString,req=false", _admin_hook, "get config setting schema and descriptions"); _admin_socket->register_command("config set", "config set name=var,type=CephString name=val,type=CephString,n=N", _admin_hook, "config set [ ...]: set a config variable"); _admin_socket->register_command("config get", "config get name=var,type=CephString", _admin_hook, "config get : get the config value"); _admin_socket->register_command("config diff", @@ -637,6 +660,7 @@ _admin_socket->unregister_command("config show"); _admin_socket->unregister_command("config set"); _admin_socket->unregister_command("config get"); + _admin_socket->unregister_command("config help"); _admin_socket->unregister_command("config diff"); _admin_socket->unregister_command("config diff get"); _admin_socket->unregister_command("log flush"); diff -Nru ceph-12.1.1/src/common/ceph_context.h ceph-12.1.2/src/common/ceph_context.h --- ceph-12.1.1/src/common/ceph_context.h 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/common/ceph_context.h 2017-08-01 17:55:40.000000000 +0000 @@ -20,6 +20,7 @@ #include #include "common/cmdparse.h" +#include "common/code_environment.h" #include "crush/CrushLocation.h" #include "include/Spinlock.h" @@ -50,7 +51,9 @@ */ class CephContext { public: - CephContext(uint32_t module_type_, int init_flags_ = 0); + CephContext(uint32_t module_type_, + enum code_environment_t code_env=CODE_ENVIRONMENT_UTILITY, + int init_flags_ = 0); // ref count! private: diff -Nru ceph-12.1.1/src/common/cmdparse.cc ceph-12.1.2/src/common/cmdparse.cc --- ceph-12.1.1/src/common/cmdparse.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/common/cmdparse.cc 2017-08-01 17:55:40.000000000 +0000 @@ -185,6 +185,15 @@ } f->close_section(); } + + void operator()(const std::vector &operand) const + { + f->open_array_section(key.c_str()); + for (const auto i : operand) { + f->dump_float("item", i); + } + f->close_section(); + } }; //f->open_object_section("cmdmap"); @@ -240,7 +249,7 @@ case json_spirit::array_type: { // array is a vector of values. Unpack it to a vector - // of strings or int64_t, the only types we handle. + // of strings, doubles, or int64_t, the only types we handle. 
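The cmdparse changes in this file teach the command map to carry JSON arrays of reals: cmd_vartype (in cmdparse.h) gains a std::vector<double> alternative, and the dump and unpack paths gain matching cases. A standalone miniature of the visitor side, with a cut-down local variant standing in for the real cmd_vartype:

    #include <iostream>
    #include <string>
    #include <vector>
    #include <boost/variant.hpp>

    // Cut-down stand-in for cmd_vartype with the newly added vector<double>
    // alternative; the real type also carries bool, int64_t, vector<int64_t>, ...
    using vartype = boost::variant<std::string,
                                   std::vector<std::string>,
                                   std::vector<double>>;

    // Same shape as the dump visitor in cmdparse.cc: one operator() per
    // alternative, including the new one for vector<double>.
    struct dump_visitor : public boost::static_visitor<void> {
      void operator()(const std::string& s) const {
        std::cout << "string: " << s << "\n";
      }
      void operator()(const std::vector<std::string>& v) const {
        for (const auto& s : v) std::cout << "string item: " << s << "\n";
      }
      void operator()(const std::vector<double>& v) const {
        for (double d : v) std::cout << "float item: " << d << "\n";
      }
    };

    int main() {
      vartype v = std::vector<double>{0.5, 1.25, 2.0};
      boost::apply_visitor(dump_visitor(), v);   // prints the three doubles
    }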
const vector& spvals = it->second.get_array(); if (spvals.empty()) { // if an empty array is acceptable, the caller should always check for @@ -265,9 +274,18 @@ outv.push_back(sv.get_int64()); } (*mapp)[it->first] = std::move(outv); + } else if (spvals.front().type() == json_spirit::real_type) { + vector outv; + for (const auto& sv : spvals) { + if (spvals.front().type() != json_spirit::real_type) { + throw(runtime_error("Can't handle arrays of multiple types")); + } + outv.push_back(sv.get_real()); + } + (*mapp)[it->first] = std::move(outv); } else { throw(runtime_error("Can't handle arrays of types other than " - "int or string")); + "int, string, or double")); } } break; diff -Nru ceph-12.1.1/src/common/cmdparse.h ceph-12.1.2/src/common/cmdparse.h --- ceph-12.1.1/src/common/cmdparse.h 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/common/cmdparse.h 2017-08-01 17:55:40.000000000 +0000 @@ -21,7 +21,8 @@ int64_t, double, std::vector, - std::vector> cmd_vartype; + std::vector, + std::vector> cmd_vartype; typedef std::map cmdmap_t; std::string cmddesc_get_prefix(const std::string &cmddesc); diff -Nru ceph-12.1.1/src/common/cohort_lru.h ceph-12.1.2/src/common/cohort_lru.h --- ceph-12.1.1/src/common/cohort_lru.h 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/common/cohort_lru.h 2017-08-01 17:55:40.000000000 +0000 @@ -399,7 +399,7 @@ v = lat.p->cache[slot]; if (v) { if (CEQ()(*v, k)) { - if (flags & (FLAG_LOCK|FLAG_UNLOCK)) + if ((flags & FLAG_LOCK) && (flags & FLAG_UNLOCK)) lat.lock->unlock(); return v; } @@ -417,7 +417,7 @@ lat.p->cache[slot] = v; } } - if (flags & (FLAG_LOCK|FLAG_UNLOCK)) + if ((flags & FLAG_LOCK) && (flags & FLAG_UNLOCK)) lat.lock->unlock(); return v; } /* find_latch */ diff -Nru ceph-12.1.1/src/common/common_init.cc ceph-12.1.2/src/common/common_init.cc --- ceph-12.1.1/src/common/common_init.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/common/common_init.cc 2017-08-01 17:55:40.000000000 +0000 @@ -32,7 +32,7 @@ g_code_env = code_env; // Create a configuration object - CephContext *cct = new CephContext(iparams.module_type, flags); + CephContext *cct = new CephContext(iparams.module_type, code_env, flags); md_config_t *conf = cct->_conf; // add config observers here @@ -43,44 +43,20 @@ if (data_dir_option) conf->data_dir_option = data_dir_option; - // Set some defaults based on code type - switch (code_env) { - case CODE_ENVIRONMENT_DAEMON: - conf->set_val_or_die("daemonize", "true"); - conf->set_val_or_die("log_to_stderr", "false"); - conf->set_val_or_die("err_to_stderr", "true"); - - // different default keyring locations for osd and mds. this is - // for backward compatibility. moving forward, we want all keyrings - // in these locations. the mon already forces $mon_data/keyring. - if (conf->name.is_mds()) - conf->set_val("keyring", "$mds_data/keyring", false); - else if (conf->name.is_osd()) - conf->set_val("keyring", "$osd_data/keyring", false); - break; + // different default keyring locations for osd and mds. this is + // for backward compatibility. moving forward, we want all keyrings + // in these locations. the mon already forces $mon_data/keyring. 
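The cohort_lru.h hunk above is a small but real logic fix worth spelling out: flags & (FLAG_LOCK|FLAG_UNLOCK) is non-zero when either bit is set, so a caller that asked only for FLAG_LOCK had its latch released immediately; the replacement releases the latch only when both FLAG_LOCK and FLAG_UNLOCK were requested. A tiny check, with illustrative bit values:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Illustrative constants; the point is the predicate, not the values.
      const uint32_t FLAG_LOCK = 0x1, FLAG_UNLOCK = 0x2;
      const uint32_t flags = FLAG_LOCK;   // caller wants the latch kept held

      bool old_test = (flags & (FLAG_LOCK | FLAG_UNLOCK)) != 0;           // true: unlocked too early
      bool new_test = (flags & FLAG_LOCK) && (flags & FLAG_UNLOCK);       // false: latch stays held

      assert(old_test == true);
      assert(new_test == false);
      return 0;
    }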
+ if (conf->name.is_mds()) { + conf->set_val("keyring", "$mds_data/keyring", false); + } else if (conf->name.is_osd()) { + conf->set_val("keyring", "$osd_data/keyring", false); + } - case CODE_ENVIRONMENT_UTILITY_NODOUT: - case CODE_ENVIRONMENT_LIBRARY: + if (code_env == CODE_ENVIRONMENT_LIBRARY || + code_env == CODE_ENVIRONMENT_UTILITY_NODOUT) { conf->set_val_or_die("log_to_stderr", "false"); conf->set_val_or_die("err_to_stderr", "false"); conf->set_val_or_die("log_flush_on_exit", "false"); - break; - - default: - break; - } - - if (flags & CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS) { - // do nothing special! we used to do no default log, pid_file, - // admin_socket, but changed our minds. let's make ceph-fuse - // and radosgw use the same defaults as ceph-{osd,mon,mds,...} - } else if (code_env != CODE_ENVIRONMENT_DAEMON) { - // no default log, pid_file, admin_socket - conf->set_val_or_die("pid_file", ""); - conf->set_val_or_die("admin_socket", ""); - conf->set_val_or_die("log_file", ""); - // use less memory for logs - conf->set_val_or_die("log_max_recent", "500"); } return cct; diff -Nru ceph-12.1.1/src/common/config.cc ceph-12.1.2/src/common/config.cc --- ceph-12.1.1/src/common/config.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/common/config.cc 2017-08-01 17:55:40.000000000 +0000 @@ -15,7 +15,6 @@ #include "common/ceph_argparse.h" #include "common/common_init.h" #include "common/config.h" -#include "common/config_validators.h" #include "include/str_list.h" #include "include/stringify.h" #include "osd/osd_types.h" @@ -73,101 +72,106 @@ return ret; } -#define OPTION(name, type, def_val) -#define OPTION_VALIDATOR(name) \ -struct md_config_t::option_##name##_t { \ - typedef decltype(md_config_t::name) type; \ -}; -#define SAFE_OPTION(name, type, def_val) -#define SUBSYS(name, log, gather) -#define DEFAULT_SUBSYS(log, gather) -#include "common/config_opts.h" + + +md_config_t::md_config_t(bool is_daemon) + : cluster(""), + lock("md_config_t", true, false) +{ + init_subsys(); + + // Load the compile-time list of Option into + // a map so that we can resolve keys quickly. + for (const auto &i : ceph_options) { + if (schema.count(i.name)) { + // We may be instantiated pre-logging so send + std::cerr << "Duplicate config key in schema: '" << i.name << "'" + << std::endl; + assert(false); + } + schema.insert({i.name, i}); + } + + // Populate list of legacy_values according to the OPTION() definitions + // Note that this is just setting up our map of name->member ptr. The + // default values etc will get loaded in along with new-style data, + // as all loads write to both the values map, and the legacy + // members if present. 
+ legacy_values = { +#define OPTION(name, type) \ + {std::string(STRINGIFY(name)), &md_config_t::name}, +#define SAFE_OPTION(name, type) OPTION(name, type) +#include "common/legacy_config_opts.h" #undef OPTION -#undef OPTION_VALIDATOR #undef SAFE_OPTION -#undef SUBSYS -#undef DEFAULT_SUBSYS + }; -namespace { + validate_schema(); -template -typename std::enable_if::value, - md_config_t::validator_t>::type create_validator() { - return md_config_t::validator_t(); -} + // Load default values from the schema + for (const auto &i : schema) { + const Option &opt = i.second; + bool has_daemon_default = !boost::get(&opt.daemon_value); + Option::value_t default_val; + if (is_daemon && has_daemon_default) { + default_val = opt.daemon_value; + } else { + default_val = opt.value; + } -template -typename std::enable_if::value, - md_config_t::validator_t>::type create_validator() { - // if T is defined (and not just forward declared), it implies - // that a validator function exists. use a dummy typed pointer to - // pick the correct validator function - return [](std::string *value, std::string *error_message) { - return ::validate(reinterpret_cast(0), value, error_message); - }; -} + if (opt.type == Option::TYPE_STR) { + // We call pre_validate as a sanity check, but also to get any + // side effect (value modification) from the validator. + std::string *def_str = boost::get(&default_val); + std::string err; + if (opt.pre_validate(def_str, &err) != 0) { + std::cerr << "Default value " << opt.name << "=" << *def_str << " is " + "invalid: " << err << std::endl; -} // anonymous namespace + // This is the compiled-in default that is failing its own option's + // validation, so this is super-invalid and should never make it + // past a pull request: crash out. + assert(false); + } + } -md_config_t::md_config_t() - : cluster(""), + values[i.first] = default_val; + } -#define OPTION_OPT_INT(name, def_val) name(def_val), -#define OPTION_OPT_LONGLONG(name, def_val) name((1LL) * def_val), -#define OPTION_OPT_STR(name, def_val) name(def_val), -#define OPTION_OPT_DOUBLE(name, def_val) name(def_val), -#define OPTION_OPT_FLOAT(name, def_val) name(def_val), -#define OPTION_OPT_BOOL(name, def_val) name(def_val), -#define OPTION_OPT_ADDR(name, def_val) name(def_val), -#define OPTION_OPT_U32(name, def_val) name(def_val), -#define OPTION_OPT_U64(name, def_val) name(((uint64_t)1) * def_val), -#define OPTION_OPT_UUID(name, def_val) name(def_val), -#define OPTION(name, type, def_val) OPTION_##type(name, def_val) -#define OPTION_VALIDATOR(name) -#define SAFE_OPTION(name, type, def_val) OPTION(name, type, def_val) -#define SUBSYS(name, log, gather) -#define DEFAULT_SUBSYS(log, gather) -#include "common/config_opts.h" -#undef OPTION_OPT_INT -#undef OPTION_OPT_LONGLONG -#undef OPTION_OPT_STR -#undef OPTION_OPT_DOUBLE -#undef OPTION_OPT_FLOAT -#undef OPTION_OPT_BOOL -#undef OPTION_OPT_ADDR -#undef OPTION_OPT_U32 -#undef OPTION_OPT_U64 -#undef OPTION_OPT_UUID -#undef OPTION -#undef OPTION_VALIDATOR -#undef SAFE_OPTION -#undef SUBSYS -#undef DEFAULT_SUBSYS - lock("md_config_t", true, false) + // Copy out values (defaults) into any legacy (C struct member) fields + for (const auto &i : legacy_values) { + const auto &name = i.first; + const auto &option = schema.at(name); + auto ptr = i.second; + + update_legacy_val(option, ptr); + } +} + +/** + * Sanity check schema. Assert out on failures, to ensure any bad changes + * cannot possibly pass any testing and make it into a release. 
+ */ +void md_config_t::validate_schema() { - static const std::vector s_config_options = { -#define OPTION4(name, type, def_val, safe) \ - config_option{ STRINGIFY(name), type, &md_config_t::name, safe, \ - create_validator() }, -#define OPTION(name, type, def_val) OPTION4(name, type, def_val, false) -#define OPTION_VALIDATOR(name) -#define SAFE_OPTION(name, type, def_val) OPTION4(name, type, def_val, true) -#define SUBSYS(name, log, gather) -#define DEFAULT_SUBSYS(log, gather) -#include "common/config_opts.h" -#undef OPTION4 -#undef OPTION -#undef OPTION_VALIDATOR -#undef SAFE_OPTION -#undef SUBSYS -#undef DEFAULT_SUBSYS - }; - static std::shared_ptr - s_tbl(new std::vector(std::move(s_config_options))); - config_options = s_tbl; + for (const auto &i : schema) { + const auto &opt = i.second; + for (const auto &see_also_key : opt.see_also) { + if (schema.count(see_also_key) == 0) { + std::cerr << "Non-existent see-also key '" << see_also_key + << "' on option '" << opt.name << "'" << std::endl; + assert(false); + } + } + } - validate_default_settings(); - init_subsys(); + for (const auto &i : legacy_values) { + if (schema.count(i.first) == 0) { + std::cerr << "Schema is missing legacy field '" << i.first << "'" + << std::endl; + assert(false); + } + } } void md_config_t::init_subsys() @@ -176,13 +180,7 @@ subsys.add(ceph_subsys_##name, STRINGIFY(name), log, gather); #define DEFAULT_SUBSYS(log, gather) \ subsys.add(ceph_subsys_, "none", log, gather); -#define OPTION(a, b, c) -#define OPTION_VALIDATOR(a) -#define SAFE_OPTION(a, b, c) -#include "common/config_opts.h" -#undef OPTION -#undef OPTION_VALIDATOR -#undef SAFE_OPTION +#include "common/subsys.h" #undef SUBSYS #undef DEFAULT_SUBSYS } @@ -255,7 +253,7 @@ string &s = *p; if (s.find("$data_dir") != string::npos) { if (data_dir_option.length()) { - list stack; + list stack; expand_meta(s, NULL, stack, warnings); p++; } else { @@ -310,12 +308,13 @@ std::vector my_sections; _get_my_sections(my_sections); - for (auto& opt: *config_options) { + for (const auto &i : schema) { + const auto &opt = i.second; std::string val; int ret = _get_val_from_conf_file(my_sections, opt.name, val, false); if (ret == 0) { std::string error_message; - int r = set_val_impl(val, &opt, &error_message); + int r = set_val_impl(val, opt, &error_message); if (warnings != nullptr && (r != 0 || !error_message.empty())) { *warnings << "parse error setting '" << opt.name << "' to '" << val << "'"; @@ -328,7 +327,7 @@ } // subsystems? - for (int o = 0; o < subsys.get_num(); o++) { + for (size_t o = 0; o < subsys.get_num(); o++) { std::string as_option("debug_"); as_option += subsys.get_name(o); std::string val; @@ -402,7 +401,7 @@ f->dump_string("name", stringify(name)); f->dump_string("cluster", cluster); } - for (int o = 0; o < subsys.get_num(); o++) { + for (size_t o = 0; o < subsys.get_num(); o++) { if (out) *out << "debug_" << subsys.get_name(o) << " = " << subsys.get_log_level(o) @@ -416,13 +415,14 @@ f->dump_string(debug_name.c_str(), ss.str()); } } - for (auto& opt: *config_options) { + for (const auto& i: schema) { + const Option &opt = i.second; char *buf; _get_val(opt.name, &buf, -1); if (out) *out << opt.name << " = " << buf << std::endl; if (f) - f->dump_string(opt.name, buf); + f->dump_string(opt.name.c_str(), buf); free(buf); } } @@ -528,7 +528,7 @@ ostream *oss) { int ret = 0; - int o; + size_t o = 0; std::string val; // subsystems? 
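The md_config_t rewrite above replaces the config_opts.h macro expansion with a data-driven model: the compile-time Option list becomes a name-to-Option schema map, defaults (including daemon-specific ones when the config is constructed with is_daemon) are copied into a name-to-value values map, and the old C struct members survive only as legacy fields resynchronized from values. The same schema backs the "config help" admin socket command registered in ceph_context.cc further up (for example, ceph daemon osd.0 config help <option>, where osd.0 is just a placeholder target). A heavily simplified sketch of the three-part model, with a made-up option name and plain int values in place of Option::value_t:

    #include <cassert>
    #include <iostream>
    #include <map>
    #include <string>

    // Simplified stand-ins for Option / md_config_t: one value type, a
    // daemon-specific default, and a single legacy struct member.
    struct Opt {
      std::string name;
      int def;          // generic default
      int daemon_def;   // default used when constructed for a daemon
    };

    struct Config {
      std::map<std::string, Opt> schema;   // name -> option description
      std::map<std::string, int> values;   // name -> current value
      int legacy_example = 0;              // old-style C member, kept in sync

      explicit Config(bool is_daemon) {
        Opt o{"example_threads", 2, 8};    // compile-time option definition
        schema[o.name] = o;
        values[o.name] = is_daemon ? o.daemon_def : o.def;
        update_legacy();
      }
      void set_val(const std::string& k, int v) {
        assert(schema.count(k));           // unknown keys are rejected up front
        values[k] = v;
        update_legacy();                   // legacy member follows the values map
      }
      void update_legacy() { legacy_example = values.at("example_threads"); }
    };

    int main() {
      Config daemon_conf(true), tool_conf(false);
      std::cout << daemon_conf.legacy_example << " vs "   // 8 (daemon default)
                << tool_conf.legacy_example << "\n";      // 2 (generic default)
      daemon_conf.set_val("example_threads", 4);
      std::cout << daemon_conf.legacy_example << "\n";    // 4, synced from values
    }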
@@ -556,16 +556,16 @@ return ret; } - const char *option_name = nullptr; + std::string option_name; std::string error_message; o = 0; - for (auto& opt_ref: *config_options) { + for (const auto& opt_iter: schema) { + const Option &opt = opt_iter.second; ostringstream err; - config_option const *opt = &opt_ref; std::string as_option("--"); - as_option += opt->name; - option_name = opt->name; - if (opt->type == OPT_BOOL) { + as_option += opt.name; + option_name = opt.name; + if (opt.type == Option::TYPE_BOOL) { int res; if (ceph_argparse_binary_flag(args, i, &res, oss, as_option.c_str(), (char*)NULL)) { @@ -578,7 +578,7 @@ break; } else { std::string no("--no-"); - no += opt->name; + no += opt.name; if (ceph_argparse_flag(args, i, no.c_str(), (char*)NULL)) { ret = set_val_impl("false", opt, &error_message); break; @@ -591,9 +591,9 @@ ret = -EINVAL; break; } - if (oss && ((!opt->is_safe()) && - (observers.find(opt->name) == observers.end()))) { - *oss << "You cannot change " << opt->name << " using injectargs.\n"; + if (oss && ((!opt.is_safe()) && + (observers.find(opt.name) == observers.end()))) { + *oss << "You cannot change " << opt.name << " using injectargs.\n"; return -ENOSYS; } ret = set_val_impl(val, opt, &error_message); @@ -603,7 +603,7 @@ } if (ret != 0 || !error_message.empty()) { - assert(option_name); + assert(!option_name.empty()); if (oss) { *oss << "Parse error setting " << option_name << " to '" << val << "' using injectargs"; @@ -621,7 +621,7 @@ } } - if (o == (int)config_options->size()) { + if (o == schema.size()) { // ignore ++i; } @@ -666,6 +666,15 @@ expand_all_meta(); + // expand_all_meta could have modified anything. Copy it all out again. + for (const auto &i : legacy_values) { + const auto &name = i.first; + const auto &option = schema.at(name); + auto ptr = i.second; + + update_legacy_val(option, ptr); + } + // create the reverse observer mapping, mapping observers to the set of // changed keys that they'll get. 
rev_obs_map_t robs; @@ -754,62 +763,29 @@ return ret; } -void md_config_t::set_val_or_die(const char *key, const char *val) -{ - int ret = set_val(key, val); - assert(ret == 0); -} - -struct is_integer_member : public boost::static_visitor { - template, int>::type = 0> - bool operator()(const T md_config_t::* /* member_ptr */) const { - return true; - } - template::value, int>::type = 0> - bool operator()(const T md_config_t::* /* member_ptr */) const { - return false; - } -}; - -struct is_float_member : public boost::static_visitor { - template, int>::type = 0> - bool operator()(const T md_config_t::* /* member_ptr */) const { - return true; +void md_config_t::set_val_or_die(const std::string &key, + const std::string &val, + bool meta) +{ + std::stringstream err; + int ret = set_val(key, val, meta, &err); + if (ret != 0) { + std::cerr << "set_val_or_die(" << key << "): " << err.str(); } - template::value, int>::type = 0> - bool operator()(const T md_config_t::* /* member_ptr */) const { - return false; - } -}; - -bool md_config_t::config_option::is_safe() const { - // for now integer and floating point options considered thread safe - return safe || - boost::apply_visitor(is_integer_member(), md_member_ptr) || - boost::apply_visitor(is_float_member(), md_member_ptr); -} - -md_config_t::config_option const *md_config_t::find_config_option(const std::string &normalized_key) const -{ - auto opt_it = std::find_if(config_options->begin(), - config_options->end(), - [normalized_key](const config_option &opt) -> bool { - return strcmp(normalized_key.c_str(), opt.name) == 0; - }); - return config_options->end() == opt_it ? nullptr : &(*opt_it); + assert(ret == 0); } -int md_config_t::set_val(const char *key, const char *val, bool meta) +int md_config_t::set_val(const std::string &key, const char *val, + bool meta, std::stringstream *err_ss) { Mutex::Locker l(lock); - if (!key) + if (key.empty()) { + if (err_ss) *err_ss << "No key specified"; return -EINVAL; - if (!val) + } + if (!val) { return -EINVAL; + } std::string v(val); if (meta) @@ -819,97 +795,101 @@ // subsystems? if (strncmp(k.c_str(), "debug_", 6) == 0) { - for (int o = 0; o < subsys.get_num(); o++) { + for (size_t o = 0; o < subsys.get_num(); o++) { std::string as_option = "debug_" + subsys.get_name(o); if (k == as_option) { int log, gather; int r = sscanf(v.c_str(), "%d/%d", &log, &gather); if (r >= 1) { - if (r < 2) + if (r < 2) { gather = log; - // cout << "subsys " << subsys.get_name(o) << " log " << log << " gather " << gather << std::endl; + } subsys.set_log_level(o, log); subsys.set_gather_level(o, gather); + if (err_ss) *err_ss << "Set " << k << " to " << log << "/" << gather; return 0; } + if (err_ss) { + *err_ss << "Invalid debug level, should be or /"; + } return -EINVAL; } } } - config_option const *opt = find_config_option(k); - if (opt) { - if ((!opt->is_safe()) && internal_safe_to_start_threads) { + const auto &opt_iter = schema.find(k); + if (opt_iter != schema.end()) { + const Option &opt = opt_iter->second; + if ((!opt.is_safe()) && internal_safe_to_start_threads) { // If threads have been started and the option is not thread safe - if (observers.find(opt->name) == observers.end()) { + if (observers.find(opt.name) == observers.end()) { // And there is no observer to safely change it... // You lose. 
+ if (err_ss) *err_ss << "Configuration option '" << key << "' may " + "not be modified at runtime"; return -ENOSYS; } } std::string error_message; int r = set_val_impl(v, opt, &error_message); + if (r == 0) { + if (err_ss) *err_ss << "Set " << opt.name << " to " << v; + } else { + if (err_ss) *err_ss << error_message; + } return r; } - // couldn't find a configuration option with key 'key' + if (err_ss) *err_ss << "Configuration option not found: '" << key << "'"; return -ENOENT; } -int md_config_t::get_val(const char *key, char **buf, int len) const +int md_config_t::get_val(const std::string &key, char **buf, int len) const { Mutex::Locker l(lock); return _get_val(key, buf,len); } -md_config_t::config_value_t md_config_t::get_val_generic(const char *key) const +Option::value_t md_config_t::get_val_generic(const std::string &key) const { Mutex::Locker l(lock); return _get_val(key); } -class get_value_generic_visitor : public boost::static_visitor { - md_config_t const *conf; -public: - explicit get_value_generic_visitor(md_config_t const *conf_) : conf(conf_) { } - template md_config_t::config_value_t operator()(const T md_config_t::* member_ptr) { - return md_config_t::config_value_t(conf->*member_ptr); - } -}; - -md_config_t::config_value_t md_config_t::_get_val(const char *key) const +Option::value_t md_config_t::_get_val(const std::string &key) const { assert(lock.is_locked()); - if (!key) - return config_value_t(invalid_config_value_t()); + if (key.empty()) { + return Option::value_t(boost::blank()); + } // In key names, leading and trailing whitespace are not significant. string k(ConfFile::normalize_key_name(key)); - config_option const *opt = find_config_option(k); - if (!opt) { - return config_value_t(invalid_config_value_t()); + const auto &opt_iter = schema.find(k); + if (opt_iter != schema.end()) { + // Using .at() is safe because all keys in the schema always have + // entries in ::values + return values.at(k); + } else { + return Option::value_t(boost::blank()); } - get_value_generic_visitor gvv(this); - return boost::apply_visitor(gvv, opt->md_member_ptr); } -int md_config_t::_get_val(const char *key, std::string *value) const { +int md_config_t::_get_val(const std::string &key, std::string *value) const { assert(lock.is_locked()); std::string normalized_key(ConfFile::normalize_key_name(key)); - config_value_t config_value = _get_val(normalized_key.c_str()); - if (!boost::get(&config_value)) { + Option::value_t config_value = _get_val(normalized_key.c_str()); + if (!boost::get(&config_value)) { ostringstream oss; if (bool *flag = boost::get(&config_value)) { oss << (*flag ? "true" : "false"); - } else if (float *fp = boost::get(&config_value)) { - oss << std::fixed << *fp ; } else if (double *dp = boost::get(&config_value)) { - oss << std::fixed << *dp ; + oss << std::fixed << *dp; } else { oss << config_value; } @@ -919,15 +899,15 @@ return -ENOENT; } -int md_config_t::_get_val(const char *key, char **buf, int len) const +int md_config_t::_get_val(const std::string &key, char **buf, int len) const { assert(lock.is_locked()); - if (!key) + if (key.empty()) return -EINVAL; string val ; - if (!_get_val(key, &val)) { + if (_get_val(key, &val) == 0) { int l = val.length() + 1; if (len == -1) { *buf = (char*)malloc(l); @@ -942,7 +922,7 @@ string k(ConfFile::normalize_key_name(key)); // subsys? 
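set_val() above now takes std::string keys and an optional std::stringstream that receives a human-readable outcome ("Set <key> to <value>", "Invalid debug level ...", "Configuration option not found: ..."), rather than leaving callers with only the errno-style return code. A usage sketch, buildable only inside the Ceph tree; the md_config_t pointer is assumed to come from an existing CephContext, and "debug_ms" is just an example of the debug_<subsys> form handled explicitly above:

    // Usage sketch only; requires the Ceph tree (common/config.h) to build.
    #include <iostream>
    #include <sstream>
    #include "common/config.h"

    void try_set(md_config_t *conf) {
      std::stringstream ss;
      int r = conf->set_val("debug_ms", "1/5", true, &ss);  // optional err_ss added above
      if (r < 0)
        std::cerr << "set_val failed: " << ss.str() << std::endl;  // e.g. "Configuration option not found: ..."
      else
        std::cout << ss.str() << std::endl;                        // e.g. "Set debug_ms to 1/5"
    }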
- for (int o = 0; o < subsys.get_num(); o++) { + for (size_t o = 0; o < subsys.get_num(); o++) { std::string as_option = "debug_" + subsys.get_name(o); if (k == as_option) { if (len == -1) { @@ -962,14 +942,15 @@ const std::string negative_flag_prefix("no_"); keys->clear(); - keys->reserve(config_options->size()); - for (auto& opt: *config_options) { + keys->reserve(schema.size()); + for (const auto &i: schema) { + const Option &opt = i.second; keys->push_back(opt.name); - if (opt.type == OPT_BOOL) { + if (opt.type == Option::TYPE_BOOL) { keys->push_back(negative_flag_prefix + opt.name); } } - for (int i = 0; i < subsys.get_num(); ++i) { + for (size_t i = 0; i < subsys.get_num(); ++i) { keys->push_back("debug_" + subsys.get_name(i)); } } @@ -1007,14 +988,14 @@ } int md_config_t::get_val_from_conf_file(const std::vector §ions, - const char *key, std::string &out, bool emeta) const + const std::string &key, std::string &out, bool emeta) const { Mutex::Locker l(lock); return _get_val_from_conf_file(sections, key, out, emeta); } int md_config_t::_get_val_from_conf_file(const std::vector §ions, - const char *key, std::string &out, bool emeta) const + const std::string &key, std::string &out, bool emeta) const { assert(lock.is_locked()); std::vector ::const_iterator s = sections.begin(); @@ -1032,117 +1013,122 @@ return -ENOENT; } -int md_config_t::set_val_impl(const std::string &val, config_option const *opt, +int md_config_t::set_val_impl(const std::string &raw_val, const Option &opt, std::string *error_message) { assert(lock.is_locked()); - std::string value(val); - if (opt->validator) { - int r = opt->validator(&value, error_message); - if (r < 0) { - return r; - } - } - - int ret = set_val_raw(value.c_str(), opt); - if (ret) - return ret; - changed.insert(opt->name); - return 0; -} -template struct strtox_helper; + std::string val = raw_val; -template<> struct strtox_helper { - static inline void apply(const char *val, float &x, std::string &err) { - x = strict_strtof(val, &err); - } -}; - -template<> struct strtox_helper { - static inline void apply(const char *val, double &x, std::string &err) { - x = strict_strtod(val, &err); - } -}; - -template static inline int strict_strtox(const char *val, T &x) { - std::string err; - strtox_helper::apply(val, x, err); - return err.empty() ? 0 : -EINVAL; -} - -class set_value_visitor : public boost::static_visitor { - md_config_t const *conf; - const char *val; -public: - explicit set_value_visitor(md_config_t const *conf_, const char *val_) : - conf(conf_), val(val_) { } - - int operator()(const std::string md_config_t::* member_ptr) { - auto *ptr = const_cast(&(conf->*member_ptr)); - *ptr = val ? 
val : ""; - return 0; + int r = opt.pre_validate(&val, error_message); + if (r != 0) { + return r; } - int operator()(const bool md_config_t::* member_ptr) { - bool *ptr = const_cast(&(conf->*member_ptr)); - if (strcasecmp(val, "false") == 0) { - *ptr = false; - } else if (strcasecmp(val, "true") == 0) { - *ptr = true; + Option::value_t new_value; + if (opt.type == Option::TYPE_INT) { + int64_t f = strict_si_cast(val.c_str(), error_message); + if (!error_message->empty()) { + return -EINVAL; + } + new_value = f; + } else if (opt.type == Option::TYPE_UINT) { + uint64_t f = strict_si_cast(val.c_str(), error_message); + if (!error_message->empty()) { + return -EINVAL; + } + new_value = f; + } else if (opt.type == Option::TYPE_STR) { + new_value = val; + } else if (opt.type == Option::TYPE_FLOAT) { + double f = strict_strtod(val.c_str(), error_message); + if (!error_message->empty()) { + return -EINVAL; } else { - std::string err; - int b = strict_strtol(val, 10, &err); - if (!err.empty()) { + new_value = f; + } + } else if (opt.type == Option::TYPE_BOOL) { + if (strcasecmp(val.c_str(), "false") == 0) { + new_value = false; + } else if (strcasecmp(val.c_str(), "true") == 0) { + new_value = true; + } else { + int b = strict_strtol(val.c_str(), 10, error_message); + if (!error_message->empty()) { return -EINVAL; } - *ptr = !!b; + new_value = !!b; } - return 0; + } else if (opt.type == Option::TYPE_ADDR) { + entity_addr_t addr; + if (!addr.parse(val.c_str())){ + return -EINVAL; + } + new_value = addr; + } else if (opt.type == Option::TYPE_UUID) { + uuid_d uuid; + if (!uuid.parse(val.c_str())) { + return -EINVAL; + } + new_value = uuid; + } else { + ceph_abort(); } - - // type has parse() member function - template, int>::type = 0> - int operator()(const T md_config_t::* member_ptr) { - T *obj = const_cast(&(conf->*member_ptr)); - if (!obj->parse(val)) { - return -EINVAL; - } - return 0; - } - // float, double - template, int>::type = 0> - int operator()(const T md_config_t::* member_ptr) { - T* ptr = const_cast(&(conf->*member_ptr)); - return strict_strtox(val, *ptr); - } - - // integers - template::value && - !boost::is_same::value, int>::type = 0> - int operator()(const T md_config_t::* member_ptr) { - std::string err; - T f = strict_si_cast(val, &err); - if (!err.empty()) { - return -EINVAL; - } - T *ptr = const_cast(&(conf->*member_ptr)); - *ptr = f; - return 0; + r = opt.validate(new_value, error_message); + if (r != 0) { + return r; + } + + + // Apply the value to its entry in the `values` map + values[opt.name] = new_value; + + // Apply the value to its legacy field, if it has one + auto legacy_ptr_iter = legacy_values.find(std::string(opt.name)); + if (legacy_ptr_iter != legacy_values.end()) { + update_legacy_val(opt, legacy_ptr_iter->second); + } + + changed.insert(opt.name); + return 0; +} + +/** + * Handles assigning from a variant-of-types to a variant-of-pointers-to-types + */ +class assign_visitor : public boost::static_visitor<> +{ + md_config_t *conf; + Option::value_t val; + public: + + assign_visitor(md_config_t *conf_, Option::value_t val_) + : conf(conf_), val(val_) + {} + + template + void operator()( T md_config_t::* ptr) const + { + T *member = const_cast(&(conf->*(boost::get(ptr)))); + + *member = boost::get(val); } }; -int md_config_t::set_val_raw(const char *val, config_option const *opt) +void md_config_t::update_legacy_val(const Option &opt, + md_config_t::member_ptr_t member_ptr) { - assert(lock.is_locked()); - set_value_visitor svv(this, val); - return 
boost::apply_visitor(svv, opt->md_member_ptr); + if (boost::get(&values.at(opt.name))) { + // This shouldn't happen, but if it does then just don't even + // try to assign to the legacy field. + return; + } + + boost::apply_visitor(assign_visitor(this, values.at(opt.name)), member_ptr); } + static const char *CONF_METAVARIABLES[] = { "data_dir", // put this first: it may contain some of the others "cluster", "type", "name", "host", "num", "id", "pid", "cctid" @@ -1154,27 +1140,29 @@ { // Expand all metavariables ostringstream oss; - for (auto& opt: *config_options) { - std::string *str; - opt.conf_ptr(str, this); - if (str) { - list stack; + for (const auto &i : schema) { + const Option &opt = i.second; + + if (opt.type == Option::TYPE_STR) { + list stack; + std::string *str = boost::get(&(values.at(opt.name))); + assert(str != nullptr); // Non-string values should never get in expand_meta(*str, &opt, stack, &oss); } } cerr << oss.str(); } -bool md_config_t::expand_meta(std::string &origval, +bool md_config_t::expand_meta(std::string &val, std::ostream *oss) const { - list stack; - return expand_meta(origval, NULL, stack, oss); + list stack; + return expand_meta(val, NULL, stack, oss); } bool md_config_t::expand_meta(std::string &origval, - config_option const *opt, - std::list stack, + const Option *opt, + std::list stack, std::ostream *oss) const { assert(lock.is_locked()); @@ -1186,25 +1174,22 @@ // ignore an expansion loop and create a human readable // message about it if (opt) { - for (list::iterator i = stack.begin(); - i != stack.end(); - ++i) { - if (strcmp(opt->name, (*i)->name) == 0) { + for (const auto stack_ptr : stack) { + if (opt->name == stack_ptr->name) { *oss << "variable expansion loop at " << opt->name << "=" << origval << std::endl; *oss << "expansion stack: " << std::endl; - for (list::iterator j = stack.begin(); - j != stack.end(); - ++j) { - *oss << (*j)->name << "=" << *((*j)->conf_ptr(this)) << std::endl; + for (const auto j : stack) { + std::string val; + _get_val(j->name, &val); + *oss << j->name << "=" << val << std::endl; } return false; } } - } - if (opt) stack.push_front(opt); + } bool found_meta = false; string out; @@ -1285,22 +1270,25 @@ if (!expanded) { // config option? - for (auto& opt: *config_options) { - if (var == opt.name) { - string *origval; - opt.conf_ptr(origval, const_cast(this)); - if (origval) { - expand_meta(*origval, &opt, stack, oss); - out += *origval; - } else { - char *vv = NULL; - _get_val(opt.name, &vv, -1); - out += vv; - free(vv); - } - expanded = true; - break; - } + const auto other_opt_iter = schema.find(var); + if (other_opt_iter != schema.end()) { + const Option &other_opt = other_opt_iter->second; + if (other_opt.type == Option::TYPE_STR) { + // The referenced option is a string, it may need substitution + // before inserting. + Option::value_t *other_val_ptr = const_cast(&(values.at(other_opt.name))); + std::string *other_opt_val = boost::get(other_val_ptr); + expand_meta(*other_opt_val, &other_opt, stack, oss); + out += *other_opt_val; + } else { + // The referenced option is not a string: retrieve and insert + // its stringized form. 
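The assign_visitor / update_legacy_val pair above bridges two variants at once: Option::value_t, a variant of values, and member_ptr_t, a variant of pointers to md_config_t members. The visitor runs over the member-pointer variant, and boost::get then pulls the same-typed alternative out of the value variant. A standalone miniature of that technique; the conf_t struct and its fields are invented for the example:

    #include <iostream>
    #include <string>
    #include <boost/variant.hpp>

    struct conf_t {                     // stand-in for md_config_t's legacy members
      int num = 0;
      std::string text;
    };

    using value_t = boost::variant<int, std::string>;                          // like Option::value_t
    using member_ptr_t = boost::variant<int conf_t::*, std::string conf_t::*>; // like member_ptr_t

    // Same shape as assign_visitor in config.cc: visit the member-pointer
    // variant, then extract the matching alternative from the value variant.
    class assign_visitor : public boost::static_visitor<> {
      conf_t *conf;
      value_t val;
    public:
      assign_visitor(conf_t *c, value_t v) : conf(c), val(std::move(v)) {}
      template <typename T>
      void operator()(T conf_t::*ptr) const {
        conf->*ptr = boost::get<T>(val);   // throws bad_get if the types disagree
      }
    };

    int main() {
      conf_t c;
      member_ptr_t num_member = &conf_t::num;
      member_ptr_t text_member = &conf_t::text;
      boost::apply_visitor(assign_visitor(&c, value_t(42)), num_member);
      boost::apply_visitor(assign_visitor(&c, value_t(std::string("hello"))), text_member);
      std::cout << c.num << " " << c.text << "\n";   // 42 hello
    }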
+ char *vv = NULL; + _get_val(other_opt.name, &vv, -1); + out += vv; + free(vv); + } + expanded = true; } } } @@ -1341,7 +1329,8 @@ char local_buf[4096]; char other_buf[4096]; - for (auto& opt : *config_options) { + for (const auto &i : schema) { + const Option &opt = i.second; if (!setting.empty()) { if (setting != opt.name) { continue; @@ -1378,18 +1367,3 @@ ::complain_about_parse_errors(cct, &parse_errors); } -void md_config_t::validate_default_settings() { - Mutex::Locker l(lock); - for (auto &opt : *config_options) { - // normalize config defaults using their validator - if (opt.validator) { - std::string value; - int r = _get_val(opt.name, &value); - assert(r == 0); - - std::string error_message; - r = set_val_impl(value.c_str(), &opt, &error_message); - assert(r == 0); - } - } -} diff -Nru ceph-12.1.1/src/common/config.h ceph-12.1.2/src/common/config.h --- ceph-12.1.1/src/common/config.h 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/common/config.h 2017-08-01 17:55:40.000000000 +0000 @@ -17,9 +17,11 @@ #include "common/ConfUtils.h" #include "common/entity_name.h" +#include "common/code_environment.h" #include "common/Mutex.h" #include "log/SubsystemMap.h" #include "common/config_obs.h" +#include "common/options.h" #define OSD_REP_PRIMARY 0 #define OSD_REP_SPLAY 1 @@ -64,6 +66,14 @@ */ struct md_config_t { public: + typedef boost::variant member_ptr_t; + /* Maps configuration options to the observer listening for them. */ typedef std::multimap obs_map_t; @@ -71,83 +81,29 @@ * apply_changes */ typedef std::set < std::string > changed_set_t; - struct invalid_config_value_t { }; - typedef boost::variant config_value_t; - typedef boost::variant member_ptr_t; + /* + * Mapping from legacy config option names to class members + */ + std::map legacy_values; + + /** + * The configuration schema, in the form of Option objects describing + * possible settings. + */ + std::map schema; + + /** + * The current values of all settings described by the schema + */ + std::map values; typedef enum { OPT_INT, OPT_LONGLONG, OPT_STR, OPT_DOUBLE, OPT_FLOAT, OPT_BOOL, OPT_ADDR, OPT_U32, OPT_U64, OPT_UUID } opt_type_t; - typedef std::function validator_t; - - class config_option { - public: - const char *name; - opt_type_t type; - md_config_t::member_ptr_t md_member_ptr; - bool safe; // promise to access it only via md_config_t::get_val - validator_t validator; - private: - template struct get_typed_pointer_visitor : public boost::static_visitor { - md_config_t const *conf; - explicit get_typed_pointer_visitor(md_config_t const *conf_) : conf(conf_) { } - template, int>::type = 0> - T const *operator()(const U md_config_t::* member_ptr) { - return &(conf->*member_ptr); - } - template::value, int>::type = 0> - T const *operator()(const U md_config_t::* member_ptr) { - return nullptr; - } - }; - public: - // is it OK to alter the value when threads are running? - bool is_safe() const; - // Given a configuration, return a pointer to this option inside - // that configuration. 
- template void conf_ptr(T const *&ptr, md_config_t const *conf) const { - get_typed_pointer_visitor gtpv(conf); - ptr = boost::apply_visitor(gtpv, md_member_ptr); - } - template void conf_ptr(T *&ptr, md_config_t *conf) const { - get_typed_pointer_visitor gtpv(conf); - ptr = const_cast(boost::apply_visitor(gtpv, md_member_ptr)); - } - template T const *conf_ptr(md_config_t const *conf) const { - get_typed_pointer_visitor gtpv(conf); - return boost::apply_visitor(gtpv, md_member_ptr); - } - template T *conf_ptr(md_config_t *conf) const { - get_typed_pointer_visitor gtpv(conf); - return const_cast(boost::apply_visitor(gtpv, md_member_ptr)); - } - }; - // Create a new md_config_t structure. - md_config_t(); + md_config_t(bool is_daemon=false); ~md_config_t(); // Adds a new observer to this configuration. You can do this at any time, @@ -188,21 +144,24 @@ // Set a configuration value, or crash // Metavariables will be expanded. - void set_val_or_die(const char *key, const char *val); + void set_val_or_die(const std::string &key, const std::string &val, + bool meta=true); // Set a configuration value. // Metavariables will be expanded. - int set_val(const char *key, const char *val, bool meta=true); - int set_val(const char *key, const string& s, bool meta=true) { - return set_val(key, s.c_str(), meta); + int set_val(const std::string &key, const char *val, bool meta=true, + std::stringstream *err_ss=nullptr); + int set_val(const std::string &key, const string& s, bool meta=true, + std::stringstream *err_ss=nullptr) { + return set_val(key, s.c_str(), meta, err_ss); } // Get a configuration value. // No metavariables will be returned (they will have already been expanded) - int get_val(const char *key, char **buf, int len) const; - int _get_val(const char *key, char **buf, int len) const; - config_value_t get_val_generic(const char *key) const; - template T get_val(const char *key) const; + int get_val(const std::string &key, char **buf, int len) const; + int _get_val(const std::string &key, char **buf, int len) const; + Option::value_t get_val_generic(const std::string &key) const; + template T get_val(const std::string &key) const; void get_all_keys(std::vector *keys) const; @@ -215,7 +174,7 @@ // Get a value from the configuration file that we read earlier. // Metavariables will be expanded if emeta is true. 
int get_val_from_conf_file(const std::vector §ions, - const char *key, std::string &out, bool emeta) const; + std::string const &key, std::string &out, bool emeta) const; /// dump all config values to a stream void show_config(std::ostream& out); @@ -236,16 +195,17 @@ void complain_about_parse_errors(CephContext *cct); private: + void validate_schema(); void validate_default_settings(); - int _get_val(const char *key, std::string *value) const; - config_value_t _get_val(const char *key) const; + int _get_val(const std::string &key, std::string *value) const; + Option::value_t _get_val(const std::string &key) const; void _show_config(std::ostream *out, Formatter *f); void _get_my_sections(std::vector §ions) const; int _get_val_from_conf_file(const std::vector §ions, - const char *key, std::string &out, bool emeta) const; + const std::string &key, std::string &out, bool emeta) const; int parse_option(std::vector& args, std::vector::iterator& i, @@ -255,9 +215,15 @@ int parse_config_files_impl(const std::list &conf_files, std::ostream *warnings); - int set_val_impl(const std::string &val, config_option const *opt, + int set_val_impl(const std::string &val, const Option &opt, std::string *error_message); - int set_val_raw(const char *val, config_option const *opt); + + template + void assign_member(member_ptr_t ptr, const Option::value_t &val); + + + void update_legacy_val(const Option &opt, + md_config_t::member_ptr_t member); void init_subsys(); @@ -276,8 +242,8 @@ } private: bool expand_meta(std::string &val, - config_option const *opt, - std::list stack, + const Option *opt, + std::list stack, std::ostream *oss) const; /// expand all metavariables in config structure. @@ -301,29 +267,28 @@ /// cluster name string cluster; -#define OPTION_OPT_INT(name) const int name; -#define OPTION_OPT_LONGLONG(name) const long long name; -#define OPTION_OPT_STR(name) const std::string name; -#define OPTION_OPT_DOUBLE(name) const double name; -#define OPTION_OPT_FLOAT(name) const float name; -#define OPTION_OPT_BOOL(name) const bool name; -#define OPTION_OPT_ADDR(name) const entity_addr_t name; -#define OPTION_OPT_U32(name) const uint32_t name; -#define OPTION_OPT_U64(name) const uint64_t name; -#define OPTION_OPT_UUID(name) const uuid_d name; -#define OPTION(name, ty, init) \ +// This macro block defines C members of the md_config_t struct +// corresponding to the definitions in legacy_config_opts.h. +// These C members are consumed by code that was written before +// the new options.cc infrastructure: all newer code should +// be consume options via explicit get() rather than C members. 
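The comment block above is the migration note that matters in config.h: the macro-generated C members stay only for pre-existing call sites, and newer code is expected to go through the typed accessor. A usage sketch, again buildable only inside the Ceph tree; public_network is one of the legacy string options, and the helper function name is made up:

    // Usage sketch only; requires the Ceph tree headers to build and assumes a
    // CephContext* obtained in the usual way.
    #include <cassert>
    #include <string>
    #include "common/ceph_context.h"
    #include "common/config.h"

    std::string get_public_network(CephContext *cct) {
      // New-style, schema-driven access: typed lookup by option name.
      std::string net = cct->_conf->get_val<std::string>("public_network");
      // The macro-generated legacy member still exists and is kept in sync,
      // but per the comment above it is only there for older call sites.
      assert(net == cct->_conf->public_network);
      return net;
    }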
+#define OPTION_OPT_INT(name) int64_t name; +#define OPTION_OPT_LONGLONG(name) int64_t name; +#define OPTION_OPT_STR(name) std::string name; +#define OPTION_OPT_DOUBLE(name) double name; +#define OPTION_OPT_FLOAT(name) double name; +#define OPTION_OPT_BOOL(name) bool name; +#define OPTION_OPT_ADDR(name) entity_addr_t name; +#define OPTION_OPT_U32(name) uint64_t name; +#define OPTION_OPT_U64(name) uint64_t name; +#define OPTION_OPT_UUID(name) uuid_d name; +#define OPTION(name, ty) \ public: \ - OPTION_##ty(name) \ - struct option_##name##_t; -#define OPTION_VALIDATOR(name) -#define SAFE_OPTION(name, ty, init) \ + OPTION_##ty(name) +#define SAFE_OPTION(name, ty) \ protected: \ - OPTION_##ty(name) \ - public: \ - struct option_##name##_t; -#define SUBSYS(name, log, gather) -#define DEFAULT_SUBSYS(log, gather) -#include "common/config_opts.h" + OPTION_##ty(name) +#include "common/legacy_config_opts.h" #undef OPTION_OPT_INT #undef OPTION_OPT_LONGLONG #undef OPTION_OPT_STR @@ -335,11 +300,9 @@ #undef OPTION_OPT_U64 #undef OPTION_OPT_UUID #undef OPTION -#undef OPTION_VALIDATOR #undef SAFE_OPTION -#undef SUBSYS -#undef DEFAULT_SUBSYS +public: unsigned get_osd_pool_default_min_size() const { return osd_pool_default_min_size ? MIN(osd_pool_default_min_size, osd_pool_default_size) : @@ -353,11 +316,6 @@ mutable Mutex lock; friend class test_md_config_t; -protected: - // Tests and possibly users expect options to appear in the output - // of ceph-conf in the same order as declared in config_opts.h - std::shared_ptr> config_options; - config_option const *find_config_option(const std::string& normalized_key) const; }; template @@ -374,35 +332,26 @@ } }; -template T md_config_t::get_val(const char *key) const { - config_value_t generic_val = this->get_val_generic(key); +template T md_config_t::get_val(const std::string &key) const { + Option::value_t generic_val = this->get_val_generic(key); get_typed_value_visitor gtv; return boost::apply_visitor(gtv, generic_val); } -inline std::ostream& operator<<(std::ostream& o, const md_config_t::invalid_config_value_t& ) { +inline std::ostream& operator<<(std::ostream& o, const boost::blank& ) { return o << "INVALID_CONFIG_VALUE"; } int ceph_resolve_file_search(const std::string& filename_list, std::string& result); -typedef md_config_t::config_option config_option; - - enum config_subsys_id { ceph_subsys_, // default -#define OPTION(a,b,c) -#define OPTION_VALIDATOR(name) -#define SAFE_OPTION(a,b,c) #define SUBSYS(name, log, gather) \ ceph_subsys_##name, #define DEFAULT_SUBSYS(log, gather) -#include "common/config_opts.h" +#include "common/subsys.h" #undef SUBSYS -#undef OPTION -#undef OPTION_VALIDATOR -#undef SAFE_OPTION #undef DEFAULT_SUBSYS ceph_subsys_max }; diff -Nru ceph-12.1.1/src/common/config_opts.h ceph-12.1.2/src/common/config_opts.h --- ceph-12.1.1/src/common/config_opts.h 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/common/config_opts.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,1800 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -/* note: no header guard */ -OPTION(host, OPT_STR, "") // "" means that ceph will use short hostname -OPTION(fsid, OPT_UUID, uuid_d()) -OPTION(public_addr, OPT_ADDR, entity_addr_t()) -OPTION(public_bind_addr, OPT_ADDR, entity_addr_t()) -OPTION(cluster_addr, OPT_ADDR, entity_addr_t()) -OPTION(public_network, OPT_STR, "") -OPTION(cluster_network, OPT_STR, "") -OPTION(num_client, OPT_INT, 1) -OPTION(monmap, OPT_STR, "") -OPTION(mon_host, OPT_STR, "") -OPTION(mon_dns_srv_name, OPT_STR, "ceph-mon") -OPTION(lockdep, OPT_BOOL, false) -OPTION(lockdep_force_backtrace, OPT_BOOL, false) // always gather current backtrace at every lock -OPTION(run_dir, OPT_STR, "/var/run/ceph") // the "/var/run/ceph" dir, created on daemon startup -OPTION(admin_socket, OPT_STR, "$run_dir/$cluster-$name.asok") // default changed by common_preinit() -OPTION(admin_socket_mode, OPT_STR, "") // permission bits to set for admin socket file, e.g., "0775", "0755" - -OPTION(daemonize, OPT_BOOL, false) // default changed by common_preinit() -OPTION(setuser, OPT_STR, "") // uid or user name -OPTION(setgroup, OPT_STR, "") // gid or group name -OPTION(setuser_match_path, OPT_STR, "") // make setuser/group conditional on this path matching ownership -OPTION(pid_file, OPT_STR, "") // default changed by common_preinit() -OPTION(chdir, OPT_STR, "/") -OPTION(max_open_files, OPT_LONGLONG, 0) -OPTION(restapi_log_level, OPT_STR, "") // default set by Python code -OPTION(restapi_base_url, OPT_STR, "") // " -OPTION(fatal_signal_handlers, OPT_BOOL, true) -SAFE_OPTION(erasure_code_dir, OPT_STR, CEPH_PKGLIBDIR"/erasure-code") // default location for erasure-code plugins - -OPTION(log_file, OPT_STR, "/var/log/ceph/$cluster-$name.log") // default changed by common_preinit() -OPTION(log_max_new, OPT_INT, 1000) // default changed by common_preinit() -OPTION(log_max_recent, OPT_INT, 10000) // default changed by common_preinit() -OPTION(log_to_stderr, OPT_BOOL, true) // default changed by common_preinit() -OPTION(err_to_stderr, OPT_BOOL, true) // default changed by common_preinit() -OPTION(log_to_syslog, OPT_BOOL, false) -OPTION(err_to_syslog, OPT_BOOL, false) -OPTION(log_flush_on_exit, OPT_BOOL, true) // default changed by common_preinit() -OPTION(log_stop_at_utilization, OPT_FLOAT, .97) // stop logging at (near) full -OPTION(log_to_graylog, OPT_BOOL, false) -OPTION(err_to_graylog, OPT_BOOL, false) -OPTION(log_graylog_host, OPT_STR, "127.0.0.1") -OPTION(log_graylog_port, OPT_INT, 12201) - -// options will take k/v pairs, or single-item that will be assumed as general -// default for all, regardless of channel. 
-// e.g., "info" would be taken as the same as "default=info" -// also, "default=daemon audit=local0" would mean -// "default all to 'daemon', override 'audit' with 'local0' -OPTION(clog_to_monitors, OPT_STR, "default=true") -OPTION(clog_to_syslog, OPT_STR, "false") -OPTION(clog_to_syslog_level, OPT_STR, "info") // this level and above -OPTION(clog_to_syslog_facility, OPT_STR, "default=daemon audit=local0") -OPTION(clog_to_graylog, OPT_STR, "false") -OPTION(clog_to_graylog_host, OPT_STR, "127.0.0.1") -OPTION(clog_to_graylog_port, OPT_STR, "12201") - -OPTION(mon_cluster_log_to_syslog, OPT_STR, "default=false") -OPTION(mon_cluster_log_to_syslog_level, OPT_STR, "info") // this level and above -OPTION(mon_cluster_log_to_syslog_facility, OPT_STR, "daemon") -OPTION(mon_cluster_log_file, OPT_STR, - "default=/var/log/ceph/$cluster.$channel.log cluster=/var/log/ceph/$cluster.log") -OPTION(mon_cluster_log_file_level, OPT_STR, "info") -OPTION(mon_cluster_log_to_graylog, OPT_STR, "false") -OPTION(mon_cluster_log_to_graylog_host, OPT_STR, "127.0.0.1") -OPTION(mon_cluster_log_to_graylog_port, OPT_STR, "12201") - -OPTION(enable_experimental_unrecoverable_data_corrupting_features, OPT_STR, "") - -SAFE_OPTION(plugin_dir, OPT_STR, CEPH_PKGLIBDIR) - -OPTION(xio_trace_mempool, OPT_BOOL, false) // mempool allocation counters -OPTION(xio_trace_msgcnt, OPT_BOOL, false) // incoming/outgoing msg counters -OPTION(xio_trace_xcon, OPT_BOOL, false) // Xio message encode/decode trace -OPTION(xio_queue_depth, OPT_INT, 128) // depth of Accelio msg queue -OPTION(xio_mp_min, OPT_INT, 128) // default min mempool size -OPTION(xio_mp_max_64, OPT_INT, 65536) // max 64-byte chunks (buffer is 40) -OPTION(xio_mp_max_256, OPT_INT, 8192) // max 256-byte chunks -OPTION(xio_mp_max_1k, OPT_INT, 8192) // max 1K chunks -OPTION(xio_mp_max_page, OPT_INT, 4096) // max 1K chunks -OPTION(xio_mp_max_hint, OPT_INT, 4096) // max size-hint chunks -OPTION(xio_portal_threads, OPT_INT, 2) // xio portal threads per messenger -OPTION(xio_max_conns_per_portal, OPT_INT, 32) // max xio_connections per portal/ctx -OPTION(xio_transport_type, OPT_STR, "rdma") // xio transport type: {rdma or tcp} -OPTION(xio_max_send_inline, OPT_INT, 512) // xio maximum threshold to send inline - -OPTION(compressor_zlib_isal, OPT_BOOL, false) -OPTION(compressor_zlib_level, OPT_INT, 5) //regular zlib compression level, not applicable to isa-l optimized version - -OPTION(async_compressor_enabled, OPT_BOOL, false) -OPTION(async_compressor_type, OPT_STR, "snappy") -OPTION(async_compressor_threads, OPT_INT, 2) -OPTION(async_compressor_thread_timeout, OPT_INT, 5) -OPTION(async_compressor_thread_suicide_timeout, OPT_INT, 30) - -OPTION(plugin_crypto_accelerator, OPT_STR, "crypto_isal") - -OPTION(mempool_debug, OPT_BOOL, false) - -DEFAULT_SUBSYS(0, 5) -SUBSYS(lockdep, 0, 1) -SUBSYS(context, 0, 1) -SUBSYS(crush, 1, 1) -SUBSYS(mds, 1, 5) -SUBSYS(mds_balancer, 1, 5) -SUBSYS(mds_locker, 1, 5) -SUBSYS(mds_log, 1, 5) -SUBSYS(mds_log_expire, 1, 5) -SUBSYS(mds_migrator, 1, 5) -SUBSYS(buffer, 0, 1) -SUBSYS(timer, 0, 1) -SUBSYS(filer, 0, 1) -SUBSYS(striper, 0, 1) -SUBSYS(objecter, 0, 1) -SUBSYS(rados, 0, 5) -SUBSYS(rbd, 0, 5) -SUBSYS(rbd_mirror, 0, 5) -SUBSYS(rbd_replay, 0, 5) -SUBSYS(journaler, 0, 5) -SUBSYS(objectcacher, 0, 5) -SUBSYS(client, 0, 5) -SUBSYS(osd, 1, 5) -SUBSYS(optracker, 0, 5) -SUBSYS(objclass, 0, 5) -SUBSYS(filestore, 1, 3) -SUBSYS(journal, 1, 3) -SUBSYS(ms, 0, 5) -SUBSYS(mon, 1, 5) -SUBSYS(monc, 0, 10) -SUBSYS(paxos, 1, 5) -SUBSYS(tp, 0, 5) -SUBSYS(auth, 1, 5) 
-SUBSYS(crypto, 1, 5) -SUBSYS(finisher, 1, 1) -SUBSYS(heartbeatmap, 1, 5) -SUBSYS(perfcounter, 1, 5) -SUBSYS(rgw, 1, 5) // log level for the Rados gateway -SUBSYS(civetweb, 1, 10) -SUBSYS(javaclient, 1, 5) -SUBSYS(asok, 1, 5) -SUBSYS(throttle, 1, 1) -SUBSYS(refs, 0, 0) -SUBSYS(xio, 1, 5) -SUBSYS(compressor, 1, 5) -SUBSYS(bluestore, 1, 5) -SUBSYS(bluefs, 1, 5) -SUBSYS(bdev, 1, 3) -SUBSYS(kstore, 1, 5) -SUBSYS(rocksdb, 4, 5) -SUBSYS(leveldb, 4, 5) -SUBSYS(memdb, 4, 5) -SUBSYS(kinetic, 1, 5) -SUBSYS(fuse, 1, 5) -SUBSYS(mgr, 1, 5) -SUBSYS(mgrc, 1, 5) -SUBSYS(dpdk, 1, 5) -SUBSYS(eventtrace, 1, 5) - -OPTION(key, OPT_STR, "") -OPTION(keyfile, OPT_STR, "") -OPTION(keyring, OPT_STR, - // default changed by common_preinit() for mds and osd - "/etc/ceph/$cluster.$name.keyring,/etc/ceph/$cluster.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin," -#if defined(__FreeBSD) - "/usr/local/etc/ceph/$cluster.$name.keyring,/usr/local/etc/ceph/$cluster.keyring," - "/usr/local/etc/ceph/keyring,/usr/local/etc/ceph/keyring.bin," -#endif - ) -OPTION(heartbeat_interval, OPT_INT, 5) -OPTION(heartbeat_file, OPT_STR, "") -OPTION(heartbeat_inject_failure, OPT_INT, 0) // force an unhealthy heartbeat for N seconds -OPTION(perf, OPT_BOOL, true) // enable internal perf counters - -SAFE_OPTION(ms_type, OPT_STR, "async+posix") // messenger backend. It will be modified in runtime, so use SAFE_OPTION -OPTION(ms_public_type, OPT_STR, "") // messenger backend -OPTION(ms_cluster_type, OPT_STR, "") // messenger backend -OPTION(ms_tcp_nodelay, OPT_BOOL, true) -OPTION(ms_tcp_rcvbuf, OPT_INT, 0) -OPTION(ms_tcp_prefetch_max_size, OPT_INT, 4096) // max prefetch size, we limit this to avoid extra memcpy -OPTION(ms_initial_backoff, OPT_DOUBLE, .2) -OPTION(ms_max_backoff, OPT_DOUBLE, 15.0) -OPTION(ms_crc_data, OPT_BOOL, true) -OPTION(ms_crc_header, OPT_BOOL, true) -OPTION(ms_die_on_bad_msg, OPT_BOOL, false) -OPTION(ms_die_on_unhandled_msg, OPT_BOOL, false) -OPTION(ms_die_on_old_message, OPT_BOOL, false) // assert if we get a dup incoming message and shouldn't have (may be triggered by pre-541cd3c64be0dfa04e8a2df39422e0eb9541a428 code) -OPTION(ms_die_on_skipped_message, OPT_BOOL, false) // assert if we skip a seq (kernel client does this intentionally) -OPTION(ms_dispatch_throttle_bytes, OPT_U64, 100 << 20) -OPTION(ms_bind_ipv6, OPT_BOOL, false) -OPTION(ms_bind_port_min, OPT_INT, 6800) -OPTION(ms_bind_port_max, OPT_INT, 7300) -#if !defined(__FreeBSD__) -OPTION(ms_bind_retry_count, OPT_INT, 3) // If binding fails, how many times do we retry to bind -OPTION(ms_bind_retry_delay, OPT_INT, 5) // Delay between attemps to bind -#else -// FreeBSD does not use SO_REAUSEADDR so allow for a bit more time per default -OPTION(ms_bind_retry_count, OPT_INT, 6) // If binding fails, how many times do we retry to bind -OPTION(ms_bind_retry_delay, OPT_INT, 6) // Delay between attemps to bind -#endif -OPTION(ms_bind_before_connect, OPT_BOOL, false) -OPTION(ms_tcp_listen_backlog, OPT_INT, 512) -OPTION(ms_rwthread_stack_bytes, OPT_U64, 1024 << 10) -OPTION(ms_tcp_read_timeout, OPT_U64, 900) -OPTION(ms_pq_max_tokens_per_priority, OPT_U64, 16777216) -OPTION(ms_pq_min_cost, OPT_U64, 65536) -OPTION(ms_inject_socket_failures, OPT_U64, 0) -SAFE_OPTION(ms_inject_delay_type, OPT_STR, "") // "osd mds mon client" allowed -OPTION(ms_inject_delay_msg_type, OPT_STR, "") // the type of message to delay, as returned by Message::get_type_name(). This is an additional restriction on the general type filter ms_inject_delay_type. 
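The reconnect backoff pair above (ms_initial_backoff = .2, ms_max_backoff = 15.0) is easiest to read as a capped exponential delay. The short C++ sketch below only illustrates that relationship between the two values; the doubling step is an assumption for illustration, not the messenger's actual reconnect code.

    // Illustrative only: a reconnect delay that grows from ms_initial_backoff
    // and is capped at ms_max_backoff (defaults copied from the options above).
    #include <algorithm>
    #include <cstdio>

    int main() {
      const double initial = 0.2;   // ms_initial_backoff (seconds)
      const double cap     = 15.0;  // ms_max_backoff (seconds)
      double delay = initial;
      for (int attempt = 1; attempt <= 10; ++attempt) {
        std::printf("attempt %d: wait %.1fs before reconnecting\n", attempt, delay);
        delay = std::min(delay * 2.0, cap);  // grow, but never exceed the cap
      }
    }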
-OPTION(ms_inject_delay_max, OPT_DOUBLE, 1) // seconds
-OPTION(ms_inject_delay_probability, OPT_DOUBLE, 0) // range [0, 1]
-OPTION(ms_inject_internal_delays, OPT_DOUBLE, 0) // seconds
-OPTION(ms_dump_on_send, OPT_BOOL, false) // hexdump msg to log on send
-OPTION(ms_dump_corrupt_message_level, OPT_INT, 1) // debug level to hexdump undecodable messages at
-OPTION(ms_async_op_threads, OPT_U64, 3) // number of worker processing threads for async messenger created on init
-OPTION(ms_async_max_op_threads, OPT_U64, 5) // max number of worker processing threads for async messenger
-OPTION(ms_async_set_affinity, OPT_BOOL, true)
-// example: ms_async_affinity_cores = 0,1
-// The number of core sets is expected to equal ms_async_op_threads; otherwise
-// extra op threads will loop over ms_async_affinity_cores again.
-// If ms_async_affinity_cores is empty, all threads will be bound to the
-// currently running core
-OPTION(ms_async_affinity_cores, OPT_STR, "")
-OPTION(ms_async_rdma_device_name, OPT_STR, "")
-OPTION(ms_async_rdma_enable_hugepage, OPT_BOOL, false)
-OPTION(ms_async_rdma_buffer_size, OPT_INT, 128 << 10)
-OPTION(ms_async_rdma_send_buffers, OPT_U32, 1024)
-OPTION(ms_async_rdma_receive_buffers, OPT_U32, 1024)
-OPTION(ms_async_rdma_port_num, OPT_U32, 1)
-OPTION(ms_async_rdma_polling_us, OPT_U32, 1000)
-OPTION(ms_async_rdma_local_gid, OPT_STR, "") // GID format: "fe80:0000:0000:0000:7efe:90ff:fe72:6efe", no zero folding
-OPTION(ms_async_rdma_roce_ver, OPT_INT, 1) // 0=RoCEv1, 1=RoCEv2, 2=RoCEv1.5
-OPTION(ms_async_rdma_sl, OPT_INT, 3) // in RoCE, this means PCP
-OPTION(ms_async_rdma_dscp, OPT_INT, 96) // in RoCE, this means DSCP
-
-OPTION(ms_dpdk_port_id, OPT_INT, 0)
-SAFE_OPTION(ms_dpdk_coremask, OPT_STR, "1") // it is modified in unit tests, so use SAFE_OPTION to declare it
-OPTION(ms_dpdk_memory_channel, OPT_STR, "4")
-OPTION(ms_dpdk_hugepages, OPT_STR, "")
-OPTION(ms_dpdk_pmd, OPT_STR, "")
-SAFE_OPTION(ms_dpdk_host_ipv4_addr, OPT_STR, "")
-SAFE_OPTION(ms_dpdk_gateway_ipv4_addr, OPT_STR, "")
-SAFE_OPTION(ms_dpdk_netmask_ipv4_addr, OPT_STR, "")
-OPTION(ms_dpdk_lro, OPT_BOOL, true)
-OPTION(ms_dpdk_hw_flow_control, OPT_BOOL, true)
-// Weighting of a hardware network queue relative to a software queue (0=no work, 1=equal share)
-OPTION(ms_dpdk_hw_queue_weight, OPT_FLOAT, 1)
-OPTION(ms_dpdk_debug_allow_loopback, OPT_BOOL, false)
-OPTION(ms_dpdk_rx_buffer_count_per_core, OPT_INT, 8192)
-
-OPTION(inject_early_sigterm, OPT_BOOL, false)
-
-OPTION(mon_data, OPT_STR, "/var/lib/ceph/mon/$cluster-$id")
-OPTION(mon_initial_members, OPT_STR, "") // list of initial cluster mon ids; if specified, need majority to form initial quorum and create new cluster
-OPTION(mon_compact_on_start, OPT_BOOL, false) // compact leveldb on ceph-mon start
-OPTION(mon_compact_on_bootstrap, OPT_BOOL, false) // trigger leveldb compaction on bootstrap
-OPTION(mon_compact_on_trim, OPT_BOOL, true) // compact (a prefix) when we trim old states
-OPTION(mon_osd_cache_size, OPT_INT, 10) // the size of osdmaps cache, not to rely on underlying store's cache
-
-OPTION(mon_cpu_threads, OPT_INT, 4)
-OPTION(mon_osd_mapping_pgs_per_chunk, OPT_INT, 4096)
-OPTION(mon_osd_max_creating_pgs, OPT_INT, 1024)
-OPTION(mon_tick_interval, OPT_INT, 5)
-OPTION(mon_session_timeout, OPT_INT, 300) // must send keepalive or subscribe
-OPTION(mon_subscribe_interval, OPT_DOUBLE, 24*3600) // for legacy clients only
-OPTION(mon_delta_reset_interval, OPT_DOUBLE, 10) // seconds of inactivity before we reset the pg delta to 0
-OPTION(mon_osd_laggy_halflife, OPT_INT, 60*60)
// (seconds) how quickly our laggy estimations decay -OPTION(mon_osd_laggy_weight, OPT_DOUBLE, .3) // weight for new 'samples's in laggy estimations -OPTION(mon_osd_laggy_max_interval, OPT_INT, 300) // maximum value of laggy_interval in laggy estimations -OPTION(mon_osd_adjust_heartbeat_grace, OPT_BOOL, true) // true if we should scale based on laggy estimations -OPTION(mon_osd_adjust_down_out_interval, OPT_BOOL, true) // true if we should scale based on laggy estimations -OPTION(mon_osd_auto_mark_in, OPT_BOOL, false) // mark any booting osds 'in' -OPTION(mon_osd_auto_mark_auto_out_in, OPT_BOOL, true) // mark booting auto-marked-out osds 'in' -OPTION(mon_osd_auto_mark_new_in, OPT_BOOL, true) // mark booting new osds 'in' -OPTION(mon_osd_down_out_interval, OPT_INT, 600) // seconds -OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // smallest crush unit/type that we will not automatically mark out -OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down -OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .75) // min osds required to be in to mark things out -OPTION(mon_osd_warn_op_age, OPT_DOUBLE, 32) // max op age before we generate a warning (make it a power of 2) -OPTION(mon_osd_err_op_age_ratio, OPT_DOUBLE, 128) // when to generate an error, as multiple of mon_osd_warn_op_age -OPTION(mon_osd_max_split_count, OPT_INT, 32) // largest number of PGs per "involved" OSD to let split create -OPTION(mon_osd_allow_primary_temp, OPT_BOOL, false) // allow primary_temp to be set in the osdmap -OPTION(mon_osd_allow_primary_affinity, OPT_BOOL, false) // allow primary_affinity to be set in the osdmap -OPTION(mon_osd_prime_pg_temp, OPT_BOOL, true) // prime osdmap with pg mapping changes -OPTION(mon_osd_prime_pg_temp_max_time, OPT_FLOAT, .5) // max time to spend priming -OPTION(mon_osd_prime_pg_temp_max_estimate, OPT_FLOAT, .25) // max estimate of pg total before we do all pgs in parallel -OPTION(mon_osd_pool_ec_fast_read, OPT_BOOL, false) // whether turn on fast read on the pool or not -OPTION(mon_stat_smooth_intervals, OPT_INT, 6) // smooth stats over last N PGMap maps -OPTION(mon_election_timeout, OPT_FLOAT, 5) // on election proposer, max waiting time for all ACKs -OPTION(mon_lease, OPT_FLOAT, 5) // lease interval -OPTION(mon_lease_renew_interval_factor, OPT_FLOAT, .6) // on leader, to renew the lease -OPTION(mon_lease_ack_timeout_factor, OPT_FLOAT, 2.0) // on leader, if lease isn't acked by all peons -OPTION(mon_accept_timeout_factor, OPT_FLOAT, 2.0) // on leader, if paxos update isn't accepted - -OPTION(mon_clock_drift_allowed, OPT_FLOAT, .050) // allowed clock drift between monitors -OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for clock drift warnings -OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock drift check) interval (seconds) -OPTION(mon_timecheck_skew_interval, OPT_FLOAT, 30.0) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds) -OPTION(mon_pg_stuck_threshold, OPT_INT, 60) // number of seconds after which pgs can be considered stuck inactive, unclean, etc (see doc/control.rst under dump_stuck for more info) -OPTION(mon_pg_min_inactive, OPT_U64, 1) // the number of PGs which have to be inactive longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled, never go into ERR. 
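The laggy options above describe an exponentially weighted estimate, where mon_osd_laggy_weight (.3) is the weight given to each new sample and the half-life governs decay. The sketch below only illustrates the weighted-update step with hypothetical sample values; the monitor's real bookkeeping (including the half-life decay and the max interval clamp) lives elsewhere.

    // Rough sketch of a weighted "laggy interval" estimate; samples are made up.
    #include <cstdio>

    int main() {
      const double w = 0.3;         // mon_osd_laggy_weight
      double estimate = 0.0;        // running laggy interval estimate (seconds)
      const double samples[] = {40.0, 10.0, 90.0};  // hypothetical observations
      for (double s : samples) {
        estimate = (1.0 - w) * estimate + w * s;    // blend in the new sample
        std::printf("sample %.0fs -> estimate %.1fs\n", s, estimate);
      }
    }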
-OPTION(mon_pg_warn_min_per_osd, OPT_INT, 30) // min # pgs per (in) osd before we warn the admin -OPTION(mon_pg_warn_max_per_osd, OPT_INT, 300) // max # pgs per (in) osd before we warn the admin -OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT, 10.0) // max skew few average in objects per pg -OPTION(mon_pg_warn_min_objects, OPT_INT, 10000) // do not warn below this object # -OPTION(mon_pg_warn_min_pool_objects, OPT_INT, 1000) // do not warn on pools below this object # -OPTION(mon_pg_check_down_all_threshold, OPT_FLOAT, .5) // threshold of down osds after which we check all pgs -OPTION(mon_cache_target_full_warn_ratio, OPT_FLOAT, .66) // position between pool cache_target_full and max where we start warning -OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full" -OPTION(mon_osd_backfillfull_ratio, OPT_FLOAT, .90) // what % full makes an OSD backfill full (backfill halted) -OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full -OPTION(mon_osd_initial_require_min_compat_client, OPT_STR, "jewel") -OPTION(mon_allow_pool_delete, OPT_BOOL, false) // allow pool deletion -OPTION(mon_fake_pool_delete, OPT_BOOL, false) // fake pool deletion (add _DELETED suffix) -OPTION(mon_globalid_prealloc, OPT_U32, 10000) // how many globalids to prealloc -OPTION(mon_osd_report_timeout, OPT_INT, 900) // grace period before declaring unresponsive OSDs dead -OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-replay mds to be active -OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL, true) // warn if crush tunables are too old (older than mon_min_crush_required_version) -OPTION(mon_crush_min_required_version, OPT_STR, "firefly") -OPTION(mon_warn_on_crush_straw_calc_version_zero, OPT_BOOL, true) // warn if crush straw_calc_version==0 -OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0' -OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true) -OPTION(mon_warn_osd_usage_min_max_delta, OPT_FLOAT, .40) // warn if difference between min and max OSD utilizations exceeds specified amount -OPTION(mon_min_osdmap_epochs, OPT_INT, 500) -OPTION(mon_max_pgmap_epochs, OPT_INT, 500) -OPTION(mon_max_log_epochs, OPT_INT, 500) -OPTION(mon_max_mdsmap_epochs, OPT_INT, 500) -OPTION(mon_max_osd, OPT_INT, 10000) -OPTION(mon_probe_timeout, OPT_DOUBLE, 2.0) -OPTION(mon_client_bytes, OPT_U64, 100ul << 20) // client msg data allowed in memory (in bytes) -OPTION(mon_mgr_proxy_client_bytes_ratio, OPT_FLOAT, .3) // ratio of mon_client_bytes that can be consumed by proxied mgr commands before we error out to client -OPTION(mon_log_max_summary, OPT_U64, 50) -OPTION(mon_daemon_bytes, OPT_U64, 400ul << 20) // mds, osd message memory cap (in bytes) -OPTION(mon_max_log_entries_per_event, OPT_INT, 4096) -OPTION(mon_reweight_min_pgs_per_osd, OPT_U64, 10) // min pgs per osd for reweight-by-pg command -OPTION(mon_reweight_min_bytes_per_osd, OPT_U64, 100*1024*1024) // min bytes per osd for reweight-by-utilization command -OPTION(mon_reweight_max_osds, OPT_INT, 4) // max osds to change per reweight-by-* command -OPTION(mon_reweight_max_change, OPT_DOUBLE, 0.05) -OPTION(mon_health_data_update_interval, OPT_FLOAT, 60.0) -OPTION(mon_health_to_clog, OPT_BOOL, true) -OPTION(mon_health_to_clog_interval, OPT_INT, 3600) -OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE, 60.0) -OPTION(mon_health_preluminous_compat, OPT_BOOL, false) -OPTION(mon_health_max_detail, OPT_INT, 50) // max detailed pgs to report in health detail 
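The three fullness ratios above form a simple ladder (nearfull < backfillfull < full). The following self-contained illustration just maps an OSD utilization onto that ladder using the default thresholds; the actual enforcement is done by the OSDs and monitor, and the `classify` helper is purely illustrative.

    // Illustration of the nearfull/backfillfull/full thresholds defined above.
    #include <cstdio>

    const char* classify(double used) {
      if (used >= 0.95) return "full";                       // mon_osd_full_ratio
      if (used >= 0.90) return "backfillfull (backfill halted)"; // mon_osd_backfillfull_ratio
      if (used >= 0.85) return "nearfull (health warning)";   // mon_osd_nearfull_ratio
      return "ok";
    }

    int main() {
      const double utilizations[] = {0.50, 0.86, 0.91, 0.96};
      for (double u : utilizations)
        std::printf("%2.0f%% used -> %s\n", u * 100, classify(u));
    }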
-OPTION(mon_data_avail_crit, OPT_INT, 5) -OPTION(mon_data_avail_warn, OPT_INT, 30) -OPTION(mon_data_size_warn, OPT_U64, 15*1024*1024*1024) // issue a warning when the monitor's data store goes over 15GB (in bytes) -OPTION(mon_warn_not_scrubbed, OPT_INT, 0) -OPTION(mon_warn_not_deep_scrubbed, OPT_INT, 0) -OPTION(mon_scrub_interval, OPT_INT, 3600*24) // once a day -OPTION(mon_scrub_timeout, OPT_INT, 60*5) // let's give it 5 minutes; why not. -OPTION(mon_scrub_max_keys, OPT_INT, 100) // max number of keys to scrub each time -OPTION(mon_scrub_inject_crc_mismatch, OPT_DOUBLE, 0.0) // probability of injected crc mismatch [0.0, 1.0] -OPTION(mon_scrub_inject_missing_keys, OPT_DOUBLE, 0.0) // probability of injected missing keys [0.0, 1.0] -OPTION(mon_config_key_max_entry_size, OPT_INT, 4096) // max num bytes per config-key entry -OPTION(mon_sync_timeout, OPT_DOUBLE, 60.0) -OPTION(mon_sync_max_payload_size, OPT_U32, 1048576) // max size for a sync chunk payload (say, 1MB) -OPTION(mon_sync_debug, OPT_BOOL, false) // enable sync-specific debug -OPTION(mon_inject_sync_get_chunk_delay, OPT_DOUBLE, 0) // inject N second delay on each get_chunk request -OPTION(mon_osd_min_down_reporters, OPT_INT, 2) // number of OSDs from different subtrees who need to report a down OSD for it to count -OPTION(mon_osd_reporter_subtree_level , OPT_STR, "host") // in which level of parent bucket the reporters are counted -OPTION(mon_osd_force_trim_to, OPT_INT, 0) // force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous, use with care) -OPTION(mon_mds_force_trim_to, OPT_INT, 0) // force mon to trim mdsmaps to this point (dangerous, use with care) -OPTION(mon_mds_skip_sanity, OPT_BOOL, false) // skip safety assertions on FSMap (in case of bugs where we want to continue anyway) - -// monitor debug options -OPTION(mon_debug_deprecated_as_obsolete, OPT_BOOL, false) // consider deprecated commands as obsolete - -// dump transactions -OPTION(mon_debug_dump_transactions, OPT_BOOL, false) -OPTION(mon_debug_dump_json, OPT_BOOL, false) -OPTION(mon_debug_dump_location, OPT_STR, "/var/log/ceph/$cluster-$name.tdump") -OPTION(mon_debug_no_require_luminous, OPT_BOOL, false) -OPTION(mon_debug_no_require_bluestore_for_ec_overwrites, OPT_BOOL, false) -OPTION(mon_debug_no_initial_persistent_features, OPT_BOOL, false) -OPTION(mon_inject_transaction_delay_max, OPT_DOUBLE, 10.0) // seconds -OPTION(mon_inject_transaction_delay_probability, OPT_DOUBLE, 0) // range [0, 1] - -OPTION(mon_sync_provider_kill_at, OPT_INT, 0) // kill the sync provider at a specific point in the work flow -OPTION(mon_sync_requester_kill_at, OPT_INT, 0) // kill the sync requester at a specific point in the work flow -OPTION(mon_force_quorum_join, OPT_BOOL, false) // force monitor to join quorum even if it has been previously removed from the map -OPTION(mon_keyvaluedb, OPT_STR, "rocksdb") // type of keyvaluedb backend - -// UNSAFE -- TESTING ONLY! 
Allows addition of a cache tier with preexisting snaps -OPTION(mon_debug_unsafe_allow_tier_with_nonempty_snaps, OPT_BOOL, false) -OPTION(mon_osd_blacklist_default_expire, OPT_DOUBLE, 60*60) // default one hour -OPTION(mon_osd_crush_smoke_test, OPT_BOOL, true) - -OPTION(paxos_stash_full_interval, OPT_INT, 25) // how often (in commits) to stash a full copy of the PaxosService state -OPTION(paxos_max_join_drift, OPT_INT, 10) // max paxos iterations before we must first sync the monitor stores -OPTION(paxos_propose_interval, OPT_DOUBLE, 1.0) // gather updates for this long before proposing a map update -OPTION(paxos_min_wait, OPT_DOUBLE, 0.05) // min time to gather updates for after period of inactivity -OPTION(paxos_min, OPT_INT, 500) // minimum number of paxos states to keep around -OPTION(paxos_trim_min, OPT_INT, 250) // number of extra proposals tolerated before trimming -OPTION(paxos_trim_max, OPT_INT, 500) // max number of extra proposals to trim at a time -OPTION(paxos_service_trim_min, OPT_INT, 250) // minimum amount of versions to trigger a trim (0 disables it) -OPTION(paxos_service_trim_max, OPT_INT, 500) // maximum amount of versions to trim during a single proposal (0 disables it) -OPTION(paxos_kill_at, OPT_INT, 0) -OPTION(auth_cluster_required, OPT_STR, "cephx") // required of mon, mds, osd daemons -OPTION(auth_service_required, OPT_STR, "cephx") // required by daemons of clients -OPTION(auth_client_required, OPT_STR, "cephx, none") // what clients require of daemons -OPTION(auth_supported, OPT_STR, "") // deprecated; default value for above if they are not defined. -OPTION(max_rotating_auth_attempts, OPT_INT, 10) -OPTION(cephx_require_signatures, OPT_BOOL, false) // If true, don't talk to Cephx partners if they don't support message signing; off by default -OPTION(cephx_cluster_require_signatures, OPT_BOOL, false) -OPTION(cephx_service_require_signatures, OPT_BOOL, false) -OPTION(cephx_sign_messages, OPT_BOOL, true) // Default to signing session messages if supported -OPTION(auth_mon_ticket_ttl, OPT_DOUBLE, 60*60*12) -OPTION(auth_service_ticket_ttl, OPT_DOUBLE, 60*60) -OPTION(auth_debug, OPT_BOOL, false) // if true, assert when weird things happen -OPTION(mon_client_hunt_parallel, OPT_U32, 2) // how many mons to try to connect to in parallel during hunt -OPTION(mon_client_hunt_interval, OPT_DOUBLE, 3.0) // try new mon every N seconds until we connect -OPTION(mon_client_ping_interval, OPT_DOUBLE, 10.0) // ping every N seconds -OPTION(mon_client_ping_timeout, OPT_DOUBLE, 30.0) // fail if we don't hear back -OPTION(mon_client_hunt_interval_backoff, OPT_DOUBLE, 2.0) // each time we reconnect to a monitor, double our timeout -OPTION(mon_client_hunt_interval_max_multiple, OPT_DOUBLE, 10.0) // up to a max of 10*default (30 seconds) -OPTION(mon_client_max_log_entries_per_message, OPT_INT, 1000) -OPTION(mon_max_pool_pg_num, OPT_INT, 65536) -OPTION(mon_pool_quota_warn_threshold, OPT_INT, 0) // percent of quota at which to issue warnings -OPTION(mon_pool_quota_crit_threshold, OPT_INT, 0) // percent of quota at which to issue errors -OPTION(client_cache_size, OPT_INT, 16384) -OPTION(client_cache_mid, OPT_FLOAT, .75) -OPTION(client_use_random_mds, OPT_BOOL, false) -OPTION(client_mount_timeout, OPT_DOUBLE, 300.0) -OPTION(client_tick_interval, OPT_DOUBLE, 1.0) -OPTION(client_trace, OPT_STR, "") -OPTION(client_readahead_min, OPT_LONGLONG, 128*1024) // readahead at _least_ this much. 
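The mon_client hunt options above combine as a base interval (3.0s) that grows by the backoff factor (2.0) up to max_multiple times the base (30s, as the comment notes). A minimal sketch of that growth, assuming the interval simply multiplies on each unsuccessful hunt:

    // Sketch only: growth of the monitor hunt interval, capped at base * max_multiple.
    #include <algorithm>
    #include <cstdio>

    int main() {
      const double base = 3.0;          // mon_client_hunt_interval
      const double backoff = 2.0;       // mon_client_hunt_interval_backoff
      const double max_multiple = 10.0; // mon_client_hunt_interval_max_multiple
      double interval = base;
      for (int hunt = 1; hunt <= 6; ++hunt) {
        std::printf("hunt %d: try another monitor after %.0fs\n", hunt, interval);
        interval = std::min(interval * backoff, base * max_multiple);
      }
    }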
-OPTION(client_readahead_max_bytes, OPT_LONGLONG, 0) // default unlimited -OPTION(client_readahead_max_periods, OPT_LONGLONG, 4) // as multiple of file layout period (object size * num stripes) -OPTION(client_reconnect_stale, OPT_BOOL, false) // automatically reconnect stale session -OPTION(client_snapdir, OPT_STR, ".snap") -OPTION(client_mountpoint, OPT_STR, "/") -OPTION(client_mount_uid, OPT_INT, -1) -OPTION(client_mount_gid, OPT_INT, -1) -OPTION(client_notify_timeout, OPT_INT, 10) // in seconds -OPTION(osd_client_watch_timeout, OPT_INT, 30) // in seconds -OPTION(client_caps_release_delay, OPT_INT, 5) // in seconds -OPTION(client_quota_df, OPT_BOOL, true) // use quota for df on subdir mounts -OPTION(client_oc, OPT_BOOL, true) -OPTION(client_oc_size, OPT_INT, 1024*1024* 200) // MB * n -OPTION(client_oc_max_dirty, OPT_INT, 1024*1024* 100) // MB * n (dirty OR tx.. bigish) -OPTION(client_oc_target_dirty, OPT_INT, 1024*1024* 8) // target dirty (keep this smallish) -OPTION(client_oc_max_dirty_age, OPT_DOUBLE, 5.0) // max age in cache before writeback -OPTION(client_oc_max_objects, OPT_INT, 1000) // max objects in cache -OPTION(client_debug_getattr_caps, OPT_BOOL, false) // check if MDS reply contains wanted caps -OPTION(client_debug_force_sync_read, OPT_BOOL, false) // always read synchronously (go to osds) -OPTION(client_debug_inject_tick_delay, OPT_INT, 0) // delay the client tick for a number of seconds -OPTION(client_max_inline_size, OPT_U64, 4096) -OPTION(client_inject_release_failure, OPT_BOOL, false) // synthetic client bug for testing -OPTION(client_inject_fixed_oldest_tid, OPT_BOOL, false) // synthetic client bug for testing -OPTION(client_metadata, OPT_STR, "") -OPTION(client_acl_type, OPT_STR, "") -OPTION(client_permissions, OPT_BOOL, true) -OPTION(client_dirsize_rbytes, OPT_BOOL, true) - -// note: the max amount of "in flight" dirty data is roughly (max - target) -OPTION(fuse_use_invalidate_cb, OPT_BOOL, true) // use fuse 2.8+ invalidate callback to keep page cache consistent -OPTION(fuse_disable_pagecache, OPT_BOOL, false) -OPTION(fuse_allow_other, OPT_BOOL, true) -OPTION(fuse_default_permissions, OPT_BOOL, false) -OPTION(fuse_big_writes, OPT_BOOL, true) -OPTION(fuse_atomic_o_trunc, OPT_BOOL, true) -OPTION(fuse_debug, OPT_BOOL, false) -OPTION(fuse_multithreaded, OPT_BOOL, true) -OPTION(fuse_require_active_mds, OPT_BOOL, true) // if ceph_fuse requires active mds server -OPTION(fuse_syncfs_on_mksnap, OPT_BOOL, true) -OPTION(fuse_set_user_groups, OPT_BOOL, false) // if ceph_fuse fills in group lists or not - -OPTION(client_try_dentry_invalidate, OPT_BOOL, true) // the client should try to use dentry invaldation instead of remounting, on kernels it believes that will work for -OPTION(client_die_on_failed_remount, OPT_BOOL, true) -OPTION(client_check_pool_perm, OPT_BOOL, true) -OPTION(client_use_faked_inos, OPT_BOOL, false) -OPTION(client_mds_namespace, OPT_STR, "") - -OPTION(crush_location, OPT_STR, "") // whitespace-separated list of key=value pairs describing crush location -OPTION(crush_location_hook, OPT_STR, "") -OPTION(crush_location_hook_timeout, OPT_INT, 10) - -OPTION(objecter_tick_interval, OPT_DOUBLE, 5.0) -OPTION(objecter_timeout, OPT_DOUBLE, 10.0) // before we ask for a map -OPTION(objecter_inflight_op_bytes, OPT_U64, 1024*1024*100) // max in-flight data (both directions) -OPTION(objecter_inflight_ops, OPT_U64, 1024) // max in-flight ios -OPTION(objecter_completion_locks_per_session, OPT_U64, 32) // num of completion locks per each session, for serializing same object 
responses -OPTION(objecter_inject_no_watch_ping, OPT_BOOL, false) // suppress watch pings -OPTION(objecter_retry_writes_after_first_reply, OPT_BOOL, false) // ignore the first reply for each write, and resend the osd op instead -OPTION(objecter_debug_inject_relock_delay, OPT_BOOL, false) - -// Max number of deletes at once in a single Filer::purge call -OPTION(filer_max_purge_ops, OPT_U32, 10) -// Max number of truncate at once in a single Filer::truncate call -OPTION(filer_max_truncate_ops, OPT_U32, 128) - -OPTION(journaler_write_head_interval, OPT_INT, 15) -OPTION(journaler_prefetch_periods, OPT_INT, 10) // * journal object size -OPTION(journaler_prezero_periods, OPT_INT, 5) // * journal object size -OPTION(mds_data, OPT_STR, "/var/lib/ceph/mds/$cluster-$id") -OPTION(mds_max_file_size, OPT_U64, 1ULL << 40) // Used when creating new CephFS. Change with 'ceph mds set max_file_size ' afterwards -// max xattr kv pairs size for each dir/file -OPTION(mds_max_xattr_pairs_size, OPT_U32, 64 << 10) -OPTION(mds_cache_size, OPT_INT, 100000) -OPTION(mds_cache_mid, OPT_FLOAT, .7) -OPTION(mds_max_file_recover, OPT_U32, 32) -OPTION(mds_dir_max_commit_size, OPT_INT, 10) // MB -OPTION(mds_dir_keys_per_op, OPT_INT, 16384) -OPTION(mds_decay_halflife, OPT_FLOAT, 5) -OPTION(mds_beacon_interval, OPT_FLOAT, 4) -OPTION(mds_beacon_grace, OPT_FLOAT, 15) -OPTION(mds_enforce_unique_name, OPT_BOOL, true) -OPTION(mds_blacklist_interval, OPT_FLOAT, 24.0*60.0) // how long to blacklist failed nodes - -OPTION(mds_session_timeout, OPT_FLOAT, 60) // cap bits and leases time out if client idle -OPTION(mds_session_blacklist_on_timeout, OPT_BOOL, true) // whether to blacklist clients whose sessions are dropped due to timeout -OPTION(mds_session_blacklist_on_evict, OPT_BOOL, true) // whether to blacklist clients whose sessions are dropped via admin commands - -OPTION(mds_sessionmap_keys_per_op, OPT_U32, 1024) // how many sessions should I try to load/store in a single OMAP operation? 
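The objectcacher note a few lines up (in-flight dirty data is roughly max - target) is plain arithmetic on client_oc_max_dirty and client_oc_target_dirty; with the defaults shown that is roughly 92 MB. A tiny worked example, using those default values:

    // Worked example of the "(max - target)" note for the objectcacher limits above.
    #include <cstdio>

    int main() {
      const long long max_dirty    = 1024LL * 1024 * 100; // client_oc_max_dirty
      const long long target_dirty = 1024LL * 1024 * 8;   // client_oc_target_dirty
      std::printf("roughly %lld MB of dirty data can be in flight\n",
                  (max_dirty - target_dirty) / (1024 * 1024));
    }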
-OPTION(mds_revoke_cap_timeout, OPT_FLOAT, 60) // detect clients which aren't revoking caps -OPTION(mds_recall_state_timeout, OPT_FLOAT, 60) // detect clients which aren't trimming caps -OPTION(mds_freeze_tree_timeout, OPT_FLOAT, 30) // detecting freeze tree deadlock -OPTION(mds_session_autoclose, OPT_FLOAT, 300) // autoclose idle session -OPTION(mds_health_summarize_threshold, OPT_INT, 10) // collapse N-client health metrics to a single 'many' -OPTION(mds_health_cache_threshold, OPT_FLOAT, 1.5) // warn on cache size if it exceeds mds_cache_size by this factor -OPTION(mds_reconnect_timeout, OPT_FLOAT, 45) // seconds to wait for clients during mds restart - // make it (mds_session_timeout - mds_beacon_grace) -OPTION(mds_tick_interval, OPT_FLOAT, 5) -OPTION(mds_dirstat_min_interval, OPT_FLOAT, 1) // try to avoid propagating more often than this -OPTION(mds_scatter_nudge_interval, OPT_FLOAT, 5) // how quickly dirstat changes propagate up the hierarchy -OPTION(mds_client_prealloc_inos, OPT_INT, 1000) -OPTION(mds_early_reply, OPT_BOOL, true) -OPTION(mds_default_dir_hash, OPT_INT, CEPH_STR_HASH_RJENKINS) -OPTION(mds_log_pause, OPT_BOOL, false) -OPTION(mds_log_skip_corrupt_events, OPT_BOOL, false) -OPTION(mds_log_max_events, OPT_INT, -1) -OPTION(mds_log_events_per_segment, OPT_INT, 1024) -OPTION(mds_log_segment_size, OPT_INT, 0) // segment size for mds log, default to default file_layout_t -OPTION(mds_log_max_segments, OPT_U32, 30) -OPTION(mds_log_max_expiring, OPT_INT, 20) -OPTION(mds_bal_export_pin, OPT_BOOL, true) // allow clients to pin directory trees to ranks -OPTION(mds_bal_sample_interval, OPT_DOUBLE, 3.0) // every 3 seconds -OPTION(mds_bal_replicate_threshold, OPT_FLOAT, 8000) -OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT, 0) -OPTION(mds_bal_frag, OPT_BOOL, true) -OPTION(mds_bal_split_size, OPT_INT, 10000) -OPTION(mds_bal_split_rd, OPT_FLOAT, 25000) -OPTION(mds_bal_split_wr, OPT_FLOAT, 10000) -OPTION(mds_bal_split_bits, OPT_INT, 3) -OPTION(mds_bal_merge_size, OPT_INT, 50) -OPTION(mds_bal_interval, OPT_INT, 10) // seconds -OPTION(mds_bal_fragment_interval, OPT_INT, 5) // seconds -OPTION(mds_bal_fragment_size_max, OPT_INT, 10000*10) // order of magnitude higher than split size -OPTION(mds_bal_fragment_fast_factor, OPT_FLOAT, 1.5) // multiple of size_max that triggers immediate split -OPTION(mds_bal_idle_threshold, OPT_FLOAT, 0) -OPTION(mds_bal_max, OPT_INT, -1) -OPTION(mds_bal_max_until, OPT_INT, -1) -OPTION(mds_bal_mode, OPT_INT, 0) -OPTION(mds_bal_min_rebalance, OPT_FLOAT, .1) // must be this much above average before we export anything -OPTION(mds_bal_min_start, OPT_FLOAT, .2) // if we need less than this, we don't do anything -OPTION(mds_bal_need_min, OPT_FLOAT, .8) // take within this range of what we need -OPTION(mds_bal_need_max, OPT_FLOAT, 1.2) -OPTION(mds_bal_midchunk, OPT_FLOAT, .3) // any sub bigger than this taken in full -OPTION(mds_bal_minchunk, OPT_FLOAT, .001) // never take anything smaller than this -OPTION(mds_bal_target_decay, OPT_DOUBLE, 10.0) // target decay half-life in MDSMap (2x larger is approx. 
2x slower) -OPTION(mds_replay_interval, OPT_FLOAT, 1.0) // time to wait before starting replay again -OPTION(mds_shutdown_check, OPT_INT, 0) -OPTION(mds_thrash_exports, OPT_INT, 0) -OPTION(mds_thrash_fragments, OPT_INT, 0) -OPTION(mds_dump_cache_on_map, OPT_BOOL, false) -OPTION(mds_dump_cache_after_rejoin, OPT_BOOL, false) -OPTION(mds_verify_scatter, OPT_BOOL, false) -OPTION(mds_debug_scatterstat, OPT_BOOL, false) -OPTION(mds_debug_frag, OPT_BOOL, false) -OPTION(mds_debug_auth_pins, OPT_BOOL, false) -OPTION(mds_debug_subtrees, OPT_BOOL, false) -OPTION(mds_kill_mdstable_at, OPT_INT, 0) -OPTION(mds_kill_export_at, OPT_INT, 0) -OPTION(mds_kill_import_at, OPT_INT, 0) -OPTION(mds_kill_link_at, OPT_INT, 0) -OPTION(mds_kill_rename_at, OPT_INT, 0) -OPTION(mds_kill_openc_at, OPT_INT, 0) -OPTION(mds_kill_journal_at, OPT_INT, 0) -OPTION(mds_kill_journal_expire_at, OPT_INT, 0) -OPTION(mds_kill_journal_replay_at, OPT_INT, 0) -OPTION(mds_journal_format, OPT_U32, 1) // Default to most recent JOURNAL_FORMAT_* -OPTION(mds_kill_create_at, OPT_INT, 0) -OPTION(mds_inject_traceless_reply_probability, OPT_DOUBLE, 0) /* percentage - of MDS modify replies to skip sending the - client a trace on [0-1]*/ -OPTION(mds_wipe_sessions, OPT_BOOL, 0) -OPTION(mds_wipe_ino_prealloc, OPT_BOOL, 0) -OPTION(mds_skip_ino, OPT_INT, 0) -OPTION(mds_standby_for_name, OPT_STR, "") -OPTION(mds_standby_for_rank, OPT_INT, -1) -OPTION(mds_standby_for_fscid, OPT_INT, -1) -OPTION(mds_standby_replay, OPT_BOOL, false) -OPTION(mds_enable_op_tracker, OPT_BOOL, true) // enable/disable MDS op tracking -OPTION(mds_op_history_size, OPT_U32, 20) // Max number of completed ops to track -OPTION(mds_op_history_duration, OPT_U32, 600) // Oldest completed op to track -OPTION(mds_op_complaint_time, OPT_FLOAT, 30) // how many seconds old makes an op complaint-worthy -OPTION(mds_op_log_threshold, OPT_INT, 5) // how many op log messages to show in one go -OPTION(mds_snap_min_uid, OPT_U32, 0) // The minimum UID required to create a snapshot -OPTION(mds_snap_max_uid, OPT_U32, 4294967294) // The maximum UID allowed to create a snapshot -OPTION(mds_snap_rstat, OPT_BOOL, false) // enable/disbale nested stat for snapshot -OPTION(mds_verify_backtrace, OPT_U32, 1) -// detect clients which aren't trimming completed requests -OPTION(mds_max_completed_flushes, OPT_U32, 100000) -OPTION(mds_max_completed_requests, OPT_U32, 100000) - -OPTION(mds_action_on_write_error, OPT_U32, 1) // 0: ignore; 1: force readonly; 2: crash -OPTION(mds_mon_shutdown_timeout, OPT_DOUBLE, 5) - -// Maximum number of concurrent stray files to purge -OPTION(mds_max_purge_files, OPT_U32, 64) -// Maximum number of concurrent RADOS ops to issue in purging -OPTION(mds_max_purge_ops, OPT_U32, 8192) -// Maximum number of concurrent RADOS ops to issue in purging, scaled by PG count -OPTION(mds_max_purge_ops_per_pg, OPT_FLOAT, 0.5) - -OPTION(mds_purge_queue_busy_flush_period, OPT_FLOAT, 1.0) - -OPTION(mds_root_ino_uid, OPT_INT, 0) // The UID of / on new filesystems -OPTION(mds_root_ino_gid, OPT_INT, 0) // The GID of / on new filesystems - -OPTION(mds_max_scrub_ops_in_progress, OPT_INT, 5) // the number of simultaneous scrubs allowed - -// Maximum number of damaged frags/dentries before whole MDS rank goes damaged -OPTION(mds_damage_table_max_entries, OPT_INT, 10000) - -// Maximum increment for client writable range, counted by number of objects -OPTION(mds_client_writeable_range_max_inc_objs, OPT_U32, 1024) - -// verify backend can support configured max object name length 
-OPTION(osd_check_max_object_name_len_on_startup, OPT_BOOL, true) - -// Maximum number of backfills to or from a single osd -OPTION(osd_max_backfills, OPT_U64, 1) - -// Minimum recovery priority (255 = max, smaller = lower) -OPTION(osd_min_recovery_priority, OPT_INT, 0) - -// Seconds to wait before retrying refused backfills -OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 30.0) - -// Seconds to wait before retrying refused recovery -OPTION(osd_recovery_retry_interval, OPT_DOUBLE, 30.0) - -// max agent flush ops -OPTION(osd_agent_max_ops, OPT_INT, 4) -OPTION(osd_agent_max_low_ops, OPT_INT, 2) -OPTION(osd_agent_min_evict_effort, OPT_FLOAT, .1) -OPTION(osd_agent_quantize_effort, OPT_FLOAT, .1) -OPTION(osd_agent_delay_time, OPT_FLOAT, 5.0) - -// osd ignore history.last_epoch_started in find_best_info -OPTION(osd_find_best_info_ignore_history_les, OPT_BOOL, false) - -// decay atime and hist histograms after how many objects go by -OPTION(osd_agent_hist_halflife, OPT_INT, 1000) - -// must be this amount over the threshold to enable, -// this amount below the threshold to disable. -OPTION(osd_agent_slop, OPT_FLOAT, .02) - -OPTION(osd_uuid, OPT_UUID, uuid_d()) -OPTION(osd_data, OPT_STR, "/var/lib/ceph/osd/$cluster-$id") -OPTION(osd_journal, OPT_STR, "/var/lib/ceph/osd/$cluster-$id/journal") -OPTION(osd_journal_size, OPT_INT, 5120) // in mb -OPTION(osd_journal_flush_on_shutdown, OPT_BOOL, true) // Flush journal to data store on shutdown -// flags for specific control purpose during osd mount() process. -// e.g., can be 1 to skip over replaying journal -// or 2 to skip over mounting omap or 3 to skip over both. -// This might be helpful in case the journal is totally corrupted -// and we still want to bring the osd daemon back normally, etc. -OPTION(osd_os_flags, OPT_U32, 0) -OPTION(osd_max_write_size, OPT_INT, 90) -OPTION(osd_max_pgls, OPT_U64, 1024) // max number of pgls entries to return -OPTION(osd_client_message_size_cap, OPT_U64, 500*1024L*1024L) // client data allowed in-memory (in bytes) -OPTION(osd_client_message_cap, OPT_U64, 100) // num client messages allowed in-memory -OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd -OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd -OPTION(osd_crush_update_weight_set, OPT_BOOL, true) // update weight set while updating weights -OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host -OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it. -OPTION(osd_crush_update_on_start, OPT_BOOL, true) -OPTION(osd_class_update_on_start, OPT_BOOL, true) // automatically set device class on start -OPTION(osd_crush_initial_weight, OPT_DOUBLE, -1) // if >=0, the initial weight is for newly added osds. -OPTION(osd_pool_default_crush_rule, OPT_INT, -1) -OPTION(osd_pool_erasure_code_stripe_unit, OPT_U32, 4096) // in bytes -OPTION(osd_pool_default_size, OPT_INT, 3) -OPTION(osd_pool_default_min_size, OPT_INT, 0) // 0 means no specific default; ceph will use size-size/2 -OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf -OPTION(osd_pool_default_pgp_num, OPT_INT, 8) // number of PGs for placement purposes. 
Should be equal to pg_num -OPTION(osd_pool_default_type, OPT_STR, "replicated") -OPTION(osd_pool_default_erasure_code_profile, - OPT_STR, - "plugin=jerasure " - "technique=reed_sol_van " - "k=2 " - "m=1 " - ) // default properties of osd pool create -OPTION(osd_erasure_code_plugins, OPT_STR, - "jerasure" - " lrc" -#ifdef HAVE_BETTER_YASM_ELF64 - " isa" -#endif - ) // list of erasure code plugins - -// Allows the "peered" state for recovery and backfill below min_size -OPTION(osd_allow_recovery_below_min_size, OPT_BOOL, true) - -OPTION(osd_pool_default_flags, OPT_INT, 0) // default flags for new pools -OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL, true) // use new pg hashing to prevent pool/pg overlap -OPTION(osd_pool_default_flag_nodelete, OPT_BOOL, false) // pool can't be deleted -OPTION(osd_pool_default_flag_nopgchange, OPT_BOOL, false) // pool's pg and pgp num can't be changed -OPTION(osd_pool_default_flag_nosizechange, OPT_BOOL, false) // pool's size and min size can't be changed -OPTION(osd_pool_default_hit_set_bloom_fpp, OPT_FLOAT, .05) -OPTION(osd_pool_default_cache_target_dirty_ratio, OPT_FLOAT, .4) -OPTION(osd_pool_default_cache_target_dirty_high_ratio, OPT_FLOAT, .6) -OPTION(osd_pool_default_cache_target_full_ratio, OPT_FLOAT, .8) -OPTION(osd_pool_default_cache_min_flush_age, OPT_INT, 0) // seconds -OPTION(osd_pool_default_cache_min_evict_age, OPT_INT, 0) // seconds -OPTION(osd_pool_default_cache_max_evict_check_size, OPT_INT, 10) // max size to check for eviction -OPTION(osd_hit_set_min_size, OPT_INT, 1000) // min target size for a HitSet -OPTION(osd_hit_set_max_size, OPT_INT, 100000) // max target size for a HitSet -OPTION(osd_hit_set_namespace, OPT_STR, ".ceph-internal") // rados namespace for hit_set tracking - -// conservative default throttling values -OPTION(osd_tier_promote_max_objects_sec, OPT_U64, 25) -OPTION(osd_tier_promote_max_bytes_sec, OPT_U64, 5 * 1024*1024) - -OPTION(osd_tier_default_cache_mode, OPT_STR, "writeback") -OPTION(osd_tier_default_cache_hit_set_count, OPT_INT, 4) -OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200) -OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom") -OPTION(osd_tier_default_cache_min_read_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on read) -OPTION(osd_tier_default_cache_min_write_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on write) -OPTION(osd_tier_default_cache_hit_set_grade_decay_rate, OPT_INT, 20) -OPTION(osd_tier_default_cache_hit_set_search_last_n, OPT_INT, 1) - -OPTION(osd_map_dedup, OPT_BOOL, true) -OPTION(osd_map_max_advance, OPT_INT, 40) // make this < cache_size! 
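The pool defaults above imply very different raw-space overheads: a replicated pool with osd_pool_default_size = 3 stores three copies, while the default erasure profile (k=2, m=1) stores (k+m)/k = 1.5x. The snippet below is just that arithmetic, ignoring per-object metadata and chunk alignment:

    // Back-of-the-envelope raw-space overhead for the pool defaults above.
    #include <cstdio>

    int main() {
      const double replicated_size = 3.0; // osd_pool_default_size
      const double k = 2.0, m = 1.0;      // osd_pool_default_erasure_code_profile
      std::printf("replicated size=3: %.1fx raw space per byte stored\n", replicated_size);
      std::printf("erasure k=2,m=1:   %.1fx raw space per byte stored\n", (k + m) / k);
    }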
-OPTION(osd_map_cache_size, OPT_INT, 50)
-OPTION(osd_map_message_max, OPT_INT, 40) // max maps per MOSDMap message
-OPTION(osd_map_share_max_epochs, OPT_INT, 40) // cap on # of inc maps we send to peers, clients
-OPTION(osd_inject_bad_map_crc_probability, OPT_FLOAT, 0)
-OPTION(osd_inject_failure_on_pg_removal, OPT_BOOL, false)
-// shut down the OSD if its status flips more than max_markdown_count times in the most recent max_markdown_period seconds
-OPTION(osd_max_markdown_period, OPT_INT, 600)
-OPTION(osd_max_markdown_count, OPT_INT, 5)
-
-OPTION(osd_peering_wq_threads, OPT_INT, 2)
-OPTION(osd_peering_wq_batch_size, OPT_U64, 20)
-OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64, 4194304)
-OPTION(osd_op_pq_min_cost, OPT_U64, 65536)
-OPTION(osd_disk_threads, OPT_INT, 1)
-OPTION(osd_disk_thread_ioprio_class, OPT_STR, "") // allowed values: rt (realtime), be (best effort), idle
-OPTION(osd_disk_thread_ioprio_priority, OPT_INT, -1) // 0-7
-OPTION(osd_recover_clone_overlap, OPT_BOOL, true) // preserve clone_overlap during recovery/migration
-OPTION(osd_op_num_threads_per_shard, OPT_INT, 0)
-OPTION(osd_op_num_threads_per_shard_hdd, OPT_INT, 1)
-OPTION(osd_op_num_threads_per_shard_ssd, OPT_INT, 2)
-OPTION(osd_op_num_shards, OPT_INT, 0)
-OPTION(osd_op_num_shards_hdd, OPT_INT, 5)
-OPTION(osd_op_num_shards_ssd, OPT_INT, 8)
-
-// PrioritizedQueue (prio), Weighted Priority Queue (wpq; default),
-// mclock_opclass, mclock_client, or debug_random. "mclock_opclass"
-// and "mclock_client" are based on the mClock/dmClock algorithm
-// (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the
-// class the operation belongs to. "mclock_client" does the same but
-// also works to enforce fairness between clients. "debug_random"
-// chooses among all four with equal probability.
-OPTION(osd_op_queue, OPT_STR, "wpq")
-
-OPTION(osd_op_queue_cut_off, OPT_STR, "low") // Min priority to go to strict queue. (low, high, debug_random)
-
-// mClock priority queue parameters for five types of ops
-OPTION(osd_op_queue_mclock_client_op_res, OPT_DOUBLE, 1000.0)
-OPTION(osd_op_queue_mclock_client_op_wgt, OPT_DOUBLE, 500.0)
-OPTION(osd_op_queue_mclock_client_op_lim, OPT_DOUBLE, 0.0)
-OPTION(osd_op_queue_mclock_osd_subop_res, OPT_DOUBLE, 1000.0)
-OPTION(osd_op_queue_mclock_osd_subop_wgt, OPT_DOUBLE, 500.0)
-OPTION(osd_op_queue_mclock_osd_subop_lim, OPT_DOUBLE, 0.0)
-OPTION(osd_op_queue_mclock_snap_res, OPT_DOUBLE, 0.0)
-OPTION(osd_op_queue_mclock_snap_wgt, OPT_DOUBLE, 1.0)
-OPTION(osd_op_queue_mclock_snap_lim, OPT_DOUBLE, 0.001)
-OPTION(osd_op_queue_mclock_recov_res, OPT_DOUBLE, 0.0)
-OPTION(osd_op_queue_mclock_recov_wgt, OPT_DOUBLE, 1.0)
-OPTION(osd_op_queue_mclock_recov_lim, OPT_DOUBLE, 0.001)
-OPTION(osd_op_queue_mclock_scrub_res, OPT_DOUBLE, 0.0)
-OPTION(osd_op_queue_mclock_scrub_wgt, OPT_DOUBLE, 1.0)
-OPTION(osd_op_queue_mclock_scrub_lim, OPT_DOUBLE, 0.001)
-
-OPTION(osd_ignore_stale_divergent_priors, OPT_BOOL, false) // do not assert on divergent_prior entries which aren't in the log and whose on-disk objects are newer
-
-// Set to true for testing. Users should NOT set this.
-// If set to true, even after reading enough shards to
-// decode the object, any error will be reported.
-OPTION(osd_read_ec_check_for_errors, OPT_BOOL, false) // return error if any ec shard has an error - -// Only use clone_overlap for recovery if there are fewer than -// osd_recover_clone_overlap_limit entries in the overlap set -OPTION(osd_recover_clone_overlap_limit, OPT_INT, 10) - -OPTION(osd_backfill_scan_min, OPT_INT, 64) -OPTION(osd_backfill_scan_max, OPT_INT, 512) -OPTION(osd_op_thread_timeout, OPT_INT, 15) -OPTION(osd_op_thread_suicide_timeout, OPT_INT, 150) -OPTION(osd_recovery_thread_timeout, OPT_INT, 30) -OPTION(osd_recovery_thread_suicide_timeout, OPT_INT, 300) -OPTION(osd_recovery_sleep, OPT_FLOAT, 0.01) // seconds to sleep between recovery ops -OPTION(osd_snap_trim_sleep, OPT_DOUBLE, 0) -OPTION(osd_scrub_invalid_stats, OPT_BOOL, true) -OPTION(osd_remove_thread_timeout, OPT_INT, 60*60) -OPTION(osd_remove_thread_suicide_timeout, OPT_INT, 10*60*60) -OPTION(osd_command_thread_timeout, OPT_INT, 10*60) -OPTION(osd_command_thread_suicide_timeout, OPT_INT, 15*60) -OPTION(osd_heartbeat_addr, OPT_ADDR, entity_addr_t()) -OPTION(osd_heartbeat_interval, OPT_INT, 6) // (seconds) how often we ping peers - -// (seconds) how long before we decide a peer has failed -// This setting is read by the MONs and OSDs and has to be set to a equal value in both settings of the configuration -OPTION(osd_heartbeat_grace, OPT_INT, 20) -OPTION(osd_heartbeat_min_peers, OPT_INT, 10) // minimum number of peers -OPTION(osd_heartbeat_use_min_delay_socket, OPT_BOOL, false) // prio the heartbeat tcp socket and set dscp as CS6 on it if true -OPTION(osd_heartbeat_min_size, OPT_INT, 2000) // the minimum size of OSD heartbeat messages to send - -// max number of parallel snap trims/pg -OPTION(osd_pg_max_concurrent_snap_trims, OPT_U64, 2) -// max number of trimming pgs -OPTION(osd_max_trimming_pgs, OPT_U64, 2) - -// minimum number of peers that must be reachable to mark ourselves -// back up after being wrongly marked down. -OPTION(osd_heartbeat_min_healthy_ratio, OPT_FLOAT, .33) - -OPTION(osd_mon_heartbeat_interval, OPT_INT, 30) // (seconds) how often to ping monitor if no peers -OPTION(osd_mon_report_interval_max, OPT_INT, 600) -OPTION(osd_mon_report_interval_min, OPT_INT, 5) // pg stats, failures, up_thru, boot. 
-OPTION(osd_mon_report_max_in_flight, OPT_INT, 2) // max updates in flight -OPTION(osd_beacon_report_interval, OPT_INT, 300) // (second) how often to send beacon message to monitor -OPTION(osd_pg_stat_report_interval_max, OPT_INT, 500) // report pg stats for any given pg at least this often -OPTION(osd_mon_ack_timeout, OPT_DOUBLE, 30.0) // time out a mon if it doesn't ack stats -OPTION(osd_stats_ack_timeout_factor, OPT_DOUBLE, 2.0) // multiples of mon_ack_timeout -OPTION(osd_stats_ack_timeout_decay, OPT_DOUBLE, .9) -OPTION(osd_default_data_pool_replay_window, OPT_INT, 45) -OPTION(osd_auto_mark_unfound_lost, OPT_BOOL, false) -OPTION(osd_recovery_delay_start, OPT_FLOAT, 0) -OPTION(osd_recovery_max_active, OPT_U64, 3) -OPTION(osd_recovery_max_single_start, OPT_U64, 1) -OPTION(osd_recovery_max_chunk, OPT_U64, 8<<20) // max size of push chunk -OPTION(osd_recovery_max_omap_entries_per_chunk, OPT_U64, 64000) // max number of omap entries per chunk; 0 to disable limit -OPTION(osd_copyfrom_max_chunk, OPT_U64, 8<<20) // max size of a COPYFROM chunk -OPTION(osd_push_per_object_cost, OPT_U64, 1000) // push cost per object -OPTION(osd_max_push_cost, OPT_U64, 8<<20) // max size of push message -OPTION(osd_max_push_objects, OPT_U64, 10) // max objects in single push op -OPTION(osd_recovery_forget_lost_objects, OPT_BOOL, false) // off for now -OPTION(osd_max_scrubs, OPT_INT, 1) -OPTION(osd_scrub_during_recovery, OPT_BOOL, false) // Allow new scrubs to start while recovery is active on the OSD -OPTION(osd_scrub_begin_hour, OPT_INT, 0) -OPTION(osd_scrub_end_hour, OPT_INT, 24) -OPTION(osd_scrub_load_threshold, OPT_FLOAT, 0.5) -OPTION(osd_scrub_min_interval, OPT_FLOAT, 60*60*24) // if load is low -OPTION(osd_scrub_max_interval, OPT_FLOAT, 7*60*60*24) // regardless of load -OPTION(osd_scrub_interval_randomize_ratio, OPT_FLOAT, 0.5) // randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio)) -OPTION(osd_scrub_backoff_ratio, OPT_DOUBLE, .66) // the probability to back off the scheduled scrub -OPTION(osd_scrub_chunk_min, OPT_INT, 5) -OPTION(osd_scrub_chunk_max, OPT_INT, 25) -OPTION(osd_scrub_sleep, OPT_FLOAT, 0) // sleep between [deep]scrub ops -OPTION(osd_scrub_auto_repair, OPT_BOOL, false) // whether auto-repair inconsistencies upon deep-scrubbing -OPTION(osd_scrub_auto_repair_num_errors, OPT_U32, 5) // only auto-repair when number of errors is below this threshold -OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week -OPTION(osd_deep_scrub_randomize_ratio, OPT_FLOAT, 0.15) // scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs are deep) -OPTION(osd_deep_scrub_stride, OPT_INT, 524288) -OPTION(osd_deep_scrub_update_digest_min_age, OPT_INT, 2*60*60) // objects must be this old (seconds) before we update the whole-object digest on scrub -OPTION(osd_class_dir, OPT_STR, CEPH_LIBDIR "/rados-classes") // where rados plugins are stored -OPTION(osd_open_classes_on_start, OPT_BOOL, true) -OPTION(osd_class_load_list, OPT_STR, "cephfs hello journal lock log numops " - "rbd refcount replica_log rgw statelog timeindex user version") // list of object classes allowed to be loaded (allow all: *) -OPTION(osd_class_default_list, OPT_STR, "cephfs hello journal lock log numops " - "rbd refcount replica_log rgw statelog timeindex user version") // list of object classes with default execute perm (allow all: *) -OPTION(osd_check_for_log_corruption, OPT_BOOL, false) -OPTION(osd_use_stale_snap, OPT_BOOL, false) -OPTION(osd_rollback_to_cluster_snap, OPT_STR, "") 
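The scrub scheduling comments above describe two ratios: the next scrub lands somewhere in [min, min * (1 + randomize_ratio)), and each scheduled scrub has a deep_scrub_randomize_ratio chance of being promoted to a deep scrub. A minimal sketch of just those two ratios (the real scheduler also honours load, the allowed hours, and the max interval):

    // Sketch of the scrub interval randomization implied by the options above.
    #include <cstdio>
    #include <cstdlib>

    int main() {
      const double min_interval = 60 * 60 * 24; // osd_scrub_min_interval (1 day)
      const double randomize_ratio = 0.5;       // osd_scrub_interval_randomize_ratio
      const double deep_ratio = 0.15;           // osd_deep_scrub_randomize_ratio
      double r = std::rand() / (RAND_MAX + 1.0);               // uniform in [0,1)
      double next = min_interval * (1.0 + randomize_ratio * r); // in [min, min*1.5)
      bool deep = std::rand() / (RAND_MAX + 1.0) < deep_ratio;
      std::printf("next %sscrub scheduled in %.1f hours\n",
                  deep ? "deep " : "", next / 3600.0);
    }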
-OPTION(osd_default_notify_timeout, OPT_U32, 30) // default notify timeout in seconds -OPTION(osd_kill_backfill_at, OPT_INT, 0) - -// Bounds how infrequently a new map epoch will be persisted for a pg -OPTION(osd_pg_epoch_persisted_max_stale, OPT_U32, 40) // make this < map_cache_size! - -OPTION(osd_min_pg_log_entries, OPT_U32, 3000) // number of entries to keep in the pg log when trimming it -OPTION(osd_max_pg_log_entries, OPT_U32, 10000) // max entries, say when degraded, before we trim -OPTION(osd_force_recovery_pg_log_entries_factor, OPT_FLOAT, 1.3) // max entries factor before force recovery -OPTION(osd_pg_log_trim_min, OPT_U32, 100) -OPTION(osd_op_complaint_time, OPT_FLOAT, 30) // how many seconds old makes an op complaint-worthy -OPTION(osd_command_max_records, OPT_INT, 256) -OPTION(osd_max_pg_blocked_by, OPT_U32, 16) // max peer osds to report that are blocking our progress -OPTION(osd_op_log_threshold, OPT_INT, 5) // how many op log messages to show in one go -OPTION(osd_verify_sparse_read_holes, OPT_BOOL, false) // read fiemap-reported holes and verify they are zeros -OPTION(osd_backoff_on_unfound, OPT_BOOL, true) // object unfound -OPTION(osd_backoff_on_degraded, OPT_BOOL, false) // [mainly for debug?] object unreadable/writeable -OPTION(osd_backoff_on_down, OPT_BOOL, true) // pg in down/incomplete state -OPTION(osd_backoff_on_peering, OPT_BOOL, false) // [debug] pg peering -OPTION(osd_debug_crash_on_ignored_backoff, OPT_BOOL, false) // crash osd if client ignores a backoff; useful for debugging -OPTION(osd_debug_inject_dispatch_delay_probability, OPT_DOUBLE, 0) -OPTION(osd_debug_inject_dispatch_delay_duration, OPT_DOUBLE, .1) -OPTION(osd_debug_drop_ping_probability, OPT_DOUBLE, 0) -OPTION(osd_debug_drop_ping_duration, OPT_INT, 0) -OPTION(osd_debug_op_order, OPT_BOOL, false) -OPTION(osd_debug_verify_missing_on_start, OPT_BOOL, false) -OPTION(osd_debug_scrub_chance_rewrite_digest, OPT_U64, 0) -OPTION(osd_debug_verify_snaps_on_info, OPT_BOOL, false) -OPTION(osd_debug_verify_stray_on_activate, OPT_BOOL, false) -OPTION(osd_debug_skip_full_check_in_backfill_reservation, OPT_BOOL, false) -OPTION(osd_debug_reject_backfill_probability, OPT_DOUBLE, 0) -OPTION(osd_debug_inject_copyfrom_error, OPT_BOOL, false) // inject failure during copyfrom completion -OPTION(osd_debug_misdirected_ops, OPT_BOOL, false) -OPTION(osd_debug_skip_full_check_in_recovery, OPT_BOOL, false) -OPTION(osd_debug_random_push_read_error, OPT_DOUBLE, 0) -OPTION(osd_debug_verify_cached_snaps, OPT_BOOL, false) -OPTION(osd_enable_op_tracker, OPT_BOOL, true) // enable/disable OSD op tracking -OPTION(osd_num_op_tracker_shard, OPT_U32, 32) // The number of shards for holding the ops -OPTION(osd_op_history_size, OPT_U32, 20) // Max number of completed ops to track -OPTION(osd_op_history_duration, OPT_U32, 600) // Oldest completed op to track -OPTION(osd_op_history_slow_op_size, OPT_U32, 20) // Max number of slow ops to track -OPTION(osd_op_history_slow_op_threshold, OPT_DOUBLE, 10.0) // track the op if over this threshold -OPTION(osd_target_transaction_size, OPT_INT, 30) // to adjust various transactions that batch smaller items -OPTION(osd_failsafe_full_ratio, OPT_FLOAT, .97) // what % full makes an OSD "full" (failsafe) -OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL, true) // immediately mark OSDs as down once they refuse to accept connections - -OPTION(osd_pg_object_context_cache_count, OPT_INT, 64) -OPTION(osd_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled -OPTION(osd_function_tracing, 
OPT_BOOL, false) // true if function instrumentation should use LTTng
-
-OPTION(osd_fast_info, OPT_BOOL, true) // use fast info attr, if we can
-
-// determines whether PGLog::check() compares written out log to stored log
-OPTION(osd_debug_pg_log_writeout, OPT_BOOL, false)
-OPTION(osd_loop_before_reset_tphandle, OPT_U32, 64) // Max number of loops before we reset thread-pool's handle
-// default timeout while calling WaitInterval on an empty queue
-OPTION(threadpool_default_timeout, OPT_INT, 60)
-// default wait time for an empty queue before pinging the hb timeout
-OPTION(threadpool_empty_queue_max_wait, OPT_INT, 2)
-
-OPTION(leveldb_log_to_ceph_log, OPT_BOOL, true)
-OPTION(leveldb_write_buffer_size, OPT_U64, 8 *1024*1024) // leveldb write buffer size
-OPTION(leveldb_cache_size, OPT_U64, 128 *1024*1024) // leveldb cache size
-OPTION(leveldb_block_size, OPT_U64, 0) // leveldb block size
-OPTION(leveldb_bloom_size, OPT_INT, 0) // leveldb bloom bits per entry
-OPTION(leveldb_max_open_files, OPT_INT, 0) // leveldb max open files
-OPTION(leveldb_compression, OPT_BOOL, true) // leveldb uses compression
-OPTION(leveldb_paranoid, OPT_BOOL, false) // leveldb paranoid flag
-OPTION(leveldb_log, OPT_STR, "/dev/null") // enable leveldb log file
-OPTION(leveldb_compact_on_mount, OPT_BOOL, false)
-
-OPTION(kinetic_host, OPT_STR, "") // hostname or ip address of a kinetic drive to use
-OPTION(kinetic_port, OPT_INT, 8123) // port number of the kinetic drive
-OPTION(kinetic_user_id, OPT_INT, 1) // kinetic user to authenticate as
-OPTION(kinetic_hmac_key, OPT_STR, "asdfasdf") // kinetic key to authenticate with
-OPTION(kinetic_use_ssl, OPT_BOOL, false) // whether to secure kinetic traffic with TLS
-
-
-OPTION(rocksdb_separate_wal_dir, OPT_BOOL, false) // use $path.wal for wal
-SAFE_OPTION(rocksdb_db_paths, OPT_STR, "") // path,size( path,size)*
-OPTION(rocksdb_log_to_ceph_log, OPT_BOOL, true) // log to ceph log
-OPTION(rocksdb_cache_size, OPT_U64, 128*1024*1024) // rocksdb cache size (unless set by bluestore/etc)
-OPTION(rocksdb_cache_row_ratio, OPT_FLOAT, 0) // ratio of cache for row (vs block)
-OPTION(rocksdb_cache_shard_bits, OPT_INT, 4) // rocksdb block cache shard bits, 4 bit -> 16 shards
-OPTION(rocksdb_cache_type, OPT_STR, "lru") // 'lru' or 'clock'
-OPTION(rocksdb_block_size, OPT_INT, 4*1024) // default rocksdb block size
-OPTION(rocksdb_perf, OPT_BOOL, false) // Enabling this will have a 5-10% performance impact for the stats collection
-OPTION(rocksdb_collect_compaction_stats, OPT_BOOL, false) // For rocksdb, this behavior will be an overhead of 5%~10%, collected only if rocksdb_perf is enabled.
-OPTION(rocksdb_collect_extended_stats, OPT_BOOL, false) // For rocksdb, this behavior will be an overhead of 5%~10%, collected only if rocksdb_perf is enabled.
-OPTION(rocksdb_collect_memory_stats, OPT_BOOL, false) // For rocksdb, this behavior will be an overhead of 5%~10%, collected only if rocksdb_perf is enabled.
-OPTION(rocksdb_enable_rmrange, OPT_BOOL, false) // see https://github.com/facebook/rocksdb/blob/master/include/rocksdb/db.h#L253
-
-// rocksdb options that will be used for omap (if omap_backend is rocksdb)
-OPTION(filestore_rocksdb_options, OPT_STR, "")
-// rocksdb options that will be used in monstore
-OPTION(mon_rocksdb_options, OPT_STR, "write_buffer_size=33554432,compression=kNoCompression")
-
-/**
- * osd_*_priority adjusts the relative priority of client io, recovery io,
- * snaptrim io, etc
- *
- * osd_*_priority determines the ratio of available io between client and
- * recovery.
Each option may be set between - * 1..63. - */ -OPTION(osd_client_op_priority, OPT_U32, 63) -OPTION(osd_recovery_op_priority, OPT_U32, 3) - -OPTION(osd_snap_trim_priority, OPT_U32, 5) -OPTION(osd_snap_trim_cost, OPT_U32, 1<<20) // set default cost equal to 1MB io - -OPTION(osd_scrub_priority, OPT_U32, 5) -// set default cost equal to 50MB io -OPTION(osd_scrub_cost, OPT_U32, 50<<20) -// set requested scrub priority higher than scrub priority to make the -// requested scrubs jump the queue of scheduled scrubs -OPTION(osd_requested_scrub_priority, OPT_U32, 120) - -OPTION(osd_recovery_priority, OPT_U32, 5) -// set default cost equal to 20MB io -OPTION(osd_recovery_cost, OPT_U32, 20<<20) - -/** - * osd_recovery_op_warn_multiple scales the normal warning threshhold, - * osd_op_complaint_time, so that slow recovery ops won't cause noise - */ -OPTION(osd_recovery_op_warn_multiple, OPT_U32, 16) - -// Max time to wait between notifying mon of shutdown and shutting down -OPTION(osd_mon_shutdown_timeout, OPT_DOUBLE, 5) -OPTION(osd_shutdown_pgref_assert, OPT_BOOL, false) // crash if the OSD has stray PG refs on shutdown - -OPTION(osd_max_object_size, OPT_U64, 128*1024L*1024L) // OSD's maximum object size -OPTION(osd_max_object_name_len, OPT_U32, 2048) // max rados object name len -OPTION(osd_max_object_namespace_len, OPT_U32, 256) // max rados object namespace len -OPTION(osd_max_attr_name_len, OPT_U32, 100) // max rados attr name len; cannot go higher than 100 chars for file system backends -OPTION(osd_max_attr_size, OPT_U64, 0) - -OPTION(osd_max_omap_entries_per_request, OPT_U64, 131072) -OPTION(osd_max_omap_bytes_per_request, OPT_U64, 1<<30) - -OPTION(osd_objectstore, OPT_STR, "filestore") // ObjectStore backend type -OPTION(osd_objectstore_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled -OPTION(osd_objectstore_fuse, OPT_BOOL, false) - -OPTION(osd_bench_small_size_max_iops, OPT_U32, 100) // 100 IOPS -OPTION(osd_bench_large_size_max_throughput, OPT_U64, 100 << 20) // 100 MB/s -OPTION(osd_bench_max_block_size, OPT_U64, 64 << 20) // cap the block size at 64MB -OPTION(osd_bench_duration, OPT_U32, 30) // duration of 'osd bench', capped at 30s to avoid triggering timeouts - -OPTION(osd_blkin_trace_all, OPT_BOOL, false) // create a blkin trace for all osd requests -OPTION(osdc_blkin_trace_all, OPT_BOOL, false) // create a blkin trace for all objecter requests - -OPTION(osd_discard_disconnected_ops, OPT_BOOL, true) - -OPTION(memstore_device_bytes, OPT_U64, 1024*1024*1024) -OPTION(memstore_page_set, OPT_BOOL, true) -OPTION(memstore_page_size, OPT_U64, 64 << 10) - -OPTION(bdev_debug_inflight_ios, OPT_BOOL, false) -OPTION(bdev_inject_crash, OPT_INT, 0) // if N>0, then ~ 1/N IOs will complete before we crash on flush. -OPTION(bdev_inject_crash_flush_delay, OPT_INT, 2) // wait N more seconds on flush -OPTION(bdev_aio, OPT_BOOL, true) -OPTION(bdev_aio_poll_ms, OPT_INT, 250) // milliseconds -OPTION(bdev_aio_max_queue_depth, OPT_INT, 1024) -OPTION(bdev_aio_reap_max, OPT_INT, 16) -OPTION(bdev_block_size, OPT_INT, 4096) -OPTION(bdev_debug_aio, OPT_BOOL, false) -OPTION(bdev_debug_aio_suicide_timeout, OPT_FLOAT, 60.0) - -// if yes, osd will unbind all NVMe devices from kernel driver and bind them -// to the uio_pci_generic driver. The purpose is to prevent the case where -// NVMe driver is loaded while osd is running. 
-OPTION(bdev_nvme_unbind_from_kernel, OPT_BOOL, false) -OPTION(bdev_nvme_retry_count, OPT_INT, -1) // -1 means by default which is 4 - -OPTION(objectstore_blackhole, OPT_BOOL, false) - -OPTION(bluefs_alloc_size, OPT_U64, 1048576) -OPTION(bluefs_max_prefetch, OPT_U64, 1048576) -OPTION(bluefs_min_log_runway, OPT_U64, 1048576) // alloc when we get this low -OPTION(bluefs_max_log_runway, OPT_U64, 4194304) // alloc this much at a time -OPTION(bluefs_log_compact_min_ratio, OPT_FLOAT, 5.0) // before we consider -OPTION(bluefs_log_compact_min_size, OPT_U64, 16*1048576) // before we consider -OPTION(bluefs_min_flush_size, OPT_U64, 524288) // ignore flush until its this big -OPTION(bluefs_compact_log_sync, OPT_BOOL, false) // sync or async log compaction? -OPTION(bluefs_buffered_io, OPT_BOOL, false) -OPTION(bluefs_sync_write, OPT_BOOL, false) -OPTION(bluefs_allocator, OPT_STR, "bitmap") // stupid | bitmap -OPTION(bluefs_preextend_wal_files, OPT_BOOL, false) // this *requires* that rocksdb has recycling enabled - -OPTION(bluestore_bluefs, OPT_BOOL, true) -OPTION(bluestore_bluefs_env_mirror, OPT_BOOL, false) // mirror to normal Env for debug -OPTION(bluestore_bluefs_min, OPT_U64, 1*1024*1024*1024) // 1gb -OPTION(bluestore_bluefs_min_ratio, OPT_FLOAT, .02) // min fs free / total free -OPTION(bluestore_bluefs_max_ratio, OPT_FLOAT, .90) // max fs free / total free -OPTION(bluestore_bluefs_gift_ratio, OPT_FLOAT, .02) // how much to add at a time -OPTION(bluestore_bluefs_reclaim_ratio, OPT_FLOAT, .20) // how much to reclaim at a time -OPTION(bluestore_bluefs_balance_interval, OPT_FLOAT, 1) // how often (sec) to balance free space between bluefs and bluestore -// If you want to use spdk driver, you need to specify NVMe serial number here -// with "spdk:" prefix. -// Users can use 'lspci -vvv -d 8086:0953 | grep "Device Serial Number"' to -// get the serial number of Intel(R) Fultondale NVMe controllers. -// Example: -// bluestore_block_path = spdk:55cd2e404bd73932 -// If you want to run multiple SPDK instances per node, you must specify the -// amount of dpdk memory size in MB each instance will use, to make sure each -// instance uses its own dpdk memory -OPTION(bluestore_spdk_mem, OPT_U32, 512) -// A hexadecimal bit mask of the cores to run on. Note the core numbering can change between platforms and should be determined beforehand. -OPTION(bluestore_spdk_coremask, OPT_STR, "0x3") -// Specify the maximal I/Os to be batched completed while checking queue pair completions. -// Default value 0 means that let SPDK nvme library determine the value. -OPTION(bluestore_spdk_max_io_completion, OPT_U32, 0) -OPTION(bluestore_block_path, OPT_STR, "") -OPTION(bluestore_block_size, OPT_U64, 10 * 1024*1024*1024) // 10gb for testing -OPTION(bluestore_block_create, OPT_BOOL, true) -OPTION(bluestore_block_db_path, OPT_STR, "") -OPTION(bluestore_block_db_size, OPT_U64, 0) // rocksdb ssts (hot/warm) -OPTION(bluestore_block_db_create, OPT_BOOL, false) -OPTION(bluestore_block_wal_path, OPT_STR, "") -OPTION(bluestore_block_wal_size, OPT_U64, 96 * 1024*1024) // rocksdb wal -OPTION(bluestore_block_wal_create, OPT_BOOL, false) -OPTION(bluestore_block_preallocate_file, OPT_BOOL, false) //whether preallocate space if block/db_path/wal_path is file rather that block device. 
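As an aside on the SPDK settings above: bluestore_spdk_coremask is described as a plain hexadecimal bit mask of CPU cores, so the default "0x3" selects cores 0 and 1. The following is only a small, hypothetical C++ sketch of that interpretation, not Ceph code:

    #include <cstdint>
    #include <cstdlib>
    #include <iostream>

    int main() {
      const char *coremask = "0x3";                         // bluestore_spdk_coremask default
      uint64_t mask = std::strtoull(coremask, nullptr, 16); // parse the hex mask
      for (int core = 0; core < 64; ++core) {
        if (mask & (1ULL << core))                          // each set bit names one core index
          std::cout << "SPDK threads may run on core " << core << "\n";
      }
      return 0;
    }

Note that, as the comment above warns, core numbering can differ between platforms, so the mask should be chosen per machine.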
-OPTION(bluestore_csum_type, OPT_STR, "crc32c") // none|xxhash32|xxhash64|crc32c|crc32c_16|crc32c_8 -OPTION(bluestore_csum_min_block, OPT_U32, 4096) -OPTION(bluestore_csum_max_block, OPT_U32, 64*1024) -OPTION(bluestore_min_alloc_size, OPT_U32, 0) -OPTION(bluestore_min_alloc_size_hdd, OPT_U32, 64*1024) -OPTION(bluestore_min_alloc_size_ssd, OPT_U32, 16*1024) -OPTION(bluestore_max_alloc_size, OPT_U32, 0) -OPTION(bluestore_prefer_deferred_size, OPT_U32, 0) -OPTION(bluestore_prefer_deferred_size_hdd, OPT_U32, 32768) -OPTION(bluestore_prefer_deferred_size_ssd, OPT_U32, 0) -OPTION(bluestore_compression_mode, OPT_STR, "none") // force|aggressive|passive|none -OPTION(bluestore_compression_algorithm, OPT_STR, "snappy") -OPTION(bluestore_compression_min_blob_size, OPT_U32, 0) -OPTION(bluestore_compression_min_blob_size_hdd, OPT_U32, 128*1024) -OPTION(bluestore_compression_min_blob_size_ssd, OPT_U32, 8*1024) -OPTION(bluestore_compression_max_blob_size, OPT_U32, 0) -OPTION(bluestore_compression_max_blob_size_hdd, OPT_U32, 512*1024) -OPTION(bluestore_compression_max_blob_size_ssd, OPT_U32, 64*1024) -/* - * Specifies minimum expected amount of saved allocation units - * per single blob to enable compressed blobs garbage collection - * - */ -OPTION(bluestore_gc_enable_blob_threshold, OPT_INT, 0) -/* - * Specifies minimum expected amount of saved allocation units - * per all blobsb to enable compressed blobs garbage collection - * - */ -OPTION(bluestore_gc_enable_total_threshold, OPT_INT, 0) - -OPTION(bluestore_max_blob_size, OPT_U32, 0) -OPTION(bluestore_max_blob_size_hdd, OPT_U32, 512*1024) -OPTION(bluestore_max_blob_size_ssd, OPT_U32, 64*1024) -/* - * Require the net gain of compression at least to be at this ratio, - * otherwise we don't compress. - * And ask for compressing at least 12.5%(1/8) off, by default. - */ -OPTION(bluestore_compression_required_ratio, OPT_DOUBLE, .875) -OPTION(bluestore_extent_map_shard_max_size, OPT_U32, 1200) -OPTION(bluestore_extent_map_shard_target_size, OPT_U32, 500) -OPTION(bluestore_extent_map_shard_min_size, OPT_U32, 150) -OPTION(bluestore_extent_map_shard_target_size_slop, OPT_DOUBLE, .2) -OPTION(bluestore_extent_map_inline_shard_prealloc_size, OPT_U32, 256) -OPTION(bluestore_cache_trim_interval, OPT_DOUBLE, .2) -OPTION(bluestore_cache_trim_max_skip_pinned, OPT_U32, 64) // skip this many onodes pinned in cache before we give up -OPTION(bluestore_cache_type, OPT_STR, "2q") // lru, 2q -OPTION(bluestore_2q_cache_kin_ratio, OPT_DOUBLE, .5) // kin page slot size / max page slot size -OPTION(bluestore_2q_cache_kout_ratio, OPT_DOUBLE, .5) // number of kout page slot / total number of page slot -OPTION(bluestore_cache_size, OPT_U64, 0) -OPTION(bluestore_cache_size_hdd, OPT_U64, 1*1024*1024*1024) -OPTION(bluestore_cache_size_ssd, OPT_U64, 3*1024*1024*1024) -OPTION(bluestore_cache_meta_ratio, OPT_DOUBLE, .01) -OPTION(bluestore_cache_kv_ratio, OPT_DOUBLE, .99) -OPTION(bluestore_cache_kv_max, OPT_U64, 512*1024*1024) // limit the maximum amount of cache for the kv store -OPTION(bluestore_kvbackend, OPT_STR, "rocksdb") -OPTION(bluestore_allocator, OPT_STR, "bitmap") // stupid | bitmap -OPTION(bluestore_freelist_blocks_per_key, OPT_INT, 128) -OPTION(bluestore_bitmapallocator_blocks_per_zone, OPT_INT, 1024) // must be power of 2 aligned, e.g., 512, 1024, 2048... -OPTION(bluestore_bitmapallocator_span_size, OPT_INT, 1024) // must be power of 2 aligned, e.g., 512, 1024, 2048... 
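To make the compression threshold above concrete: with the default bluestore_compression_required_ratio of 0.875, a blob is kept in compressed form only if the compressed size is no more than 87.5% of the original, i.e. at least 12.5% (1/8) must be saved. A standalone C++ illustration (the 64 KiB figure is just the SSD max-blob default quoted above):

    #include <cstdint>
    #include <iostream>

    int main() {
      const double required_ratio = 0.875;        // bluestore_compression_required_ratio default
      const uint64_t blob_bytes = 64 * 1024;      // e.g. bluestore_max_blob_size_ssd default
      const uint64_t max_compressed =
          static_cast<uint64_t>(blob_bytes * required_ratio);
      std::cout << "a " << blob_bytes << "-byte blob is stored compressed only if it "
                << "shrinks to <= " << max_compressed << " bytes\n";  // 57344
      return 0;
    }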
-OPTION(bluestore_max_deferred_txc, OPT_U64, 32) -OPTION(bluestore_rocksdb_options, OPT_STR, "compression=kNoCompression,max_write_buffer_number=4,min_write_buffer_number_to_merge=1,recycle_log_file_num=4,write_buffer_size=268435456,writable_file_max_buffer_size=0,compaction_readahead_size=2097152") -OPTION(bluestore_fsck_on_mount, OPT_BOOL, false) -OPTION(bluestore_fsck_on_mount_deep, OPT_BOOL, true) -OPTION(bluestore_fsck_on_umount, OPT_BOOL, false) -OPTION(bluestore_fsck_on_umount_deep, OPT_BOOL, true) -OPTION(bluestore_fsck_on_mkfs, OPT_BOOL, true) -OPTION(bluestore_fsck_on_mkfs_deep, OPT_BOOL, false) -OPTION(bluestore_sync_submit_transaction, OPT_BOOL, false) // submit kv txn in queueing thread (not kv_sync_thread) -OPTION(bluestore_throttle_bytes, OPT_U64, 64*1024*1024) -OPTION(bluestore_throttle_deferred_bytes, OPT_U64, 128*1024*1024) -OPTION(bluestore_throttle_cost_per_io_hdd, OPT_U64, 670000) -OPTION(bluestore_throttle_cost_per_io_ssd, OPT_U64, 4000) -OPTION(bluestore_throttle_cost_per_io, OPT_U64, 0) -OPTION(bluestore_deferred_batch_ops, OPT_U64, 0) -OPTION(bluestore_deferred_batch_ops_hdd, OPT_U64, 64) -OPTION(bluestore_deferred_batch_ops_ssd, OPT_U64, 16) -OPTION(bluestore_nid_prealloc, OPT_INT, 1024) -OPTION(bluestore_blobid_prealloc, OPT_U64, 10240) -OPTION(bluestore_clone_cow, OPT_BOOL, true) // do copy-on-write for clones -OPTION(bluestore_default_buffered_read, OPT_BOOL, true) -OPTION(bluestore_default_buffered_write, OPT_BOOL, false) -OPTION(bluestore_debug_misc, OPT_BOOL, false) -OPTION(bluestore_debug_no_reuse_blocks, OPT_BOOL, false) -OPTION(bluestore_debug_small_allocations, OPT_INT, 0) -OPTION(bluestore_debug_freelist, OPT_BOOL, false) -OPTION(bluestore_debug_prefill, OPT_FLOAT, 0) -OPTION(bluestore_debug_prefragment_max, OPT_INT, 1048576) -OPTION(bluestore_debug_inject_read_err, OPT_BOOL, false) -OPTION(bluestore_debug_randomize_serial_transaction, OPT_INT, 0) -OPTION(bluestore_debug_omit_block_device_write, OPT_BOOL, false) -OPTION(bluestore_debug_fsck_abort, OPT_BOOL, false) -OPTION(bluestore_debug_omit_kv_commit, OPT_BOOL, false) -OPTION(bluestore_debug_permit_any_bdev_label, OPT_BOOL, false) -OPTION(bluestore_shard_finishers, OPT_BOOL, false) -OPTION(bluestore_debug_random_read_err, OPT_DOUBLE, 0) - -OPTION(kstore_max_ops, OPT_U64, 512) -OPTION(kstore_max_bytes, OPT_U64, 64*1024*1024) -OPTION(kstore_backend, OPT_STR, "rocksdb") -OPTION(kstore_rocksdb_options, OPT_STR, "compression=kNoCompression") -OPTION(kstore_rocksdb_bloom_bits_per_key, OPT_INT, 0) -OPTION(kstore_fsck_on_mount, OPT_BOOL, false) -OPTION(kstore_fsck_on_mount_deep, OPT_BOOL, true) -OPTION(kstore_nid_prealloc, OPT_U64, 1024) -OPTION(kstore_sync_transaction, OPT_BOOL, false) -OPTION(kstore_sync_submit_transaction, OPT_BOOL, false) -OPTION(kstore_onode_map_size, OPT_U64, 1024) -OPTION(kstore_default_stripe_size, OPT_INT, 65536) - -OPTION(filestore_omap_backend, OPT_STR, "rocksdb") -OPTION(filestore_omap_backend_path, OPT_STR, "") - -/// filestore wb throttle limits -OPTION(filestore_wbthrottle_enable, OPT_BOOL, true) -OPTION(filestore_wbthrottle_btrfs_bytes_start_flusher, OPT_U64, 41943040) -OPTION(filestore_wbthrottle_btrfs_bytes_hard_limit, OPT_U64, 419430400) -OPTION(filestore_wbthrottle_btrfs_ios_start_flusher, OPT_U64, 500) -OPTION(filestore_wbthrottle_btrfs_ios_hard_limit, OPT_U64, 5000) -OPTION(filestore_wbthrottle_btrfs_inodes_start_flusher, OPT_U64, 500) -OPTION(filestore_wbthrottle_xfs_bytes_start_flusher, OPT_U64, 41943040) -OPTION(filestore_wbthrottle_xfs_bytes_hard_limit, OPT_U64, 
419430400) -OPTION(filestore_wbthrottle_xfs_ios_start_flusher, OPT_U64, 500) -OPTION(filestore_wbthrottle_xfs_ios_hard_limit, OPT_U64, 5000) -OPTION(filestore_wbthrottle_xfs_inodes_start_flusher, OPT_U64, 500) - -/// These must be less than the fd limit -OPTION(filestore_wbthrottle_btrfs_inodes_hard_limit, OPT_U64, 5000) -OPTION(filestore_wbthrottle_xfs_inodes_hard_limit, OPT_U64, 5000) - -//Introduce a O_DSYNC write in the filestore -OPTION(filestore_odsync_write, OPT_BOOL, false) - -// Tests index failure paths -OPTION(filestore_index_retry_probability, OPT_DOUBLE, 0) - -// Allow object read error injection -OPTION(filestore_debug_inject_read_err, OPT_BOOL, false) -OPTION(filestore_debug_random_read_err, OPT_DOUBLE, 0) - -OPTION(filestore_debug_omap_check, OPT_BOOL, false) // Expensive debugging check on sync -OPTION(filestore_omap_header_cache_size, OPT_INT, 1024) - -// Use omap for xattrs for attrs over -// filestore_max_inline_xattr_size or -OPTION(filestore_max_inline_xattr_size, OPT_U32, 0) //Override -OPTION(filestore_max_inline_xattr_size_xfs, OPT_U32, 65536) -OPTION(filestore_max_inline_xattr_size_btrfs, OPT_U32, 2048) -OPTION(filestore_max_inline_xattr_size_other, OPT_U32, 512) - -// for more than filestore_max_inline_xattrs attrs -OPTION(filestore_max_inline_xattrs, OPT_U32, 0) //Override -OPTION(filestore_max_inline_xattrs_xfs, OPT_U32, 10) -OPTION(filestore_max_inline_xattrs_btrfs, OPT_U32, 10) -OPTION(filestore_max_inline_xattrs_other, OPT_U32, 2) - -// max xattr value size -OPTION(filestore_max_xattr_value_size, OPT_U32, 0) //Override -OPTION(filestore_max_xattr_value_size_xfs, OPT_U32, 64<<10) -OPTION(filestore_max_xattr_value_size_btrfs, OPT_U32, 64<<10) -// ext4 allows 4k xattrs total including some smallish extra fields and the -// keys. We're allowing 2 512 inline attrs in addition some some filestore -// replay attrs. After accounting for those, we still need to fit up to -// two attrs of this value. That means we need this value to be around 1k -// to be safe. This is hacky, but it's not worth complicating the code -// to work around ext4's total xattr limit. -OPTION(filestore_max_xattr_value_size_other, OPT_U32, 1<<10) - -OPTION(filestore_sloppy_crc, OPT_BOOL, false) // track sloppy crcs -OPTION(filestore_sloppy_crc_block_size, OPT_INT, 65536) - -OPTION(filestore_max_alloc_hint_size, OPT_U64, 1ULL << 20) // bytes - -OPTION(filestore_max_sync_interval, OPT_DOUBLE, 5) // seconds -OPTION(filestore_min_sync_interval, OPT_DOUBLE, .01) // seconds -OPTION(filestore_btrfs_snap, OPT_BOOL, true) -OPTION(filestore_btrfs_clone_range, OPT_BOOL, true) -OPTION(filestore_zfs_snap, OPT_BOOL, false) // zfsonlinux is still unstable -OPTION(filestore_fsync_flushes_journal_data, OPT_BOOL, false) -OPTION(filestore_fiemap, OPT_BOOL, false) // (try to) use fiemap -OPTION(filestore_punch_hole, OPT_BOOL, false) -OPTION(filestore_seek_data_hole, OPT_BOOL, false) // (try to) use seek_data/hole -OPTION(filestore_splice, OPT_BOOL, false) -OPTION(filestore_fadvise, OPT_BOOL, true) -//collect device partition information for management application to use -OPTION(filestore_collect_device_partition_information, OPT_BOOL, true) - -// (try to) use extsize for alloc hint NOTE: extsize seems to trigger -// data corruption in xfs prior to kernel 3.5. filestore will -// implicity disable this if it cannot confirm the kernel is newer -// than that. -// NOTE: This option involves a tradeoff: When disabled, fragmentation is -// worse, but large sequential writes are faster. 
When enabled, large -// sequential writes are slower, but fragmentation is reduced. -OPTION(filestore_xfs_extsize, OPT_BOOL, false) - -OPTION(filestore_journal_parallel, OPT_BOOL, false) -OPTION(filestore_journal_writeahead, OPT_BOOL, false) -OPTION(filestore_journal_trailing, OPT_BOOL, false) -OPTION(filestore_queue_max_ops, OPT_U64, 50) -OPTION(filestore_queue_max_bytes, OPT_U64, 100 << 20) - -OPTION(filestore_caller_concurrency, OPT_INT, 10) - -/// Expected filestore throughput in B/s -OPTION(filestore_expected_throughput_bytes, OPT_DOUBLE, 200 << 20) -/// Expected filestore throughput in ops/s -OPTION(filestore_expected_throughput_ops, OPT_DOUBLE, 200) - -/// Filestore max delay multiple. Defaults to 0 (disabled) -OPTION(filestore_queue_max_delay_multiple, OPT_DOUBLE, 0) -/// Filestore high delay multiple. Defaults to 0 (disabled) -OPTION(filestore_queue_high_delay_multiple, OPT_DOUBLE, 0) - -/// Use above to inject delays intended to keep the op queue between low and high -OPTION(filestore_queue_low_threshhold, OPT_DOUBLE, 0.3) -OPTION(filestore_queue_high_threshhold, OPT_DOUBLE, 0.9) - -OPTION(filestore_op_threads, OPT_INT, 2) -OPTION(filestore_op_thread_timeout, OPT_INT, 60) -OPTION(filestore_op_thread_suicide_timeout, OPT_INT, 180) -OPTION(filestore_commit_timeout, OPT_FLOAT, 600) -OPTION(filestore_fiemap_threshold, OPT_INT, 4096) -OPTION(filestore_merge_threshold, OPT_INT, 10) -OPTION(filestore_split_multiple, OPT_INT, 2) -OPTION(filestore_split_rand_factor, OPT_U32, 20) // randomize the split threshold by adding 16 * [0, rand_factor) -OPTION(filestore_update_to, OPT_INT, 1000) -OPTION(filestore_blackhole, OPT_BOOL, false) // drop any new transactions on the floor -OPTION(filestore_fd_cache_size, OPT_INT, 128) // FD lru size -OPTION(filestore_fd_cache_shards, OPT_INT, 16) // FD number of shards -OPTION(filestore_ondisk_finisher_threads, OPT_INT, 1) -OPTION(filestore_apply_finisher_threads, OPT_INT, 1) -OPTION(filestore_dump_file, OPT_STR, "") // file onto which store transaction dumps -OPTION(filestore_kill_at, OPT_INT, 0) // inject a failure at the n'th opportunity -OPTION(filestore_inject_stall, OPT_INT, 0) // artificially stall for N seconds in op queue thread -OPTION(filestore_fail_eio, OPT_BOOL, true) // fail/crash on EIO -OPTION(filestore_debug_verify_split, OPT_BOOL, false) -OPTION(journal_dio, OPT_BOOL, true) -OPTION(journal_aio, OPT_BOOL, true) -OPTION(journal_force_aio, OPT_BOOL, false) -OPTION(journal_block_size, OPT_INT, 4096) - -// max bytes to search ahead in journal searching for corruption -OPTION(journal_max_corrupt_search, OPT_U64, 10<<20) -OPTION(journal_block_align, OPT_BOOL, true) -OPTION(journal_write_header_frequency, OPT_U64, 0) -OPTION(journal_max_write_bytes, OPT_INT, 10 << 20) -OPTION(journal_max_write_entries, OPT_INT, 100) - -/// Target range for journal fullness -OPTION(journal_throttle_low_threshhold, OPT_DOUBLE, 0.6) -OPTION(journal_throttle_high_threshhold, OPT_DOUBLE, 0.9) - -/// Multiple over expected at high_threshhold. Defaults to 0 (disabled). -OPTION(journal_throttle_high_multiple, OPT_DOUBLE, 0) -/// Multiple over expected at max. Defaults to 0 (disabled). -OPTION(journal_throttle_max_multiple, OPT_DOUBLE, 0) - -OPTION(journal_align_min_size, OPT_INT, 64 << 10) // align data payloads >= this. 
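Reading the journal throttle thresholds above as fractions of total journal capacity (an assumption made only for this illustration, and the journal size itself is hypothetical), the 0.6/0.9 defaults translate into a target byte range like this:

    #include <cstdint>
    #include <iostream>

    int main() {
      const double low = 0.6, high = 0.9;         // journal_throttle_{low,high}_threshhold defaults
      const uint64_t journal_bytes = 5ULL << 30;  // hypothetical 5 GiB journal
      std::cout << "throttling would aim to keep journal usage between "
                << static_cast<uint64_t>(journal_bytes * low) << " and "
                << static_cast<uint64_t>(journal_bytes * high) << " bytes\n";
      return 0;
    }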
-OPTION(journal_replay_from, OPT_INT, 0) -OPTION(journal_zero_on_create, OPT_BOOL, false) -OPTION(journal_ignore_corruption, OPT_BOOL, false) // assume journal is not corrupt -OPTION(journal_discard, OPT_BOOL, false) // when using an SSD disk as the journal, whether to discard (trim) unused journal data - -OPTION(fio_dir, OPT_STR, "/tmp/fio") // fio data directory for fio-objectstore - -OPTION(rados_mon_op_timeout, OPT_DOUBLE, 0) // how many seconds to wait for a response from the monitor before returning an error from a rados operation. 0 means no limit. -OPTION(rados_osd_op_timeout, OPT_DOUBLE, 0) // how many seconds to wait for a response from osds before returning an error from a rados operation. 0 means no limit. -OPTION(rados_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled - -OPTION(rbd_op_threads, OPT_INT, 1) -OPTION(rbd_op_thread_timeout, OPT_INT, 60) -OPTION(rbd_non_blocking_aio, OPT_BOOL, true) // process AIO ops from a worker thread to prevent blocking -OPTION(rbd_cache, OPT_BOOL, true) // whether to enable caching (writeback unless rbd_cache_max_dirty is 0) -OPTION(rbd_cache_writethrough_until_flush, OPT_BOOL, true) // whether to make writeback caching writethrough until flush is called, to be sure the user of librbd will send flushes so that writeback is safe -OPTION(rbd_cache_size, OPT_LONGLONG, 32<<20) // cache size in bytes -OPTION(rbd_cache_max_dirty, OPT_LONGLONG, 24<<20) // dirty limit in bytes - set to 0 for write-through caching -OPTION(rbd_cache_target_dirty, OPT_LONGLONG, 16<<20) // target dirty limit in bytes -OPTION(rbd_cache_max_dirty_age, OPT_FLOAT, 1.0) // seconds in cache before writeback starts -OPTION(rbd_cache_max_dirty_object, OPT_INT, 0) // dirty limit for objects - set to 0 to auto-calculate from rbd_cache_size -OPTION(rbd_cache_block_writes_upfront, OPT_BOOL, false) // whether to block writes to the cache before the aio_write call completes (true), or block before the aio completion is called (false) -OPTION(rbd_concurrent_management_ops, OPT_INT, 10) // how many operations can be in flight for a management operation like deleting or resizing an image -OPTION(rbd_balance_snap_reads, OPT_BOOL, false) -OPTION(rbd_localize_snap_reads, OPT_BOOL, false) -OPTION(rbd_balance_parent_reads, OPT_BOOL, false) -OPTION(rbd_localize_parent_reads, OPT_BOOL, true) -OPTION(rbd_readahead_trigger_requests, OPT_INT, 10) // number of sequential requests necessary to trigger readahead -OPTION(rbd_readahead_max_bytes, OPT_LONGLONG, 512 * 1024) // set to 0 to disable readahead -OPTION(rbd_readahead_disable_after_bytes, OPT_LONGLONG, 50 * 1024 * 1024) // how many bytes are read in total before readahead is disabled -OPTION(rbd_clone_copy_on_read, OPT_BOOL, false) -OPTION(rbd_blacklist_on_break_lock, OPT_BOOL, true) // whether to blacklist clients whose lock was broken -OPTION(rbd_blacklist_expire_seconds, OPT_INT, 0) // number of seconds to blacklist - set to 0 for OSD default -OPTION(rbd_request_timed_out_seconds, OPT_INT, 30) // number of seconds before a maintenance request times out -OPTION(rbd_skip_partial_discard, OPT_BOOL, false) // when trying to discard a range inside an object, set to true to skip zeroing the range.
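The rbd_* options above are client-side librbd settings, so they can also be overridden per process before connecting instead of in ceph.conf. A minimal sketch using the librados C API (it assumes a reachable cluster and the default config search path; the option names are the ones listed above):

    #include <rados/librados.h>   // librados C API; also compiles as C++

    int main() {
      rados_t cluster;
      if (rados_create(&cluster, "admin") < 0)      // act as client.admin
        return 1;
      rados_conf_read_file(cluster, nullptr);       // default ceph.conf search path
      // Per-process overrides of the client-side options listed above:
      rados_conf_set(cluster, "rbd_cache", "true");
      rados_conf_set(cluster, "rbd_cache_max_dirty", "0");  // 0 => write-through, per the comment above
      if (rados_connect(cluster) < 0) {
        rados_shutdown(cluster);
        return 1;
      }
      /* ... open an ioctx and use librbd here ... */
      rados_shutdown(cluster);
      return 0;
    }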
-OPTION(rbd_enable_alloc_hint, OPT_BOOL, true) // when writing a object, it will issue a hint to osd backend to indicate the expected size object need -OPTION(rbd_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled -OPTION(rbd_blkin_trace_all, OPT_BOOL, false) // create a blkin trace for all RBD requests -OPTION(rbd_validate_pool, OPT_BOOL, true) // true if empty pools should be validated for RBD compatibility -OPTION(rbd_validate_names, OPT_BOOL, true) // true if image specs should be validated -OPTION(rbd_auto_exclusive_lock_until_manual_request, OPT_BOOL, true) // whether to automatically acquire/release exclusive lock until it is explicitly requested, i.e. before we know the user of librbd is properly using the lock API -OPTION(rbd_mirroring_resync_after_disconnect, OPT_BOOL, false) // automatically start image resync after mirroring is disconnected due to being laggy -OPTION(rbd_mirroring_replay_delay, OPT_INT, 0) // time-delay in seconds for rbd-mirror asynchronous replication - -OPTION(rbd_default_pool, OPT_STR, "rbd") // default pool for storing images -OPTION_VALIDATOR(rbd_default_pool) - -/* - * The following options change the behavior for librbd's image creation methods that - * don't require all of the parameters. These are provided so that older programs - * can take advantage of newer features without being rewritten to use new versions - * of the image creation functions. - * - * rbd_create()/RBD::create() are affected by all of these options. - * - * rbd_create2()/RBD::create2() and rbd_clone()/RBD::clone() are affected by: - * - rbd_default_order - * - rbd_default_stripe_count - * - rbd_default_stripe_size - * - * rbd_create3()/RBD::create3() and rbd_clone2/RBD::clone2() are only - * affected by rbd_default_order. - */ -OPTION(rbd_default_format, OPT_INT, 2) -OPTION(rbd_default_order, OPT_INT, 22) -OPTION(rbd_default_stripe_count, OPT_U64, 0) // changing requires stripingv2 feature -OPTION(rbd_default_stripe_unit, OPT_U64, 0) // changing to non-object size requires stripingv2 feature -OPTION(rbd_default_data_pool, OPT_STR, "") // optional default pool for storing image data blocks -OPTION_VALIDATOR(rbd_default_data_pool) - -/** - * RBD features are only applicable for v2 images. This setting accepts either - * an integer bitmask value or comma-delimited string of RBD feature names. - * This setting is always internally stored as an integer bitmask value. The - * mapping between feature bitmask value and feature name is as follows: - * - * +1 -> layering - * +2 -> striping - * +4 -> exclusive-lock - * +8 -> object-map - * +16 -> fast-diff - * +32 -> deep-flatten - * +64 -> journaling - * +128 -> data-pool - */ -SAFE_OPTION(rbd_default_features, OPT_STR, "layering,exclusive-lock,object-map,fast-diff,deep-flatten") -OPTION_VALIDATOR(rbd_default_features) - -OPTION(rbd_default_map_options, OPT_STR, "") // default rbd map -o / --options - -/** - * RBD journal options. 
- */ -OPTION(rbd_journal_order, OPT_U32, 24) // bits to shift to compute journal object max size, between 12 and 64 -OPTION(rbd_journal_splay_width, OPT_U32, 4) // number of active journal objects -OPTION(rbd_journal_commit_age, OPT_DOUBLE, 5) // commit time interval, seconds -OPTION(rbd_journal_object_flush_interval, OPT_INT, 0) // maximum number of pending commits per journal object -OPTION(rbd_journal_object_flush_bytes, OPT_INT, 0) // maximum number of pending bytes per journal object -OPTION(rbd_journal_object_flush_age, OPT_DOUBLE, 0) // maximum age (in seconds) for pending commits -OPTION(rbd_journal_pool, OPT_STR, "") // pool for journal objects -OPTION(rbd_journal_max_payload_bytes, OPT_U32, 16384) // maximum journal payload size before splitting -OPTION(rbd_journal_max_concurrent_object_sets, OPT_INT, 0) // maximum number of object sets a journal client can be behind before it is automatically unregistered - -/** - * RBD Mirror options - */ -OPTION(rbd_mirror_journal_commit_age, OPT_DOUBLE, 5) // commit time interval, seconds -OPTION(rbd_mirror_journal_poll_age, OPT_DOUBLE, 5) // maximum age (in seconds) between successive journal polls -OPTION(rbd_mirror_journal_max_fetch_bytes, OPT_U32, 32768) // maximum bytes to read from each journal data object per fetch -OPTION(rbd_mirror_sync_point_update_age, OPT_DOUBLE, 30) // number of seconds between each update of the image sync point object number -OPTION(rbd_mirror_concurrent_image_syncs, OPT_U32, 5) // maximum number of image syncs in parallel -OPTION(rbd_mirror_pool_replayers_refresh_interval, OPT_INT, 30) // interval to refresh peers in rbd-mirror daemon -OPTION(rbd_mirror_delete_retry_interval, OPT_DOUBLE, 30) // interval to check and retry the failed requests in deleter -OPTION(rbd_mirror_image_state_check_interval, OPT_INT, 30) // interval to get images from pool watcher and set sources in replayer -OPTION(rbd_mirror_leader_heartbeat_interval, OPT_INT, 5) // interval (in seconds) between mirror leader heartbeats -OPTION(rbd_mirror_leader_max_missed_heartbeats, OPT_INT, 2) // number of missed heartbeats for non-lock owner to attempt to acquire lock -OPTION(rbd_mirror_leader_max_acquire_attempts_before_break, OPT_INT, 3) // number of failed attempts to acquire lock after missing heartbeats before breaking lock - -OPTION(nss_db_path, OPT_STR, "") // path to nss db - - -OPTION(rgw_max_chunk_size, OPT_INT, 4 * 1024 * 1024) -OPTION(rgw_put_obj_min_window_size, OPT_INT, 16 * 1024 * 1024) -OPTION(rgw_put_obj_max_window_size, OPT_INT, 64 * 1024 * 1024) -OPTION(rgw_max_put_size, OPT_U64, 5ULL*1024*1024*1024) -OPTION(rgw_max_put_param_size, OPT_U64, 1 * 1024 * 1024) // max input size for PUT requests accepting json/xml params - -/** - * override max bucket index shards in zone configuration (if not zero) - * - * Represents the number of shards for the bucket index object, a value of zero - * indicates there is no sharding. By default (no sharding, the name of the object - * is '.dir.{marker}', with sharding, the name is '.dir.{markder}.{sharding_id}', - * sharding_id is zero-based value. It is not recommended to set a too large value - * (e.g. thousand) as it increases the cost for bucket listing. - */ -OPTION(rgw_override_bucket_index_max_shards, OPT_U32, 0) - -/** - * Represents the maximum AIO pending requests for the bucket index object shards. 
- */ -OPTION(rgw_bucket_index_max_aio, OPT_U32, 8) - -/** - * whether or not the quota/gc threads should be started - */ -OPTION(rgw_enable_quota_threads, OPT_BOOL, true) -OPTION(rgw_enable_gc_threads, OPT_BOOL, true) -OPTION(rgw_enable_lc_threads, OPT_BOOL, true) - - -OPTION(rgw_data, OPT_STR, "/var/lib/ceph/radosgw/$cluster-$id") -OPTION(rgw_enable_apis, OPT_STR, "s3, s3website, swift, swift_auth, admin") -OPTION(rgw_cache_enabled, OPT_BOOL, true) // rgw cache enabled -OPTION(rgw_cache_lru_size, OPT_INT, 10000) // num of entries in rgw cache -OPTION(rgw_socket_path, OPT_STR, "") // path to unix domain socket, if not specified, rgw will not run as external fcgi -OPTION(rgw_host, OPT_STR, "") // host for radosgw, can be an IP, default is 0.0.0.0 -OPTION(rgw_port, OPT_STR, "") // port to listen, format as "8080" "5000", if not specified, rgw will not run external fcgi -OPTION(rgw_dns_name, OPT_STR, "") // hostname suffix on buckets -OPTION(rgw_dns_s3website_name, OPT_STR, "") // hostname suffix on buckets for s3-website endpoint -OPTION(rgw_content_length_compat, OPT_BOOL, false) // Check both HTTP_CONTENT_LENGTH and CONTENT_LENGTH in fcgi env -OPTION(rgw_lifecycle_work_time, OPT_STR, "00:00-06:00") //job process lc at 00:00-06:00s -OPTION(rgw_lc_lock_max_time, OPT_INT, 60) // total run time for a single lc processor work -OPTION(rgw_lc_max_objs, OPT_INT, 32) -OPTION(rgw_lc_debug_interval, OPT_INT, -1) // Debug run interval, in seconds -OPTION(rgw_script_uri, OPT_STR, "") // alternative value for SCRIPT_URI if not set in request -OPTION(rgw_request_uri, OPT_STR, "") // alternative value for REQUEST_URI if not set in request -OPTION(rgw_swift_url, OPT_STR, "") // the swift url, being published by the internal swift auth -OPTION(rgw_swift_url_prefix, OPT_STR, "swift") // entry point for which a url is considered a swift url -OPTION(rgw_swift_auth_url, OPT_STR, "") // default URL to go and verify tokens for v1 auth (if not using internal swift auth) -OPTION(rgw_swift_auth_entry, OPT_STR, "auth") // entry point for which a url is considered a swift auth url -OPTION(rgw_swift_tenant_name, OPT_STR, "") // tenant name to use for swift access -OPTION(rgw_swift_account_in_url, OPT_BOOL, false) // assume that URL always contain the account (aka tenant) part -OPTION(rgw_swift_enforce_content_length, OPT_BOOL, false) // enforce generation of Content-Length even in cost of performance or scalability -OPTION(rgw_keystone_url, OPT_STR, "") // url for keystone server -OPTION(rgw_keystone_admin_token, OPT_STR, "") // keystone admin token (shared secret) -OPTION(rgw_keystone_admin_user, OPT_STR, "") // keystone admin user name -OPTION(rgw_keystone_admin_password, OPT_STR, "") // keystone admin user password -OPTION(rgw_keystone_admin_tenant, OPT_STR, "") // keystone admin user tenant (for keystone v2.0) -OPTION(rgw_keystone_admin_project, OPT_STR, "") // keystone admin user project (for keystone v3) -OPTION(rgw_keystone_admin_domain, OPT_STR, "") // keystone admin user domain -OPTION(rgw_keystone_barbican_user, OPT_STR, "") // keystone user to access barbican secrets -OPTION(rgw_keystone_barbican_password, OPT_STR, "") // keystone password for barbican user -OPTION(rgw_keystone_barbican_tenant, OPT_STR, "") // keystone barbican user tenant (for keystone v2.0) -OPTION(rgw_keystone_barbican_project, OPT_STR, "") // keystone barbican user project (for keystone v3) -OPTION(rgw_keystone_barbican_domain, OPT_STR, "") // keystone barbican user domain -OPTION(rgw_keystone_api_version, OPT_INT, 2) // Version of 
Keystone API to use (2 or 3) -OPTION(rgw_keystone_accepted_roles, OPT_STR, "Member, admin") // roles required to serve requests -OPTION(rgw_keystone_accepted_admin_roles, OPT_STR, "") // list of roles allowing an user to gain admin privileges -OPTION(rgw_keystone_token_cache_size, OPT_INT, 10000) // max number of entries in keystone token cache -OPTION(rgw_keystone_revocation_interval, OPT_INT, 15 * 60) // seconds between tokens revocation check -OPTION(rgw_keystone_verify_ssl, OPT_BOOL, true) // should we try to verify keystone's ssl -OPTION(rgw_keystone_implicit_tenants, OPT_BOOL, false) // create new users in their own tenants of the same name -OPTION(rgw_cross_domain_policy, OPT_STR, "") -OPTION(rgw_healthcheck_disabling_path, OPT_STR, "") // path that existence causes the healthcheck to respond 503 -OPTION(rgw_s3_auth_use_rados, OPT_BOOL, true) // should we try to use the internal credentials for s3? -OPTION(rgw_s3_auth_use_keystone, OPT_BOOL, false) // should we try to use keystone for s3? -OPTION(rgw_s3_auth_aws4_force_boto2_compat, OPT_BOOL, true) // force aws4 auth boto2 compatibility -OPTION(rgw_barbican_url, OPT_STR, "") // url for barbican server - -/* OpenLDAP-style LDAP parameter strings */ -/* rgw_ldap_uri space-separated list of LDAP servers in URI format */ -OPTION(rgw_ldap_uri, OPT_STR, "ldaps://") -/* rgw_ldap_binddn LDAP entry RGW will bind with (user match) */ -OPTION(rgw_ldap_binddn, OPT_STR, "uid=admin,cn=users,dc=example,dc=com") -/* rgw_ldap_searchdn LDAP search base (basedn) */ -OPTION(rgw_ldap_searchdn, OPT_STR, "cn=users,cn=accounts,dc=example,dc=com") -/* rgw_ldap_dnattr LDAP attribute containing RGW user names (to form binddns)*/ -OPTION(rgw_ldap_dnattr, OPT_STR, "uid") -/* rgw_ldap_secret file containing credentials for rgw_ldap_binddn */ -OPTION(rgw_ldap_secret, OPT_STR, "/etc/openldap/secret") -/* rgw_s3_auth_use_ldap use LDAP for RGW auth? */ -OPTION(rgw_s3_auth_use_ldap, OPT_BOOL, false) -/* rgw_ldap_searchfilter LDAP search filter */ -OPTION(rgw_ldap_searchfilter, OPT_STR, "") - -OPTION(rgw_admin_entry, OPT_STR, "admin") // entry point for which a url is considered an admin request -OPTION(rgw_enforce_swift_acls, OPT_BOOL, true) -OPTION(rgw_swift_token_expiration, OPT_INT, 24 * 3600) // time in seconds for swift token expiration -OPTION(rgw_print_continue, OPT_BOOL, true) // enable if 100-Continue works -OPTION(rgw_print_prohibited_content_length, OPT_BOOL, false) // violate RFC 7230 and send Content-Length in 204 and 304 -OPTION(rgw_remote_addr_param, OPT_STR, "REMOTE_ADDR") // e.g. X-Forwarded-For, if you have a reverse proxy -OPTION(rgw_op_thread_timeout, OPT_INT, 10*60) -OPTION(rgw_op_thread_suicide_timeout, OPT_INT, 0) -OPTION(rgw_thread_pool_size, OPT_INT, 100) -OPTION(rgw_num_control_oids, OPT_INT, 8) -OPTION(rgw_num_rados_handles, OPT_U32, 1) -OPTION(rgw_verify_ssl, OPT_BOOL, true) // should http_client try to verify ssl when sent https request - -/* The following are tunables for caches of RGW NFS (and other file - * client) objects. - * - * The file handle cache is a partitioned hash table - * (fhcache_partitions), each with a closed hash part and backing - * b-tree mapping. The number of partions is expected to be a small - * prime, the cache size something larger but less than 5K, the total - * size of the cache is n_part * cache_size. 
- */ -OPTION(rgw_nfs_lru_lanes, OPT_INT, 5) -OPTION(rgw_nfs_lru_lane_hiwat, OPT_INT, 911) -OPTION(rgw_nfs_fhcache_partitions, OPT_INT, 3) -OPTION(rgw_nfs_fhcache_size, OPT_INT, 2017) /* 3*2017=6051 */ -OPTION(rgw_nfs_namespace_expire_secs, OPT_INT, 300) /* namespace invalidate - * timer */ -OPTION(rgw_nfs_max_gc, OPT_INT, 300) /* max gc events per cycle */ -OPTION(rgw_nfs_write_completion_interval_s, OPT_INT, 10) /* stateless (V3) - * commit - * delay */ - -OPTION(rgw_zone, OPT_STR, "") // zone name -OPTION(rgw_zone_root_pool, OPT_STR, ".rgw.root") // pool where zone specific info is stored -OPTION(rgw_default_zone_info_oid, OPT_STR, "default.zone") // oid where default zone info is stored -OPTION(rgw_region, OPT_STR, "") // region name -OPTION(rgw_region_root_pool, OPT_STR, ".rgw.root") // pool where all region info is stored -OPTION(rgw_default_region_info_oid, OPT_STR, "default.region") // oid where default region info is stored -OPTION(rgw_zonegroup, OPT_STR, "") // zone group name -OPTION(rgw_zonegroup_root_pool, OPT_STR, ".rgw.root") // pool where all zone group info is stored -OPTION(rgw_default_zonegroup_info_oid, OPT_STR, "default.zonegroup") // oid where default zone group info is stored -OPTION(rgw_realm, OPT_STR, "") // realm name -OPTION(rgw_realm_root_pool, OPT_STR, ".rgw.root") // pool where all realm info is stored -OPTION(rgw_default_realm_info_oid, OPT_STR, "default.realm") // oid where default realm info is stored -OPTION(rgw_period_root_pool, OPT_STR, ".rgw.root") // pool where all period info is stored -OPTION(rgw_period_latest_epoch_info_oid, OPT_STR, ".latest_epoch") // oid where current period info is stored -OPTION(rgw_log_nonexistent_bucket, OPT_BOOL, false) -OPTION(rgw_log_object_name, OPT_STR, "%Y-%m-%d-%H-%i-%n") // man date to see codes (a subset are supported) -OPTION(rgw_log_object_name_utc, OPT_BOOL, false) -OPTION(rgw_usage_max_shards, OPT_INT, 32) -OPTION(rgw_usage_max_user_shards, OPT_INT, 1) -OPTION(rgw_enable_ops_log, OPT_BOOL, false) // enable logging every rgw operation -OPTION(rgw_enable_usage_log, OPT_BOOL, false) // enable logging bandwidth usage -OPTION(rgw_ops_log_rados, OPT_BOOL, true) // whether ops log should go to rados -OPTION(rgw_ops_log_socket_path, OPT_STR, "") // path to unix domain socket where ops log can go -OPTION(rgw_ops_log_data_backlog, OPT_INT, 5 << 20) // max data backlog for ops log -OPTION(rgw_fcgi_socket_backlog, OPT_INT, 1024) // socket backlog for fcgi -OPTION(rgw_usage_log_flush_threshold, OPT_INT, 1024) // threshold to flush pending log data -OPTION(rgw_usage_log_tick_interval, OPT_INT, 30) // flush pending log data every X seconds -OPTION(rgw_intent_log_object_name, OPT_STR, "%Y-%m-%d-%i-%n") // man date to see codes (a subset are supported) -OPTION(rgw_intent_log_object_name_utc, OPT_BOOL, false) -OPTION(rgw_init_timeout, OPT_INT, 300) // time in seconds -OPTION(rgw_mime_types_file, OPT_STR, "/etc/mime.types") -OPTION(rgw_gc_max_objs, OPT_INT, 32) -OPTION(rgw_gc_obj_min_wait, OPT_INT, 2 * 3600) // wait time before object may be handled by gc -OPTION(rgw_gc_processor_max_time, OPT_INT, 3600) // total run time for a single gc processor work -OPTION(rgw_gc_processor_period, OPT_INT, 3600) // gc processor cycle time -OPTION(rgw_s3_success_create_obj_status, OPT_INT, 0) // alternative success status response for create-obj (0 - default) -OPTION(rgw_resolve_cname, OPT_BOOL, false) // should rgw try to resolve hostname as a dns cname record -OPTION(rgw_obj_stripe_size, OPT_INT, 4 << 20) -OPTION(rgw_extended_http_attrs, 
OPT_STR, "") // list of extended attrs that can be set on objects (beyond the default) -OPTION(rgw_exit_timeout_secs, OPT_INT, 120) // how many seconds to wait for process to go down before exiting unconditionally -OPTION(rgw_get_obj_window_size, OPT_INT, 16 << 20) // window size in bytes for single get obj request -OPTION(rgw_get_obj_max_req_size, OPT_INT, 4 << 20) // max length of a single get obj rados op -OPTION(rgw_relaxed_s3_bucket_names, OPT_BOOL, false) // enable relaxed bucket name rules for US region buckets -OPTION(rgw_defer_to_bucket_acls, OPT_STR, "") // if the user has bucket perms, use those before key perms (recurse and full_control) -OPTION(rgw_list_buckets_max_chunk, OPT_INT, 1000) // max buckets to retrieve in a single op when listing user buckets -OPTION(rgw_md_log_max_shards, OPT_INT, 64) // max shards for metadata log -OPTION(rgw_num_zone_opstate_shards, OPT_INT, 128) // max shards for keeping inter-region copy progress info -OPTION(rgw_opstate_ratelimit_sec, OPT_INT, 30) // min time between opstate updates on a single upload (0 for disabling ratelimit) -OPTION(rgw_curl_wait_timeout_ms, OPT_INT, 1000) // timeout for certain curl calls -OPTION(rgw_copy_obj_progress, OPT_BOOL, true) // should dump progress during long copy operations? -OPTION(rgw_copy_obj_progress_every_bytes, OPT_INT, 1024 * 1024) // min bytes between copy progress output -OPTION(rgw_obj_tombstone_cache_size, OPT_INT, 1000) // how many objects in tombstone cache, which is used in multi-zone sync to keep - // track of removed objects' mtime - -OPTION(rgw_data_log_window, OPT_INT, 30) // data log entries window (in seconds) -OPTION(rgw_data_log_changes_size, OPT_INT, 1000) // number of in-memory entries to hold for data changes log -OPTION(rgw_data_log_num_shards, OPT_INT, 128) // number of objects to keep data changes log on -OPTION(rgw_data_log_obj_prefix, OPT_STR, "data_log") // -OPTION(rgw_replica_log_obj_prefix, OPT_STR, "replica_log") // - -OPTION(rgw_bucket_quota_ttl, OPT_INT, 600) // time for cached bucket stats to be cached within rgw instance -OPTION(rgw_bucket_quota_soft_threshold, OPT_DOUBLE, 0.95) // threshold from which we don't rely on cached info for quota decisions -OPTION(rgw_bucket_quota_cache_size, OPT_INT, 10000) // number of entries in bucket quota cache -OPTION(rgw_bucket_default_quota_max_objects, OPT_INT, -1) // number of objects allowed -OPTION(rgw_bucket_default_quota_max_size, OPT_LONGLONG, -1) // Max size of object in bytes - -OPTION(rgw_expose_bucket, OPT_BOOL, false) // Return the bucket name in the 'Bucket' response header - -OPTION(rgw_frontends, OPT_STR, "civetweb port=7480") // rgw front ends - -OPTION(rgw_user_quota_bucket_sync_interval, OPT_INT, 180) // time period for accumulating modified buckets before syncing stats -OPTION(rgw_user_quota_sync_interval, OPT_INT, 3600 * 24) // time period for accumulating modified buckets before syncing entire user stats -OPTION(rgw_user_quota_sync_idle_users, OPT_BOOL, false) // whether stats for idle users be fully synced -OPTION(rgw_user_quota_sync_wait_time, OPT_INT, 3600 * 24) // min time between two full stats sync for non-idle users -OPTION(rgw_user_default_quota_max_objects, OPT_INT, -1) // number of objects allowed -OPTION(rgw_user_default_quota_max_size, OPT_LONGLONG, -1) // Max size of object in bytes - -OPTION(rgw_multipart_min_part_size, OPT_INT, 5 * 1024 * 1024) // min size for each part (except for last one) in multipart upload -OPTION(rgw_multipart_part_upload_limit, OPT_INT, 10000) // parts limit in multipart upload 
- -OPTION(rgw_max_slo_entries, OPT_INT, 1000) // default number of max entries in slo - -OPTION(rgw_olh_pending_timeout_sec, OPT_INT, 3600) // time until we retire a pending olh change -OPTION(rgw_user_max_buckets, OPT_INT, 1000) // global option to set max buckets count for all user - -OPTION(rgw_objexp_gc_interval, OPT_U32, 60 * 10) // maximum time between round of expired objects garbage collecting -OPTION(rgw_objexp_time_step, OPT_U32, 4096) // number of seconds for rounding the timestamps -OPTION(rgw_objexp_hints_num_shards, OPT_U32, 127) // maximum number of parts in which the hint index is stored in -OPTION(rgw_objexp_chunk_size, OPT_U32, 100) // maximum number of entries in a single operation when processing objexp data - -OPTION(rgw_enable_static_website, OPT_BOOL, false) // enable static website feature -OPTION(rgw_log_http_headers, OPT_STR, "" ) // list of HTTP headers to log when seen, ignores case (e.g., http_x_forwarded_for - -OPTION(rgw_num_async_rados_threads, OPT_INT, 32) // num of threads to use for async rados operations -OPTION(rgw_md_notify_interval_msec, OPT_INT, 200) // metadata changes notification interval to followers -OPTION(rgw_run_sync_thread, OPT_BOOL, true) // whether radosgw (not radosgw-admin) spawns the sync thread -OPTION(rgw_sync_lease_period, OPT_INT, 120) // time in second for lease that rgw takes on a specific log (or log shard) -OPTION(rgw_sync_log_trim_interval, OPT_INT, 1200) // time in seconds between attempts to trim sync logs - -OPTION(rgw_sync_data_inject_err_probability, OPT_DOUBLE, 0) // range [0, 1] -OPTION(rgw_sync_meta_inject_err_probability, OPT_DOUBLE, 0) // range [0, 1] - - -OPTION(rgw_period_push_interval, OPT_DOUBLE, 2) // seconds to wait before retrying "period push" -OPTION(rgw_period_push_interval_max, OPT_DOUBLE, 30) // maximum interval after exponential backoff - -OPTION(rgw_safe_max_objects_per_shard, OPT_INT, 100*1024) // safe max loading -OPTION(rgw_shard_warning_threshold, OPT_DOUBLE, 90) // pct of safe max - // at which to warn - -OPTION(rgw_swift_versioning_enabled, OPT_BOOL, false) // whether swift object versioning feature is enabled - -OPTION(mgr_module_path, OPT_STR, CEPH_PKGLIBDIR "/mgr") // where to load python modules from -OPTION(mgr_initial_modules, OPT_STR, "restful status") // Which modules to load -OPTION(mgr_data, OPT_STR, "/var/lib/ceph/mgr/$cluster-$id") // where to find keyring etc -OPTION(mgr_tick_period, OPT_INT, 2) // How frequently to tick -OPTION(mgr_stats_period, OPT_INT, 5) // How frequently clients send stats -OPTION(mgr_client_bytes, OPT_U64, 128*1048576) // bytes from clients -OPTION(mgr_client_messages, OPT_U64, 512) // messages from clients -OPTION(mgr_osd_bytes, OPT_U64, 512*1048576) // bytes from osds -OPTION(mgr_osd_messages, OPT_U64, 8192) // messages from osds -OPTION(mgr_mds_bytes, OPT_U64, 128*1048576) // bytes from mdss -OPTION(mgr_mds_messages, OPT_U64, 128) // messages from mdss -OPTION(mgr_mon_bytes, OPT_U64, 128*1048576) // bytes from mons -OPTION(mgr_mon_messages, OPT_U64, 128) // messages from mons - -OPTION(mgr_connect_retry_interval, OPT_DOUBLE, 1.0) -OPTION(mgr_service_beacon_grace, OPT_DOUBLE, 60.0) - -OPTION(mon_mgr_digest_period, OPT_INT, 5) // How frequently to send digests -OPTION(mon_mgr_beacon_grace, OPT_INT, 30) // How long to wait to failover -OPTION(mon_mgr_inactive_grace, OPT_INT, 60) // How long before health WARN -> ERR -OPTION(mon_mgr_mkfs_grace, OPT_INT, 60) // How long before we complain about MGR_DOWN -OPTION(rgw_crypt_require_ssl, OPT_BOOL, true) // requests 
including encryption key headers must be sent over ssl -OPTION(rgw_crypt_default_encryption_key, OPT_STR, "") // base64 encoded key for encryption of rgw objects -OPTION(rgw_crypt_s3_kms_encryption_keys, OPT_STR, "") // extra keys that may be used for aws:kms - // defined as map "key1=YmluCmJvb3N0CmJvb3N0LQ== key2=b3V0CnNyYwpUZXN0aW5nCg==" -OPTION(rgw_crypt_suppress_logs, OPT_BOOL, true) // suppress logs that might print customer key -OPTION(rgw_list_bucket_min_readahead, OPT_INT, 1000) // minimum number of entries to read from rados for bucket listing - -OPTION(rgw_rest_getusage_op_compat, OPT_BOOL, false) // dump description of total stats for s3 GetUsage API - -OPTION(mutex_perf_counter, OPT_BOOL, false) // enable/disable mutex perf counter -OPTION(throttler_perf_counter, OPT_BOOL, true) // enable/disable throttler perf counter - -/* The following are tunables for torrent data */ -OPTION(rgw_torrent_flag, OPT_BOOL, false) // produce torrent function flag -OPTION(rgw_torrent_tracker, OPT_STR, "") // torrent field annouce and annouce list -OPTION(rgw_torrent_createby, OPT_STR, "") // torrent field created by -OPTION(rgw_torrent_comment, OPT_STR, "") // torrent field comment -OPTION(rgw_torrent_encoding, OPT_STR, "") // torrent field encoding -OPTION(rgw_torrent_origin, OPT_STR, "") // torrent origin -OPTION(rgw_torrent_sha_unit, OPT_INT, 512*1024) // torrent field piece length 512K - -OPTION(event_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled - -// This will be set to true when it is safe to start threads. -// Once it is true, it will never change. -OPTION(internal_safe_to_start_threads, OPT_BOOL, false) - -OPTION(debug_deliberately_leak_memory, OPT_BOOL, false) - -OPTION(rgw_swift_custom_header, OPT_STR, "") // option to enable swift custom headers - -OPTION(rgw_swift_need_stats, OPT_BOOL, true) // option to enable stats on bucket listing for swift - -/* resharding tunables */ -OPTION(rgw_reshard_num_logs, OPT_INT, 16) -OPTION(rgw_reshard_bucket_lock_duration, OPT_INT, 120) // duration of lock on bucket obj during resharding -OPTION(rgw_dynamic_resharding, OPT_BOOL, true) -OPTION(rgw_max_objs_per_shard, OPT_INT, 100000) -OPTION(rgw_reshard_thread_interval, OPT_U32, 60 * 10) // maximum time between rounds of reshard thread processing diff -Nru ceph-12.1.1/src/common/config_validators.cc ceph-12.1.2/src/common/config_validators.cc --- ceph-12.1.1/src/common/config_validators.cc 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/common/config_validators.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,88 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "common/config_validators.h" -#include "include/stringify.h" - -#include -#include -#include - -int validate(md_config_t::option_rbd_default_pool_t *, - std::string *value, std::string *error_message) { - boost::regex pattern("^[^@/]+$"); - if (!boost::regex_match (*value, pattern)) { - *value = "rbd"; - *error_message = "invalid RBD default pool, resetting to 'rbd'"; - } - return 0; -} - -int validate(md_config_t::option_rbd_default_data_pool_t *, - std::string *value, std::string *error_message) { - boost::regex pattern("^[^@/]*$"); - if (!boost::regex_match (*value, pattern)) { - *value = ""; - *error_message = "ignoring invalid RBD data pool"; - } - return 0; -} - -int validate(md_config_t::option_rbd_default_features_t *, - std::string *value, std::string *error_message) { - static const std::map FEATURE_MAP = { - 
{RBD_FEATURE_NAME_LAYERING, RBD_FEATURE_LAYERING}, - {RBD_FEATURE_NAME_STRIPINGV2, RBD_FEATURE_STRIPINGV2}, - {RBD_FEATURE_NAME_EXCLUSIVE_LOCK, RBD_FEATURE_EXCLUSIVE_LOCK}, - {RBD_FEATURE_NAME_OBJECT_MAP, RBD_FEATURE_OBJECT_MAP}, - {RBD_FEATURE_NAME_FAST_DIFF, RBD_FEATURE_FAST_DIFF}, - {RBD_FEATURE_NAME_DEEP_FLATTEN, RBD_FEATURE_DEEP_FLATTEN}, - {RBD_FEATURE_NAME_JOURNALING, RBD_FEATURE_JOURNALING}, - {RBD_FEATURE_NAME_DATA_POOL, RBD_FEATURE_DATA_POOL}, - }; - static_assert((RBD_FEATURE_DATA_POOL << 1) > RBD_FEATURES_ALL, - "new RBD feature added"); - - // convert user-friendly comma delimited feature name list to a bitmask - // that is used by the librbd API - uint64_t features = 0; - error_message->clear(); - - try { - features = boost::lexical_cast(*value); - - uint64_t unsupported_features = (features & ~RBD_FEATURES_ALL); - if (unsupported_features != 0ull) { - features &= RBD_FEATURES_ALL; - - std::stringstream ss; - ss << "ignoring unknown feature mask 0x" - << std::hex << unsupported_features; - *error_message = ss.str(); - } - } catch (const boost::bad_lexical_cast& ) { - int r = 0; - std::vector feature_names; - boost::split(feature_names, *value, boost::is_any_of(",")); - for (auto feature_name: feature_names) { - boost::trim(feature_name); - auto feature_it = FEATURE_MAP.find(feature_name); - if (feature_it != FEATURE_MAP.end()) { - features += feature_it->second; - } else { - if (!error_message->empty()) { - *error_message += ", "; - } - *error_message += "ignoring unknown feature " + feature_name; - r = -EINVAL; - } - } - - if (features == 0 && r == -EINVAL) { - features = RBD_FEATURES_DEFAULT; - } - } - *value = stringify(features); - return 0; -} - diff -Nru ceph-12.1.1/src/common/config_validators.h ceph-12.1.2/src/common/config_validators.h --- ceph-12.1.1/src/common/config_validators.h 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/common/config_validators.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,21 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_CONFIG_VALIDATORS -#define CEPH_CONFIG_VALIDATORS - -#include "config.h" -#include - -/** - * Global config value validators for the Ceph project - */ - -int validate(md_config_t::option_rbd_default_pool_t *type, - std::string *value, std::string *error_message); -int validate(md_config_t::option_rbd_default_data_pool_t *type, - std::string *value, std::string *error_message); -int validate(md_config_t::option_rbd_default_features_t *type, - std::string *value, std::string *error_message); - -#endif // CEPH_CONFIG_VALIDATORS diff -Nru ceph-12.1.1/src/common/dout.h ceph-12.1.2/src/common/dout.h --- ceph-12.1.1/src/common/dout.h 2017-07-17 16:56:02.000000000 +0000 +++ ceph-12.1.2/src/common/dout.h 2017-08-01 17:55:40.000000000 +0000 @@ -16,6 +16,8 @@ #ifndef CEPH_DOUT_H #define CEPH_DOUT_H +#include + #include "global/global_context.h" #include "common/config.h" #include "common/likely.h" @@ -50,10 +52,13 @@ if (0) { \ char __array[((v >= -1) && (v <= 200)) ? 
0 : -1] __attribute__((unused)); \ } \ - static size_t _log_exp_length=80; \ + static size_t _log_exp_length = 80; \ ceph::logging::Entry *_dout_e = cct->_log->create_entry(v, sub, &_log_exp_length); \ ostream _dout_os(&_dout_e->m_streambuf); \ - CephContext *_dout_cct = cct; \ + static_assert(std::is_convertible::value, \ + "provided cct must be compatible with CephContext*"); \ + auto _dout_cct = cct; \ std::ostream* _dout = &_dout_os; #define lsubdout(cct, sub, v) dout_impl(cct, ceph_subsys_##sub, v) dout_prefix diff -Nru ceph-12.1.1/src/common/legacy_config_opts.h ceph-12.1.2/src/common/legacy_config_opts.h --- ceph-12.1.1/src/common/legacy_config_opts.h 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.1.2/src/common/legacy_config_opts.h 2017-08-01 17:55:40.000000000 +0000 @@ -0,0 +1,1713 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* note: no header guard */ +OPTION(host, OPT_STR) // "" means that ceph will use short hostname +OPTION(fsid, OPT_UUID) +OPTION(public_addr, OPT_ADDR) +OPTION(public_bind_addr, OPT_ADDR) +OPTION(cluster_addr, OPT_ADDR) +OPTION(public_network, OPT_STR) +OPTION(cluster_network, OPT_STR) +OPTION(monmap, OPT_STR) +OPTION(mon_host, OPT_STR) +OPTION(mon_dns_srv_name, OPT_STR) +OPTION(lockdep, OPT_BOOL) +OPTION(lockdep_force_backtrace, OPT_BOOL) // always gather current backtrace at every lock +OPTION(run_dir, OPT_STR) // the "/var/run/ceph" dir, created on daemon startup +OPTION(admin_socket, OPT_STR) // default changed by common_preinit() +OPTION(admin_socket_mode, OPT_STR) // permission bits to set for admin socket file, e.g., "0775", "0755" + +OPTION(daemonize, OPT_BOOL) // default changed by common_preinit() +OPTION(setuser, OPT_STR) // uid or user name +OPTION(setgroup, OPT_STR) // gid or group name +OPTION(setuser_match_path, OPT_STR) // make setuser/group conditional on this path matching ownership +OPTION(pid_file, OPT_STR) // default changed by common_preinit() +OPTION(chdir, OPT_STR) +OPTION(restapi_log_level, OPT_STR) // default set by Python code +OPTION(restapi_base_url, OPT_STR) // " +OPTION(fatal_signal_handlers, OPT_BOOL) +SAFE_OPTION(erasure_code_dir, OPT_STR) // default location for erasure-code plugins + +OPTION(log_file, OPT_STR) // default changed by common_preinit() +OPTION(log_max_new, OPT_INT) // default changed by common_preinit() +OPTION(log_max_recent, OPT_INT) // default changed by common_preinit() +OPTION(log_to_stderr, OPT_BOOL) // default changed by common_preinit() +OPTION(err_to_stderr, OPT_BOOL) // default changed by common_preinit() +OPTION(log_to_syslog, OPT_BOOL) +OPTION(err_to_syslog, OPT_BOOL) +OPTION(log_flush_on_exit, OPT_BOOL) // default changed by common_preinit() +OPTION(log_stop_at_utilization, OPT_FLOAT) // stop logging at (near) full +OPTION(log_to_graylog, OPT_BOOL) +OPTION(err_to_graylog, OPT_BOOL) +OPTION(log_graylog_host, OPT_STR) +OPTION(log_graylog_port, OPT_INT) + +// options will take k/v pairs, or single-item that will be assumed as general +// default for all, regardless of channel. 
+// e.g., "info" would be taken as the same as "default=info" +// also, "default=daemon audit=local0" would mean +// "default all to 'daemon', override 'audit' with 'local0' +OPTION(clog_to_monitors, OPT_STR) +OPTION(clog_to_syslog, OPT_STR) +OPTION(clog_to_syslog_level, OPT_STR) // this level and above +OPTION(clog_to_syslog_facility, OPT_STR) +OPTION(clog_to_graylog, OPT_STR) +OPTION(clog_to_graylog_host, OPT_STR) +OPTION(clog_to_graylog_port, OPT_STR) + +OPTION(mon_cluster_log_to_syslog, OPT_STR) +OPTION(mon_cluster_log_to_syslog_level, OPT_STR) // this level and above +OPTION(mon_cluster_log_to_syslog_facility, OPT_STR) +OPTION(mon_cluster_log_file, OPT_STR) +OPTION(mon_cluster_log_file_level, OPT_STR) +OPTION(mon_cluster_log_to_graylog, OPT_STR) +OPTION(mon_cluster_log_to_graylog_host, OPT_STR) +OPTION(mon_cluster_log_to_graylog_port, OPT_STR) + +OPTION(enable_experimental_unrecoverable_data_corrupting_features, OPT_STR) + +SAFE_OPTION(plugin_dir, OPT_STR) + +OPTION(xio_trace_mempool, OPT_BOOL) // mempool allocation counters +OPTION(xio_trace_msgcnt, OPT_BOOL) // incoming/outgoing msg counters +OPTION(xio_trace_xcon, OPT_BOOL) // Xio message encode/decode trace +OPTION(xio_queue_depth, OPT_INT) // depth of Accelio msg queue +OPTION(xio_mp_min, OPT_INT) // default min mempool size +OPTION(xio_mp_max_64, OPT_INT) // max 64-byte chunks (buffer is 40) +OPTION(xio_mp_max_256, OPT_INT) // max 256-byte chunks +OPTION(xio_mp_max_1k, OPT_INT) // max 1K chunks +OPTION(xio_mp_max_page, OPT_INT) // max 1K chunks +OPTION(xio_mp_max_hint, OPT_INT) // max size-hint chunks +OPTION(xio_portal_threads, OPT_INT) // xio portal threads per messenger +OPTION(xio_max_conns_per_portal, OPT_INT) // max xio_connections per portal/ctx +OPTION(xio_transport_type, OPT_STR) // xio transport type: {rdma or tcp} +OPTION(xio_max_send_inline, OPT_INT) // xio maximum threshold to send inline + +OPTION(compressor_zlib_isal, OPT_BOOL) +OPTION(compressor_zlib_level, OPT_INT) //regular zlib compression level, not applicable to isa-l optimized version + +OPTION(async_compressor_enabled, OPT_BOOL) +OPTION(async_compressor_type, OPT_STR) +OPTION(async_compressor_threads, OPT_INT) +OPTION(async_compressor_thread_timeout, OPT_INT) +OPTION(async_compressor_thread_suicide_timeout, OPT_INT) + +OPTION(plugin_crypto_accelerator, OPT_STR) + +OPTION(mempool_debug, OPT_BOOL) + + + +OPTION(key, OPT_STR) +OPTION(keyfile, OPT_STR) +OPTION(keyring, OPT_STR) +OPTION(heartbeat_interval, OPT_INT) +OPTION(heartbeat_file, OPT_STR) +OPTION(heartbeat_inject_failure, OPT_INT) // force an unhealthy heartbeat for N seconds +OPTION(perf, OPT_BOOL) // enable internal perf counters + +SAFE_OPTION(ms_type, OPT_STR) // messenger backend. 
It will be modified at runtime, so use SAFE_OPTION +OPTION(ms_public_type, OPT_STR) // messenger backend +OPTION(ms_cluster_type, OPT_STR) // messenger backend +OPTION(ms_tcp_nodelay, OPT_BOOL) +OPTION(ms_tcp_rcvbuf, OPT_INT) +OPTION(ms_tcp_prefetch_max_size, OPT_INT) // max prefetch size, we limit this to avoid extra memcpy +OPTION(ms_initial_backoff, OPT_DOUBLE) +OPTION(ms_max_backoff, OPT_DOUBLE) +OPTION(ms_crc_data, OPT_BOOL) +OPTION(ms_crc_header, OPT_BOOL) +OPTION(ms_die_on_bad_msg, OPT_BOOL) +OPTION(ms_die_on_unhandled_msg, OPT_BOOL) +OPTION(ms_die_on_old_message, OPT_BOOL) // assert if we get a dup incoming message and shouldn't have (may be triggered by pre-541cd3c64be0dfa04e8a2df39422e0eb9541a428 code) +OPTION(ms_die_on_skipped_message, OPT_BOOL) // assert if we skip a seq (kernel client does this intentionally) +OPTION(ms_dispatch_throttle_bytes, OPT_U64) +OPTION(ms_bind_ipv6, OPT_BOOL) +OPTION(ms_bind_port_min, OPT_INT) +OPTION(ms_bind_port_max, OPT_INT) +OPTION(ms_bind_retry_count, OPT_INT) // If binding fails, how many times do we retry to bind +OPTION(ms_bind_retry_delay, OPT_INT) // Delay between attempts to bind +OPTION(ms_bind_before_connect, OPT_BOOL) +OPTION(ms_tcp_listen_backlog, OPT_INT) +OPTION(ms_rwthread_stack_bytes, OPT_U64) +OPTION(ms_tcp_read_timeout, OPT_U64) +OPTION(ms_pq_max_tokens_per_priority, OPT_U64) +OPTION(ms_pq_min_cost, OPT_U64) +OPTION(ms_inject_socket_failures, OPT_U64) +SAFE_OPTION(ms_inject_delay_type, OPT_STR) // "osd mds mon client" allowed +OPTION(ms_inject_delay_msg_type, OPT_STR) // the type of message to delay. This is an additional restriction on the general type filter ms_inject_delay_type. +OPTION(ms_inject_delay_max, OPT_DOUBLE) // seconds +OPTION(ms_inject_delay_probability, OPT_DOUBLE) // range [0, 1] +OPTION(ms_inject_internal_delays, OPT_DOUBLE) // seconds +OPTION(ms_dump_on_send, OPT_BOOL) // hexdump msg to log on send +OPTION(ms_dump_corrupt_message_level, OPT_INT) // debug level to hexdump undecodeable messages at +OPTION(ms_async_op_threads, OPT_U64) // number of worker processing threads for async messenger created on init +OPTION(ms_async_max_op_threads, OPT_U64) // max number of worker processing threads for async messenger +OPTION(ms_async_set_affinity, OPT_BOOL) +// example: ms_async_affinity_cores = 0,1 +// The number of core sets is expected to equal ms_async_op_threads; otherwise +// extra op threads will loop over ms_async_affinity_cores again.
+// If ms_async_affinity_cores is empty, all threads will be bind to current running +// core +OPTION(ms_async_affinity_cores, OPT_STR) +OPTION(ms_async_rdma_device_name, OPT_STR) +OPTION(ms_async_rdma_enable_hugepage, OPT_BOOL) +OPTION(ms_async_rdma_buffer_size, OPT_INT) +OPTION(ms_async_rdma_send_buffers, OPT_U32) +OPTION(ms_async_rdma_receive_buffers, OPT_U32) +OPTION(ms_async_rdma_port_num, OPT_U32) +OPTION(ms_async_rdma_polling_us, OPT_U32) +OPTION(ms_async_rdma_local_gid, OPT_STR) // GID format: "fe80:0000:0000:0000:7efe:90ff:fe72:6efe", no zero folding +OPTION(ms_async_rdma_roce_ver, OPT_INT) // 0=RoCEv1, 1=RoCEv2, 2=RoCEv1.5 +OPTION(ms_async_rdma_sl, OPT_INT) // in RoCE, this means PCP +OPTION(ms_async_rdma_dscp, OPT_INT) // in RoCE, this means DSCP + +OPTION(ms_dpdk_port_id, OPT_INT) +SAFE_OPTION(ms_dpdk_coremask, OPT_STR) // it is modified in unittest so that use SAFE_OPTION to declare +OPTION(ms_dpdk_memory_channel, OPT_STR) +OPTION(ms_dpdk_hugepages, OPT_STR) +OPTION(ms_dpdk_pmd, OPT_STR) +SAFE_OPTION(ms_dpdk_host_ipv4_addr, OPT_STR) +SAFE_OPTION(ms_dpdk_gateway_ipv4_addr, OPT_STR) +SAFE_OPTION(ms_dpdk_netmask_ipv4_addr, OPT_STR) +OPTION(ms_dpdk_lro, OPT_BOOL) +OPTION(ms_dpdk_hw_flow_control, OPT_BOOL) +// Weighing of a hardware network queue relative to a software queue (0=no work, 1= equal share)") +OPTION(ms_dpdk_hw_queue_weight, OPT_FLOAT) +OPTION(ms_dpdk_debug_allow_loopback, OPT_BOOL) +OPTION(ms_dpdk_rx_buffer_count_per_core, OPT_INT) + +OPTION(inject_early_sigterm, OPT_BOOL) + +OPTION(mon_data, OPT_STR) +OPTION(mon_initial_members, OPT_STR) // list of initial cluster mon ids; if specified, need majority to form initial quorum and create new cluster +OPTION(mon_compact_on_start, OPT_BOOL) // compact leveldb on ceph-mon start +OPTION(mon_compact_on_bootstrap, OPT_BOOL) // trigger leveldb compaction on bootstrap +OPTION(mon_compact_on_trim, OPT_BOOL) // compact (a prefix) when we trim old states +OPTION(mon_osd_cache_size, OPT_INT) // the size of osdmaps cache, not to rely on underlying store's cache + +OPTION(mon_cpu_threads, OPT_INT) +OPTION(mon_osd_mapping_pgs_per_chunk, OPT_INT) +OPTION(mon_osd_max_creating_pgs, OPT_INT) +OPTION(mon_tick_interval, OPT_INT) +OPTION(mon_session_timeout, OPT_INT) // must send keepalive or subscribe +OPTION(mon_subscribe_interval, OPT_DOUBLE) // for legacy clients only +OPTION(mon_delta_reset_interval, OPT_DOUBLE) // seconds of inactivity before we reset the pg delta to 0 +OPTION(mon_osd_laggy_halflife, OPT_INT) // (seconds) how quickly our laggy estimations decay +OPTION(mon_osd_laggy_weight, OPT_DOUBLE) // weight for new 'samples's in laggy estimations +OPTION(mon_osd_laggy_max_interval, OPT_INT) // maximum value of laggy_interval in laggy estimations +OPTION(mon_osd_adjust_heartbeat_grace, OPT_BOOL) // true if we should scale based on laggy estimations +OPTION(mon_osd_adjust_down_out_interval, OPT_BOOL) // true if we should scale based on laggy estimations +OPTION(mon_osd_auto_mark_in, OPT_BOOL) // mark any booting osds 'in' +OPTION(mon_osd_auto_mark_auto_out_in, OPT_BOOL) // mark booting auto-marked-out osds 'in' +OPTION(mon_osd_auto_mark_new_in, OPT_BOOL) // mark booting new osds 'in' +OPTION(mon_osd_destroyed_out_interval, OPT_INT) // seconds +OPTION(mon_osd_down_out_interval, OPT_INT) // seconds +OPTION(mon_osd_down_out_subtree_limit, OPT_STR) // smallest crush unit/type that we will not automatically mark out +OPTION(mon_osd_min_up_ratio, OPT_DOUBLE) // min osds required to be up to mark things down +OPTION(mon_osd_min_in_ratio, 
OPT_DOUBLE) // min osds required to be in to mark things out +OPTION(mon_osd_warn_op_age, OPT_DOUBLE) // max op age before we generate a warning (make it a power of 2) +OPTION(mon_osd_err_op_age_ratio, OPT_DOUBLE) // when to generate an error, as multiple of mon_osd_warn_op_age +OPTION(mon_osd_max_split_count, OPT_INT) // largest number of PGs per "involved" OSD to let split create +OPTION(mon_osd_allow_primary_temp, OPT_BOOL) // allow primary_temp to be set in the osdmap +OPTION(mon_osd_allow_primary_affinity, OPT_BOOL) // allow primary_affinity to be set in the osdmap +OPTION(mon_osd_prime_pg_temp, OPT_BOOL) // prime osdmap with pg mapping changes +OPTION(mon_osd_prime_pg_temp_max_time, OPT_FLOAT) // max time to spend priming +OPTION(mon_osd_prime_pg_temp_max_estimate, OPT_FLOAT) // max estimate of pg total before we do all pgs in parallel +OPTION(mon_osd_pool_ec_fast_read, OPT_BOOL) // whether turn on fast read on the pool or not +OPTION(mon_stat_smooth_intervals, OPT_INT) // smooth stats over last N PGMap maps +OPTION(mon_election_timeout, OPT_FLOAT) // on election proposer, max waiting time for all ACKs +OPTION(mon_lease, OPT_FLOAT) // lease interval +OPTION(mon_lease_renew_interval_factor, OPT_FLOAT) // on leader, to renew the lease +OPTION(mon_lease_ack_timeout_factor, OPT_FLOAT) // on leader, if lease isn't acked by all peons +OPTION(mon_accept_timeout_factor, OPT_FLOAT) // on leader, if paxos update isn't accepted + +OPTION(mon_clock_drift_allowed, OPT_FLOAT) // allowed clock drift between monitors +OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT) // exponential backoff for clock drift warnings +OPTION(mon_timecheck_interval, OPT_FLOAT) // on leader, timecheck (clock drift check) interval (seconds) +OPTION(mon_timecheck_skew_interval, OPT_FLOAT) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds) +OPTION(mon_pg_stuck_threshold, OPT_INT) // number of seconds after which pgs can be considered stuck inactive, unclean, etc (see doc/control.rst under dump_stuck for more info) +OPTION(mon_pg_min_inactive, OPT_U64) // the number of PGs which have to be inactive longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled, never go into ERR. 
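The OPTION()/SAFE_OPTION() entries in this new header form an X-macro list (hence the "note: no header guard" comment at the top of the file): whatever includes it supplies its own definition of the macros. The following minimal, self-contained sketch shows the general pattern only; the struct, the inlined entries and the printing loop are illustrative assumptions, not Ceph's actual option-registration code, which would instead #include "common/legacy_config_opts.h" between the macro definitions.

    #include <iostream>
    #include <string>
    #include <vector>

    // Illustrative consumer of an X-macro option list (not Ceph's code).
    struct LegacyOpt {
      std::string name;
      std::string type;
    };

    static const std::vector<LegacyOpt> legacy_opts = {
    // Define the macros to expand each entry into a table row...
    #define OPTION(name, type)      { #name, #type },
    #define SAFE_OPTION(name, type) { #name, #type },
    // ...then normally: #include "common/legacy_config_opts.h".
    // A few entries are inlined here so the sketch stands alone.
      OPTION(host, OPT_STR)
      OPTION(fsid, OPT_UUID)
      SAFE_OPTION(erasure_code_dir, OPT_STR)
    #undef OPTION
    #undef SAFE_OPTION
    };

    int main() {
      for (const auto& o : legacy_opts)
        std::cout << o.name << " (" << o.type << ")\n";
      return 0;
    }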
+OPTION(mon_pg_warn_min_per_osd, OPT_INT) // min # pgs per (in) osd before we warn the admin +OPTION(mon_pg_warn_max_per_osd, OPT_INT) // max # pgs per (in) osd before we warn the admin +OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT) // max skew few average in objects per pg +OPTION(mon_pg_warn_min_objects, OPT_INT) // do not warn below this object # +OPTION(mon_pg_warn_min_pool_objects, OPT_INT) // do not warn on pools below this object # +OPTION(mon_pg_check_down_all_threshold, OPT_FLOAT) // threshold of down osds after which we check all pgs +OPTION(mon_cache_target_full_warn_ratio, OPT_FLOAT) // position between pool cache_target_full and max where we start warning +OPTION(mon_osd_full_ratio, OPT_FLOAT) // what % full makes an OSD "full" +OPTION(mon_osd_backfillfull_ratio, OPT_FLOAT) // what % full makes an OSD backfill full (backfill halted) +OPTION(mon_osd_nearfull_ratio, OPT_FLOAT) // what % full makes an OSD near full +OPTION(mon_osd_initial_require_min_compat_client, OPT_STR) +OPTION(mon_allow_pool_delete, OPT_BOOL) // allow pool deletion +OPTION(mon_fake_pool_delete, OPT_BOOL) // fake pool deletion (add _DELETED suffix) +OPTION(mon_globalid_prealloc, OPT_U32) // how many globalids to prealloc +OPTION(mon_osd_report_timeout, OPT_INT) // grace period before declaring unresponsive OSDs dead +OPTION(mon_force_standby_active, OPT_BOOL) // should mons force standby-replay mds to be active +OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL) // warn if crush tunables are too old (older than mon_min_crush_required_version) +OPTION(mon_crush_min_required_version, OPT_STR) +OPTION(mon_warn_on_crush_straw_calc_version_zero, OPT_BOOL) // warn if crush straw_calc_version==0 +OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL) // warn if 'mon_osd_down_out_interval == 0' +OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL) +OPTION(mon_min_osdmap_epochs, OPT_INT) +OPTION(mon_max_pgmap_epochs, OPT_INT) +OPTION(mon_max_log_epochs, OPT_INT) +OPTION(mon_max_mdsmap_epochs, OPT_INT) +OPTION(mon_max_osd, OPT_INT) +OPTION(mon_probe_timeout, OPT_DOUBLE) +OPTION(mon_client_bytes, OPT_U64) // client msg data allowed in memory (in bytes) +OPTION(mon_mgr_proxy_client_bytes_ratio, OPT_FLOAT) // ratio of mon_client_bytes that can be consumed by proxied mgr commands before we error out to client +OPTION(mon_log_max_summary, OPT_U64) +OPTION(mon_daemon_bytes, OPT_U64) // mds, osd message memory cap (in bytes) +OPTION(mon_max_log_entries_per_event, OPT_INT) +OPTION(mon_reweight_min_pgs_per_osd, OPT_U64) // min pgs per osd for reweight-by-pg command +OPTION(mon_reweight_min_bytes_per_osd, OPT_U64) // min bytes per osd for reweight-by-utilization command +OPTION(mon_reweight_max_osds, OPT_INT) // max osds to change per reweight-by-* command +OPTION(mon_reweight_max_change, OPT_DOUBLE) +OPTION(mon_health_data_update_interval, OPT_FLOAT) +OPTION(mon_health_to_clog, OPT_BOOL) +OPTION(mon_health_to_clog_interval, OPT_INT) +OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE) +OPTION(mon_health_preluminous_compat, OPT_BOOL) +OPTION(mon_health_max_detail, OPT_INT) // max detailed pgs to report in health detail +OPTION(mon_data_avail_crit, OPT_INT) +OPTION(mon_data_avail_warn, OPT_INT) +OPTION(mon_data_size_warn, OPT_U64) // issue a warning when the monitor's data store goes over 15GB (in bytes) +OPTION(mon_warn_not_scrubbed, OPT_INT) +OPTION(mon_warn_not_deep_scrubbed, OPT_INT) +OPTION(mon_scrub_interval, OPT_INT) // once a day +OPTION(mon_scrub_timeout, OPT_INT) // let's give it 5 minutes; why not. 
+OPTION(mon_scrub_max_keys, OPT_INT) // max number of keys to scrub each time +OPTION(mon_scrub_inject_crc_mismatch, OPT_DOUBLE) // probability of injected crc mismatch [0.0, 1.0] +OPTION(mon_scrub_inject_missing_keys, OPT_DOUBLE) // probability of injected missing keys [0.0, 1.0] +OPTION(mon_config_key_max_entry_size, OPT_INT) // max num bytes per config-key entry +OPTION(mon_sync_timeout, OPT_DOUBLE) +OPTION(mon_sync_max_payload_size, OPT_U32) // max size for a sync chunk payload (say) +OPTION(mon_sync_debug, OPT_BOOL) // enable sync-specific debug +OPTION(mon_inject_sync_get_chunk_delay, OPT_DOUBLE) // inject N second delay on each get_chunk request +OPTION(mon_osd_min_down_reporters, OPT_INT) // number of OSDs from different subtrees who need to report a down OSD for it to count +OPTION(mon_osd_reporter_subtree_level , OPT_STR) // in which level of parent bucket the reporters are counted +OPTION(mon_osd_force_trim_to, OPT_INT) // force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous) +OPTION(mon_mds_force_trim_to, OPT_INT) // force mon to trim mdsmaps to this point (dangerous) +OPTION(mon_mds_skip_sanity, OPT_BOOL) // skip safety assertions on FSMap (in case of bugs where we want to continue anyway) + +// monitor debug options +OPTION(mon_debug_deprecated_as_obsolete, OPT_BOOL) // consider deprecated commands as obsolete + +// dump transactions +OPTION(mon_debug_dump_transactions, OPT_BOOL) +OPTION(mon_debug_dump_json, OPT_BOOL) +OPTION(mon_debug_dump_location, OPT_STR) +OPTION(mon_debug_no_require_luminous, OPT_BOOL) +OPTION(mon_debug_no_require_bluestore_for_ec_overwrites, OPT_BOOL) +OPTION(mon_debug_no_initial_persistent_features, OPT_BOOL) +OPTION(mon_inject_transaction_delay_max, OPT_DOUBLE) // seconds +OPTION(mon_inject_transaction_delay_probability, OPT_DOUBLE) // range [0, 1] + +OPTION(mon_sync_provider_kill_at, OPT_INT) // kill the sync provider at a specific point in the work flow +OPTION(mon_sync_requester_kill_at, OPT_INT) // kill the sync requester at a specific point in the work flow +OPTION(mon_force_quorum_join, OPT_BOOL) // force monitor to join quorum even if it has been previously removed from the map +OPTION(mon_keyvaluedb, OPT_STR) // type of keyvaluedb backend + +// UNSAFE -- TESTING ONLY! 
Allows addition of a cache tier with preexisting snaps +OPTION(mon_debug_unsafe_allow_tier_with_nonempty_snaps, OPT_BOOL) +OPTION(mon_osd_blacklist_default_expire, OPT_DOUBLE) // default one hour +OPTION(mon_osd_crush_smoke_test, OPT_BOOL) + +OPTION(paxos_stash_full_interval, OPT_INT) // how often (in commits) to stash a full copy of the PaxosService state +OPTION(paxos_max_join_drift, OPT_INT) // max paxos iterations before we must first sync the monitor stores +OPTION(paxos_propose_interval, OPT_DOUBLE) // gather updates for this long before proposing a map update +OPTION(paxos_min_wait, OPT_DOUBLE) // min time to gather updates for after period of inactivity +OPTION(paxos_min, OPT_INT) // minimum number of paxos states to keep around +OPTION(paxos_trim_min, OPT_INT) // number of extra proposals tolerated before trimming +OPTION(paxos_trim_max, OPT_INT) // max number of extra proposals to trim at a time +OPTION(paxos_service_trim_min, OPT_INT) // minimum amount of versions to trigger a trim (0 disables it) +OPTION(paxos_service_trim_max, OPT_INT) // maximum amount of versions to trim during a single proposal (0 disables it) +OPTION(paxos_kill_at, OPT_INT) +OPTION(auth_cluster_required, OPT_STR) // required of mon, mds, osd daemons +OPTION(auth_service_required, OPT_STR) // required by daemons of clients +OPTION(auth_client_required, OPT_STR) // what clients require of daemons +OPTION(auth_supported, OPT_STR) // deprecated; default value for above if they are not defined. +OPTION(max_rotating_auth_attempts, OPT_INT) +OPTION(cephx_require_signatures, OPT_BOOL) // If true, don't talk to Cephx partners if they don't support message signing; off by default +OPTION(cephx_cluster_require_signatures, OPT_BOOL) +OPTION(cephx_service_require_signatures, OPT_BOOL) +OPTION(cephx_sign_messages, OPT_BOOL) // Default to signing session messages if supported +OPTION(auth_mon_ticket_ttl, OPT_DOUBLE) +OPTION(auth_service_ticket_ttl, OPT_DOUBLE) +OPTION(auth_debug, OPT_BOOL) // if true, assert when weird things happen +OPTION(mon_client_hunt_parallel, OPT_U32) // how many mons to try to connect to in parallel during hunt +OPTION(mon_client_hunt_interval, OPT_DOUBLE) // try new mon every N seconds until we connect +OPTION(mon_client_ping_interval, OPT_DOUBLE) // ping every N seconds +OPTION(mon_client_ping_timeout, OPT_DOUBLE) // fail if we don't hear back +OPTION(mon_client_hunt_interval_backoff, OPT_DOUBLE) // each time we reconnect to a monitor, double our timeout +OPTION(mon_client_hunt_interval_max_multiple, OPT_DOUBLE) // up to a max of 10*default (30 seconds) +OPTION(mon_client_max_log_entries_per_message, OPT_INT) +OPTION(mon_max_pool_pg_num, OPT_INT) +OPTION(mon_pool_quota_warn_threshold, OPT_INT) // percent of quota at which to issue warnings +OPTION(mon_pool_quota_crit_threshold, OPT_INT) // percent of quota at which to issue errors +OPTION(client_cache_size, OPT_INT) +OPTION(client_cache_mid, OPT_FLOAT) +OPTION(client_use_random_mds, OPT_BOOL) +OPTION(client_mount_timeout, OPT_DOUBLE) +OPTION(client_tick_interval, OPT_DOUBLE) +OPTION(client_trace, OPT_STR) +OPTION(client_readahead_min, OPT_LONGLONG) // readahead at _least_ this much. 
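The mon_client_hunt_interval* comments above describe a capped exponential backoff: the hunt interval is multiplied by mon_client_hunt_interval_backoff after each failed attempt, up to mon_client_hunt_interval_max_multiple times the base interval. A minimal sketch of that schedule follows; the loop structure and the concrete values are placeholders for illustration, not the shipped defaults or MonClient's actual code.

    #include <algorithm>
    #include <cstdio>

    int main() {
      const double hunt_interval = 3.0;   // mon_client_hunt_interval (value assumed)
      const double backoff       = 2.0;   // mon_client_hunt_interval_backoff (assumed)
      const double max_multiple  = 10.0;  // mon_client_hunt_interval_max_multiple (assumed)
      double wait = hunt_interval;
      for (int failed_hunts = 0; failed_hunts < 8; ++failed_hunts) {
        std::printf("hunt attempt %d: wait %.1fs before trying another mon\n",
                    failed_hunts, wait);
        // Multiply by the backoff after each failed hunt, capped at
        // max_multiple times the base interval.
        wait = std::min(wait * backoff, hunt_interval * max_multiple);
      }
      return 0;
    }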
+OPTION(client_readahead_max_bytes, OPT_LONGLONG) // default unlimited +OPTION(client_readahead_max_periods, OPT_LONGLONG) // as multiple of file layout period (object size * num stripes) +OPTION(client_reconnect_stale, OPT_BOOL) // automatically reconnect stale session +OPTION(client_snapdir, OPT_STR) +OPTION(client_mountpoint, OPT_STR) +OPTION(client_mount_uid, OPT_INT) +OPTION(client_mount_gid, OPT_INT) +OPTION(client_notify_timeout, OPT_INT) // in seconds +OPTION(osd_client_watch_timeout, OPT_INT) // in seconds +OPTION(client_caps_release_delay, OPT_INT) // in seconds +OPTION(client_quota_df, OPT_BOOL) // use quota for df on subdir mounts +OPTION(client_oc, OPT_BOOL) +OPTION(client_oc_size, OPT_INT) // MB * n +OPTION(client_oc_max_dirty, OPT_INT) // MB * n (dirty OR tx.. bigish) +OPTION(client_oc_target_dirty, OPT_INT) // target dirty (keep this smallish) +OPTION(client_oc_max_dirty_age, OPT_DOUBLE) // max age in cache before writeback +OPTION(client_oc_max_objects, OPT_INT) // max objects in cache +OPTION(client_debug_getattr_caps, OPT_BOOL) // check if MDS reply contains wanted caps +OPTION(client_debug_force_sync_read, OPT_BOOL) // always read synchronously (go to osds) +OPTION(client_debug_inject_tick_delay, OPT_INT) // delay the client tick for a number of seconds +OPTION(client_max_inline_size, OPT_U64) +OPTION(client_inject_release_failure, OPT_BOOL) // synthetic client bug for testing +OPTION(client_inject_fixed_oldest_tid, OPT_BOOL) // synthetic client bug for testing +OPTION(client_metadata, OPT_STR) +OPTION(client_acl_type, OPT_STR) +OPTION(client_permissions, OPT_BOOL) +OPTION(client_dirsize_rbytes, OPT_BOOL) + +// note: the max amount of "in flight" dirty data is roughly (max - target) +OPTION(fuse_use_invalidate_cb, OPT_BOOL) // use fuse 2.8+ invalidate callback to keep page cache consistent +OPTION(fuse_disable_pagecache, OPT_BOOL) +OPTION(fuse_allow_other, OPT_BOOL) +OPTION(fuse_default_permissions, OPT_BOOL) +OPTION(fuse_big_writes, OPT_BOOL) +OPTION(fuse_atomic_o_trunc, OPT_BOOL) +OPTION(fuse_debug, OPT_BOOL) +OPTION(fuse_multithreaded, OPT_BOOL) +OPTION(fuse_require_active_mds, OPT_BOOL) // if ceph_fuse requires active mds server +OPTION(fuse_syncfs_on_mksnap, OPT_BOOL) +OPTION(fuse_set_user_groups, OPT_BOOL) // if ceph_fuse fills in group lists or not + +OPTION(client_try_dentry_invalidate, OPT_BOOL) // the client should try to use dentry invaldation instead of remounting, on kernels it believes that will work for +OPTION(client_die_on_failed_remount, OPT_BOOL) +OPTION(client_check_pool_perm, OPT_BOOL) +OPTION(client_use_faked_inos, OPT_BOOL) +OPTION(client_mds_namespace, OPT_STR) + +OPTION(crush_location, OPT_STR) // whitespace-separated list of key=value pairs describing crush location +OPTION(crush_location_hook, OPT_STR) +OPTION(crush_location_hook_timeout, OPT_INT) + +OPTION(objecter_tick_interval, OPT_DOUBLE) +OPTION(objecter_timeout, OPT_DOUBLE) // before we ask for a map +OPTION(objecter_inflight_op_bytes, OPT_U64) // max in-flight data (both directions) +OPTION(objecter_inflight_ops, OPT_U64) // max in-flight ios +OPTION(objecter_completion_locks_per_session, OPT_U64) // num of completion locks per each session, for serializing same object responses +OPTION(objecter_inject_no_watch_ping, OPT_BOOL) // suppress watch pings +OPTION(objecter_retry_writes_after_first_reply, OPT_BOOL) // ignore the first reply for each write, and resend the osd op instead +OPTION(objecter_debug_inject_relock_delay, OPT_BOOL) + +// Max number of deletes at once in a single 
Filer::purge call +OPTION(filer_max_purge_ops, OPT_U32) +// Max number of truncate at once in a single Filer::truncate call +OPTION(filer_max_truncate_ops, OPT_U32) + +OPTION(journaler_write_head_interval, OPT_INT) +OPTION(journaler_prefetch_periods, OPT_INT) // * journal object size +OPTION(journaler_prezero_periods, OPT_INT) // * journal object size +OPTION(mds_data, OPT_STR) +OPTION(mds_max_file_size, OPT_U64) // Used when creating new CephFS. Change with 'ceph mds set max_file_size ' afterwards +// max xattr kv pairs size for each dir/file +OPTION(mds_max_xattr_pairs_size, OPT_U32) +OPTION(mds_cache_size, OPT_INT) +OPTION(mds_cache_mid, OPT_FLOAT) +OPTION(mds_max_file_recover, OPT_U32) +OPTION(mds_dir_max_commit_size, OPT_INT) // MB +OPTION(mds_dir_keys_per_op, OPT_INT) +OPTION(mds_decay_halflife, OPT_FLOAT) +OPTION(mds_beacon_interval, OPT_FLOAT) +OPTION(mds_beacon_grace, OPT_FLOAT) +OPTION(mds_enforce_unique_name, OPT_BOOL) +OPTION(mds_blacklist_interval, OPT_FLOAT) // how long to blacklist failed nodes + +OPTION(mds_session_timeout, OPT_FLOAT) // cap bits and leases time out if client idle +OPTION(mds_session_blacklist_on_timeout, OPT_BOOL) // whether to blacklist clients whose sessions are dropped due to timeout +OPTION(mds_session_blacklist_on_evict, OPT_BOOL) // whether to blacklist clients whose sessions are dropped via admin commands + +OPTION(mds_sessionmap_keys_per_op, OPT_U32) // how many sessions should I try to load/store in a single OMAP operation? +OPTION(mds_revoke_cap_timeout, OPT_FLOAT) // detect clients which aren't revoking caps +OPTION(mds_recall_state_timeout, OPT_FLOAT) // detect clients which aren't trimming caps +OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock +OPTION(mds_session_autoclose, OPT_FLOAT) // autoclose idle session +OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many' +OPTION(mds_health_cache_threshold, OPT_FLOAT) // warn on cache size if it exceeds mds_cache_size by this factor +OPTION(mds_reconnect_timeout, OPT_FLOAT) // seconds to wait for clients during mds restart + // make it (mds_session_timeout - mds_beacon_grace) +OPTION(mds_tick_interval, OPT_FLOAT) +OPTION(mds_dirstat_min_interval, OPT_FLOAT) // try to avoid propagating more often than this +OPTION(mds_scatter_nudge_interval, OPT_FLOAT) // how quickly dirstat changes propagate up the hierarchy +OPTION(mds_client_prealloc_inos, OPT_INT) +OPTION(mds_early_reply, OPT_BOOL) +OPTION(mds_default_dir_hash, OPT_INT) +OPTION(mds_log_pause, OPT_BOOL) +OPTION(mds_log_skip_corrupt_events, OPT_BOOL) +OPTION(mds_log_max_events, OPT_INT) +OPTION(mds_log_events_per_segment, OPT_INT) +OPTION(mds_log_segment_size, OPT_INT) // segment size for mds log, default to default file_layout_t +OPTION(mds_log_max_segments, OPT_U32) +OPTION(mds_log_max_expiring, OPT_INT) +OPTION(mds_bal_export_pin, OPT_BOOL) // allow clients to pin directory trees to ranks +OPTION(mds_bal_sample_interval, OPT_DOUBLE) // every 3 seconds +OPTION(mds_bal_replicate_threshold, OPT_FLOAT) +OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT) +OPTION(mds_bal_frag, OPT_BOOL) +OPTION(mds_bal_split_size, OPT_INT) +OPTION(mds_bal_split_rd, OPT_FLOAT) +OPTION(mds_bal_split_wr, OPT_FLOAT) +OPTION(mds_bal_split_bits, OPT_INT) +OPTION(mds_bal_merge_size, OPT_INT) +OPTION(mds_bal_interval, OPT_INT) // seconds +OPTION(mds_bal_fragment_interval, OPT_INT) // seconds +OPTION(mds_bal_fragment_size_max, OPT_INT) // order of magnitude higher than split size 
+OPTION(mds_bal_fragment_fast_factor, OPT_FLOAT) // multiple of size_max that triggers immediate split +OPTION(mds_bal_idle_threshold, OPT_FLOAT) +OPTION(mds_bal_max, OPT_INT) +OPTION(mds_bal_max_until, OPT_INT) +OPTION(mds_bal_mode, OPT_INT) +OPTION(mds_bal_min_rebalance, OPT_FLOAT) // must be this much above average before we export anything +OPTION(mds_bal_min_start, OPT_FLOAT) // if we need less than this, we don't do anything +OPTION(mds_bal_need_min, OPT_FLOAT) // take within this range of what we need +OPTION(mds_bal_need_max, OPT_FLOAT) +OPTION(mds_bal_midchunk, OPT_FLOAT) // any sub bigger than this taken in full +OPTION(mds_bal_minchunk, OPT_FLOAT) // never take anything smaller than this +OPTION(mds_bal_target_decay, OPT_DOUBLE) // target decay half-life in MDSMap (2x larger is approx. 2x slower) +OPTION(mds_replay_interval, OPT_FLOAT) // time to wait before starting replay again +OPTION(mds_shutdown_check, OPT_INT) +OPTION(mds_thrash_exports, OPT_INT) +OPTION(mds_thrash_fragments, OPT_INT) +OPTION(mds_dump_cache_on_map, OPT_BOOL) +OPTION(mds_dump_cache_after_rejoin, OPT_BOOL) +OPTION(mds_verify_scatter, OPT_BOOL) +OPTION(mds_debug_scatterstat, OPT_BOOL) +OPTION(mds_debug_frag, OPT_BOOL) +OPTION(mds_debug_auth_pins, OPT_BOOL) +OPTION(mds_debug_subtrees, OPT_BOOL) +OPTION(mds_kill_mdstable_at, OPT_INT) +OPTION(mds_kill_export_at, OPT_INT) +OPTION(mds_kill_import_at, OPT_INT) +OPTION(mds_kill_link_at, OPT_INT) +OPTION(mds_kill_rename_at, OPT_INT) +OPTION(mds_kill_openc_at, OPT_INT) +OPTION(mds_kill_journal_expire_at, OPT_INT) +OPTION(mds_kill_journal_replay_at, OPT_INT) +OPTION(mds_journal_format, OPT_U32) // Default to most recent JOURNAL_FORMAT_* +OPTION(mds_kill_create_at, OPT_INT) +OPTION(mds_inject_traceless_reply_probability, OPT_DOUBLE) /* percentage + of MDS modify replies to skip sending the + client a trace on [0-1]*/ +OPTION(mds_wipe_sessions, OPT_BOOL) +OPTION(mds_wipe_ino_prealloc, OPT_BOOL) +OPTION(mds_skip_ino, OPT_INT) +OPTION(mds_standby_for_name, OPT_STR) +OPTION(mds_standby_for_rank, OPT_INT) +OPTION(mds_standby_for_fscid, OPT_INT) +OPTION(mds_standby_replay, OPT_BOOL) +OPTION(mds_enable_op_tracker, OPT_BOOL) // enable/disable MDS op tracking +OPTION(mds_op_history_size, OPT_U32) // Max number of completed ops to track +OPTION(mds_op_history_duration, OPT_U32) // Oldest completed op to track +OPTION(mds_op_complaint_time, OPT_FLOAT) // how many seconds old makes an op complaint-worthy +OPTION(mds_op_log_threshold, OPT_INT) // how many op log messages to show in one go +OPTION(mds_snap_min_uid, OPT_U32) // The minimum UID required to create a snapshot +OPTION(mds_snap_max_uid, OPT_U32) // The maximum UID allowed to create a snapshot +OPTION(mds_snap_rstat, OPT_BOOL) // enable/disbale nested stat for snapshot +OPTION(mds_verify_backtrace, OPT_U32) +// detect clients which aren't trimming completed requests +OPTION(mds_max_completed_flushes, OPT_U32) +OPTION(mds_max_completed_requests, OPT_U32) + +OPTION(mds_action_on_write_error, OPT_U32) // 0: ignore; 1: force readonly; 2: crash +OPTION(mds_mon_shutdown_timeout, OPT_DOUBLE) + +// Maximum number of concurrent stray files to purge +OPTION(mds_max_purge_files, OPT_U32) +// Maximum number of concurrent RADOS ops to issue in purging +OPTION(mds_max_purge_ops, OPT_U32) +// Maximum number of concurrent RADOS ops to issue in purging, scaled by PG count +OPTION(mds_max_purge_ops_per_pg, OPT_FLOAT) + +OPTION(mds_purge_queue_busy_flush_period, OPT_FLOAT) + +OPTION(mds_root_ino_uid, OPT_INT) // The UID of / on new 
filesystems +OPTION(mds_root_ino_gid, OPT_INT) // The GID of / on new filesystems + +OPTION(mds_max_scrub_ops_in_progress, OPT_INT) // the number of simultaneous scrubs allowed + +// Maximum number of damaged frags/dentries before whole MDS rank goes damaged +OPTION(mds_damage_table_max_entries, OPT_INT) + +// Maximum increment for client writable range, counted by number of objects +OPTION(mds_client_writeable_range_max_inc_objs, OPT_U32) + +// verify backend can support configured max object name length +OPTION(osd_check_max_object_name_len_on_startup, OPT_BOOL) + +// Maximum number of backfills to or from a single osd +OPTION(osd_max_backfills, OPT_U64) + +// Minimum recovery priority (255 = max, smaller = lower) +OPTION(osd_min_recovery_priority, OPT_INT) + +// Seconds to wait before retrying refused backfills +OPTION(osd_backfill_retry_interval, OPT_DOUBLE) + +// Seconds to wait before retrying refused recovery +OPTION(osd_recovery_retry_interval, OPT_DOUBLE) + +// max agent flush ops +OPTION(osd_agent_max_ops, OPT_INT) +OPTION(osd_agent_max_low_ops, OPT_INT) +OPTION(osd_agent_min_evict_effort, OPT_FLOAT) +OPTION(osd_agent_quantize_effort, OPT_FLOAT) +OPTION(osd_agent_delay_time, OPT_FLOAT) + +// osd ignore history.last_epoch_started in find_best_info +OPTION(osd_find_best_info_ignore_history_les, OPT_BOOL) + +// decay atime and hist histograms after how many objects go by +OPTION(osd_agent_hist_halflife, OPT_INT) + +// must be this amount over the threshold to enable, +// this amount below the threshold to disable. +OPTION(osd_agent_slop, OPT_FLOAT) + +OPTION(osd_uuid, OPT_UUID) +OPTION(osd_data, OPT_STR) +OPTION(osd_journal, OPT_STR) +OPTION(osd_journal_size, OPT_INT) // in mb +OPTION(osd_journal_flush_on_shutdown, OPT_BOOL) // Flush journal to data store on shutdown +// flags for specific control purpose during osd mount() process. +// e.g., can be 1 to skip over replaying journal +// or 2 to skip over mounting omap or 3 to skip over both. +// This might be helpful in case the journal is totally corrupted +// and we still want to bring the osd daemon back normally, etc. +OPTION(osd_os_flags, OPT_U32) +OPTION(osd_max_write_size, OPT_INT) +OPTION(osd_max_pgls, OPT_U64) // max number of pgls entries to return +OPTION(osd_client_message_size_cap, OPT_U64) // client data allowed in-memory (in bytes) +OPTION(osd_client_message_cap, OPT_U64) // num client messages allowed in-memory +OPTION(osd_pg_bits, OPT_INT) // bits per osd +OPTION(osd_pgp_bits, OPT_INT) // bits per osd +OPTION(osd_crush_update_weight_set, OPT_BOOL) // update weight set while updating weights +OPTION(osd_crush_chooseleaf_type, OPT_INT) // 1 = host +OPTION(osd_pool_use_gmt_hitset, OPT_BOOL) // try to use gmt for hitset archive names if all osds in cluster support it. +OPTION(osd_crush_update_on_start, OPT_BOOL) +OPTION(osd_class_update_on_start, OPT_BOOL) // automatically set device class on start +OPTION(osd_crush_initial_weight, OPT_DOUBLE) // if >=0, the initial weight is for newly added osds. +OPTION(osd_pool_default_crush_rule, OPT_INT) +OPTION(osd_pool_erasure_code_stripe_unit, OPT_U32) // in bytes +OPTION(osd_pool_default_size, OPT_INT) +OPTION(osd_pool_default_min_size, OPT_INT) // 0 means no specific default; ceph will use size-size/2 +OPTION(osd_pool_default_pg_num, OPT_INT) // number of PGs for new pools. Configure in global or mon section of ceph.conf +OPTION(osd_pool_default_pgp_num, OPT_INT) // number of PGs for placement purposes. 
Should be equal to pg_num +OPTION(osd_pool_default_type, OPT_STR) +OPTION(osd_pool_default_erasure_code_profile, OPT_STR) // default properties of osd pool create +OPTION(osd_erasure_code_plugins, OPT_STR) // list of erasure code plugins + +// Allows the "peered" state for recovery and backfill below min_size +OPTION(osd_allow_recovery_below_min_size, OPT_BOOL) + +OPTION(osd_pool_default_flags, OPT_INT) // default flags for new pools +OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL) // use new pg hashing to prevent pool/pg overlap +OPTION(osd_pool_default_flag_nodelete, OPT_BOOL) // pool can't be deleted +OPTION(osd_pool_default_flag_nopgchange, OPT_BOOL) // pool's pg and pgp num can't be changed +OPTION(osd_pool_default_flag_nosizechange, OPT_BOOL) // pool's size and min size can't be changed +OPTION(osd_pool_default_hit_set_bloom_fpp, OPT_FLOAT) +OPTION(osd_pool_default_cache_target_dirty_ratio, OPT_FLOAT) +OPTION(osd_pool_default_cache_target_dirty_high_ratio, OPT_FLOAT) +OPTION(osd_pool_default_cache_target_full_ratio, OPT_FLOAT) +OPTION(osd_pool_default_cache_min_flush_age, OPT_INT) // seconds +OPTION(osd_pool_default_cache_min_evict_age, OPT_INT) // seconds +OPTION(osd_pool_default_cache_max_evict_check_size, OPT_INT) // max size to check for eviction +OPTION(osd_hit_set_min_size, OPT_INT) // min target size for a HitSet +OPTION(osd_hit_set_max_size, OPT_INT) // max target size for a HitSet +OPTION(osd_hit_set_namespace, OPT_STR) // rados namespace for hit_set tracking + +// conservative default throttling values +OPTION(osd_tier_promote_max_objects_sec, OPT_U64) +OPTION(osd_tier_promote_max_bytes_sec, OPT_U64) + +OPTION(osd_tier_default_cache_mode, OPT_STR) +OPTION(osd_tier_default_cache_hit_set_count, OPT_INT) +OPTION(osd_tier_default_cache_hit_set_period, OPT_INT) +OPTION(osd_tier_default_cache_hit_set_type, OPT_STR) +OPTION(osd_tier_default_cache_min_read_recency_for_promote, OPT_INT) // number of recent HitSets the object must appear in to be promoted (on read) +OPTION(osd_tier_default_cache_min_write_recency_for_promote, OPT_INT) // number of recent HitSets the object must appear in to be promoted (on write) +OPTION(osd_tier_default_cache_hit_set_grade_decay_rate, OPT_INT) +OPTION(osd_tier_default_cache_hit_set_search_last_n, OPT_INT) + +OPTION(osd_map_dedup, OPT_BOOL) +OPTION(osd_map_max_advance, OPT_INT) // make this < cache_size! 
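The *_recency_for_promote options above count how many of the most recent HitSets an object must appear in before cache tiering promotes it. The rough sketch below illustrates that membership test under the assumption that the object must be present in each of the last `recency` HitSets; HitSet is modelled as a plain callable here, not the OSD's real data structure, and the edge-case behaviour is assumed.

    #include <cstdio>
    #include <functional>
    #include <string>
    #include <vector>

    using HitSet = std::function<bool(const std::string& oid)>;

    bool should_promote(const std::string& oid,
                        const std::vector<HitSet>& recent_hitsets,  // newest first
                        unsigned recency) {
      if (recency == 0)
        return true;                      // assumed: zero recency promotes unconditionally
      if (recency > recent_hitsets.size())
        return false;                     // not enough HitSet history yet
      for (unsigned i = 0; i < recency; ++i)
        if (!recent_hitsets[i](oid))
          return false;                   // missing from one of the last N sets
      return true;
    }

    int main() {
      std::vector<HitSet> sets = {
        [](const std::string& o) { return o == "obj_a"; },                  // newest
        [](const std::string& o) { return o == "obj_a" || o == "obj_b"; },  // older
      };
      std::printf("promote obj_a (recency=2)? %d\n", should_promote("obj_a", sets, 2) ? 1 : 0);
      std::printf("promote obj_b (recency=2)? %d\n", should_promote("obj_b", sets, 2) ? 1 : 0);
      return 0;
    }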
+OPTION(osd_map_cache_size, OPT_INT) +OPTION(osd_map_message_max, OPT_INT) // max maps per MOSDMap message +OPTION(osd_map_share_max_epochs, OPT_INT) // cap on # of inc maps we send to peers, clients +OPTION(osd_inject_bad_map_crc_probability, OPT_FLOAT) +OPTION(osd_inject_failure_on_pg_removal, OPT_BOOL) +// shutdown the OSD if its status flips more than max_markdown_count times in the most recent max_markdown_period seconds +OPTION(osd_max_markdown_period, OPT_INT) +OPTION(osd_max_markdown_count, OPT_INT) + +OPTION(osd_peering_wq_threads, OPT_INT) +OPTION(osd_peering_wq_batch_size, OPT_U64) +OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64) +OPTION(osd_op_pq_min_cost, OPT_U64) +OPTION(osd_disk_threads, OPT_INT) +OPTION(osd_disk_thread_ioprio_class, OPT_STR) // one of: rt (realtime), be (best effort), idle +OPTION(osd_disk_thread_ioprio_priority, OPT_INT) // 0-7 +OPTION(osd_recover_clone_overlap, OPT_BOOL) // preserve clone_overlap during recovery/migration +OPTION(osd_op_num_threads_per_shard, OPT_INT) +OPTION(osd_op_num_threads_per_shard_hdd, OPT_INT) +OPTION(osd_op_num_threads_per_shard_ssd, OPT_INT) +OPTION(osd_op_num_shards, OPT_INT) +OPTION(osd_op_num_shards_hdd, OPT_INT) +OPTION(osd_op_num_shards_ssd, OPT_INT) + +// PrioritizedQueue (prio), Weighted Priority Queue (wpq; default), +// mclock_opclass, mclock_client, or debug_random. "mclock_opclass" +// and "mclock_client" are based on the mClock/dmClock algorithm +// (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the +// class the operation belongs to. "mclock_client" does the same but +// also works to enforce fairness between clients. "debug_random" +// chooses among all four with equal probability. +OPTION(osd_op_queue, OPT_STR) + +OPTION(osd_op_queue_cut_off, OPT_STR) // Min priority to go to strict queue. (low, high) + +// mClock priority queue parameters for five types of ops +OPTION(osd_op_queue_mclock_client_op_res, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_client_op_wgt, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_client_op_lim, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_osd_subop_res, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_osd_subop_wgt, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_osd_subop_lim, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_snap_res, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_snap_wgt, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_snap_lim, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_recov_res, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_recov_wgt, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_recov_lim, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_scrub_res, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_scrub_wgt, OPT_DOUBLE) +OPTION(osd_op_queue_mclock_scrub_lim, OPT_DOUBLE) + +OPTION(osd_ignore_stale_divergent_priors, OPT_BOOL) // do not assert on divergent_prior entries which aren't in the log and whose on-disk objects are newer + +// Set to true for testing. Users should NOT set this. +// If set to true even after reading enough shards to +// decode the object, any error will be reported.
+OPTION(osd_read_ec_check_for_errors, OPT_BOOL) // return error if any ec shard has an error + +// Only use clone_overlap for recovery if there are fewer than +// osd_recover_clone_overlap_limit entries in the overlap set +OPTION(osd_recover_clone_overlap_limit, OPT_INT) + +OPTION(osd_backfill_scan_min, OPT_INT) +OPTION(osd_backfill_scan_max, OPT_INT) +OPTION(osd_op_thread_timeout, OPT_INT) +OPTION(osd_op_thread_suicide_timeout, OPT_INT) +OPTION(osd_recovery_thread_timeout, OPT_INT) +OPTION(osd_recovery_thread_suicide_timeout, OPT_INT) +OPTION(osd_recovery_sleep, OPT_FLOAT) // seconds to sleep between recovery ops +OPTION(osd_recovery_sleep_hdd, OPT_FLOAT) +OPTION(osd_recovery_sleep_ssd, OPT_FLOAT) +OPTION(osd_snap_trim_sleep, OPT_DOUBLE) +OPTION(osd_scrub_invalid_stats, OPT_BOOL) +OPTION(osd_remove_thread_timeout, OPT_INT) +OPTION(osd_remove_thread_suicide_timeout, OPT_INT) +OPTION(osd_command_thread_timeout, OPT_INT) +OPTION(osd_command_thread_suicide_timeout, OPT_INT) +OPTION(osd_heartbeat_addr, OPT_ADDR) +OPTION(osd_heartbeat_interval, OPT_INT) // (seconds) how often we ping peers + +// (seconds) how long before we decide a peer has failed +// This setting is read by the MONs and OSDs and has to be set to a equal value in both settings of the configuration +OPTION(osd_heartbeat_grace, OPT_INT) +OPTION(osd_heartbeat_min_peers, OPT_INT) // minimum number of peers +OPTION(osd_heartbeat_use_min_delay_socket, OPT_BOOL) // prio the heartbeat tcp socket and set dscp as CS6 on it if true +OPTION(osd_heartbeat_min_size, OPT_INT) // the minimum size of OSD heartbeat messages to send + +// max number of parallel snap trims/pg +OPTION(osd_pg_max_concurrent_snap_trims, OPT_U64) +// max number of trimming pgs +OPTION(osd_max_trimming_pgs, OPT_U64) + +// minimum number of peers that must be reachable to mark ourselves +// back up after being wrongly marked down. +OPTION(osd_heartbeat_min_healthy_ratio, OPT_FLOAT) + +OPTION(osd_mon_heartbeat_interval, OPT_INT) // (seconds) how often to ping monitor if no peers +OPTION(osd_mon_report_interval_max, OPT_INT) +OPTION(osd_mon_report_interval_min, OPT_INT) // pg stats, failures, up_thru, boot. 
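For the osd_op_queue_mclock_* knobs listed above, each op class carries three values in mClock/dmClock terms: a reservation (minimum service share), a weight (proportional share once reservations are satisfied) and a limit (upper bound). The sketch below only groups and names those knobs to make the naming scheme explicit; the numbers are placeholders, not the shipped defaults, and the struct is not the OSD's actual queue implementation.

    #include <cstdio>

    struct MClockClassParams {
      const char* op_class;  // matches the osd_op_queue_mclock_<class>_* prefix
      double res;            // ..._res: reservation (minimum share)
      double wgt;            // ..._wgt: weight (proportional share)
      double lim;            // ..._lim: limit (upper bound); 0.0 here is a placeholder
    };

    int main() {
      const MClockClassParams classes[] = {
        { "client_op", 0.0, 1.0, 0.0 },
        { "osd_subop", 0.0, 1.0, 0.0 },
        { "snap",      0.0, 1.0, 0.0 },
        { "recov",     0.0, 1.0, 0.0 },
        { "scrub",     0.0, 1.0, 0.0 },
      };
      for (const auto& c : classes)
        std::printf("%-9s res=%.1f wgt=%.1f lim=%.1f\n", c.op_class, c.res, c.wgt, c.lim);
      return 0;
    }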
+OPTION(osd_mon_report_max_in_flight, OPT_INT) // max updates in flight +OPTION(osd_beacon_report_interval, OPT_INT) // (second) how often to send beacon message to monitor +OPTION(osd_pg_stat_report_interval_max, OPT_INT) // report pg stats for any given pg at least this often +OPTION(osd_mon_ack_timeout, OPT_DOUBLE) // time out a mon if it doesn't ack stats +OPTION(osd_stats_ack_timeout_factor, OPT_DOUBLE) // multiples of mon_ack_timeout +OPTION(osd_stats_ack_timeout_decay, OPT_DOUBLE) +OPTION(osd_default_data_pool_replay_window, OPT_INT) +OPTION(osd_auto_mark_unfound_lost, OPT_BOOL) +OPTION(osd_recovery_delay_start, OPT_FLOAT) +OPTION(osd_recovery_max_active, OPT_U64) +OPTION(osd_recovery_max_single_start, OPT_U64) +OPTION(osd_recovery_max_chunk, OPT_U64) // max size of push chunk +OPTION(osd_recovery_max_omap_entries_per_chunk, OPT_U64) // max number of omap entries per chunk; 0 to disable limit +OPTION(osd_copyfrom_max_chunk, OPT_U64) // max size of a COPYFROM chunk +OPTION(osd_push_per_object_cost, OPT_U64) // push cost per object +OPTION(osd_max_push_cost, OPT_U64) // max size of push message +OPTION(osd_max_push_objects, OPT_U64) // max objects in single push op +OPTION(osd_recovery_forget_lost_objects, OPT_BOOL) // off for now +OPTION(osd_max_scrubs, OPT_INT) +OPTION(osd_scrub_during_recovery, OPT_BOOL) // Allow new scrubs to start while recovery is active on the OSD +OPTION(osd_scrub_begin_hour, OPT_INT) +OPTION(osd_scrub_end_hour, OPT_INT) +OPTION(osd_scrub_load_threshold, OPT_FLOAT) +OPTION(osd_scrub_min_interval, OPT_FLOAT) // if load is low +OPTION(osd_scrub_max_interval, OPT_FLOAT) // regardless of load +OPTION(osd_scrub_interval_randomize_ratio, OPT_FLOAT) // randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio)) +OPTION(osd_scrub_backoff_ratio, OPT_DOUBLE) // the probability to back off the scheduled scrub +OPTION(osd_scrub_chunk_min, OPT_INT) +OPTION(osd_scrub_chunk_max, OPT_INT) +OPTION(osd_scrub_sleep, OPT_FLOAT) // sleep between [deep]scrub ops +OPTION(osd_scrub_auto_repair, OPT_BOOL) // whether auto-repair inconsistencies upon deep-scrubbing +OPTION(osd_scrub_auto_repair_num_errors, OPT_U32) // only auto-repair when number of errors is below this threshold +OPTION(osd_deep_scrub_interval, OPT_FLOAT) // once a week +OPTION(osd_deep_scrub_randomize_ratio, OPT_FLOAT) // scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs are deep) +OPTION(osd_deep_scrub_stride, OPT_INT) +OPTION(osd_deep_scrub_update_digest_min_age, OPT_INT) // objects must be this old (seconds) before we update the whole-object digest on scrub +OPTION(osd_class_dir, OPT_STR) // where rados plugins are stored +OPTION(osd_open_classes_on_start, OPT_BOOL) +OPTION(osd_class_load_list, OPT_STR) // list of object classes allowed to be loaded (allow all: *) +OPTION(osd_class_default_list, OPT_STR) // list of object classes with default execute perm (allow all: *) +OPTION(osd_check_for_log_corruption, OPT_BOOL) +OPTION(osd_use_stale_snap, OPT_BOOL) +OPTION(osd_rollback_to_cluster_snap, OPT_STR) +OPTION(osd_default_notify_timeout, OPT_U32) // default notify timeout in seconds +OPTION(osd_kill_backfill_at, OPT_INT) + +// Bounds how infrequently a new map epoch will be persisted for a pg +OPTION(osd_pg_epoch_persisted_max_stale, OPT_U32) // make this < map_cache_size! 
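osd_scrub_interval_randomize_ratio above spreads scheduled scrubs over the half-open window [min, min*(1+randomize_ratio)). A small sketch of sampling from that window follows; the concrete values are placeholders and the sampling code is illustrative, not the OSD's actual scheduler.

    #include <cstdio>
    #include <random>

    int main() {
      const double osd_scrub_min_interval = 24 * 3600.0;        // value assumed (one day)
      const double osd_scrub_interval_randomize_ratio = 0.5;    // value assumed
      std::mt19937 rng{std::random_device{}()};
      // uniform_real_distribution draws from [a, b), matching the documented span.
      std::uniform_real_distribution<double> span(
          osd_scrub_min_interval,
          osd_scrub_min_interval * (1.0 + osd_scrub_interval_randomize_ratio));
      std::printf("next scrub scheduled in %.0f seconds\n", span(rng));
      return 0;
    }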
+ +OPTION(osd_min_pg_log_entries, OPT_U32) // number of entries to keep in the pg log when trimming it +OPTION(osd_max_pg_log_entries, OPT_U32) // max entries, say when degraded, before we trim +OPTION(osd_pg_log_dups_tracked, OPT_U32) // how many versions back to track combined in both pglog's regular + dup logs +OPTION(osd_force_recovery_pg_log_entries_factor, OPT_FLOAT) // max entries factor before force recovery +OPTION(osd_pg_log_trim_min, OPT_U32) +OPTION(osd_op_complaint_time, OPT_FLOAT) // how many seconds old makes an op complaint-worthy +OPTION(osd_command_max_records, OPT_INT) +OPTION(osd_max_pg_blocked_by, OPT_U32) // max peer osds to report that are blocking our progress +OPTION(osd_op_log_threshold, OPT_INT) // how many op log messages to show in one go +OPTION(osd_verify_sparse_read_holes, OPT_BOOL) // read fiemap-reported holes and verify they are zeros +OPTION(osd_backoff_on_unfound, OPT_BOOL) // object unfound +OPTION(osd_backoff_on_degraded, OPT_BOOL) // [mainly for debug?] object unreadable/writeable +OPTION(osd_backoff_on_down, OPT_BOOL) // pg in down/incomplete state +OPTION(osd_backoff_on_peering, OPT_BOOL) // [debug] pg peering +OPTION(osd_debug_crash_on_ignored_backoff, OPT_BOOL) // crash osd if client ignores a backoff; useful for debugging +OPTION(osd_debug_inject_dispatch_delay_probability, OPT_DOUBLE) +OPTION(osd_debug_inject_dispatch_delay_duration, OPT_DOUBLE) +OPTION(osd_debug_drop_ping_probability, OPT_DOUBLE) +OPTION(osd_debug_drop_ping_duration, OPT_INT) +OPTION(osd_debug_op_order, OPT_BOOL) +OPTION(osd_debug_verify_missing_on_start, OPT_BOOL) +OPTION(osd_debug_scrub_chance_rewrite_digest, OPT_U64) +OPTION(osd_debug_verify_snaps_on_info, OPT_BOOL) +OPTION(osd_debug_verify_stray_on_activate, OPT_BOOL) +OPTION(osd_debug_skip_full_check_in_backfill_reservation, OPT_BOOL) +OPTION(osd_debug_reject_backfill_probability, OPT_DOUBLE) +OPTION(osd_debug_inject_copyfrom_error, OPT_BOOL) // inject failure during copyfrom completion +OPTION(osd_debug_misdirected_ops, OPT_BOOL) +OPTION(osd_debug_skip_full_check_in_recovery, OPT_BOOL) +OPTION(osd_debug_random_push_read_error, OPT_DOUBLE) +OPTION(osd_debug_verify_cached_snaps, OPT_BOOL) +OPTION(osd_enable_op_tracker, OPT_BOOL) // enable/disable OSD op tracking +OPTION(osd_num_op_tracker_shard, OPT_U32) // The number of shards for holding the ops +OPTION(osd_op_history_size, OPT_U32) // Max number of completed ops to track +OPTION(osd_op_history_duration, OPT_U32) // Oldest completed op to track +OPTION(osd_op_history_slow_op_size, OPT_U32) // Max number of slow ops to track +OPTION(osd_op_history_slow_op_threshold, OPT_DOUBLE) // track the op if over this threshold +OPTION(osd_target_transaction_size, OPT_INT) // to adjust various transactions that batch smaller items +OPTION(osd_failsafe_full_ratio, OPT_FLOAT) // what % full makes an OSD "full" (failsafe) +OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL) // immediately mark OSDs as down once they refuse to accept connections + +OPTION(osd_pg_object_context_cache_count, OPT_INT) +OPTION(osd_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled +OPTION(osd_function_tracing, OPT_BOOL) // true if function instrumentation should use LTTng + +OPTION(osd_fast_info, OPT_BOOL) // use fast info attr, if we can + +// determines whether PGLog::check() compares written out log to stored log +OPTION(osd_debug_pg_log_writeout, OPT_BOOL) +OPTION(osd_loop_before_reset_tphandle, OPT_U32) // Max number of loop before we reset thread-pool's handle +// default timeout 
while caling WaitInterval on an empty queue +OPTION(threadpool_default_timeout, OPT_INT) +// default wait time for an empty queue before pinging the hb timeout +OPTION(threadpool_empty_queue_max_wait, OPT_INT) + +OPTION(leveldb_log_to_ceph_log, OPT_BOOL) +OPTION(leveldb_write_buffer_size, OPT_U64) // leveldb write buffer size +OPTION(leveldb_cache_size, OPT_U64) // leveldb cache size +OPTION(leveldb_block_size, OPT_U64) // leveldb block size +OPTION(leveldb_bloom_size, OPT_INT) // leveldb bloom bits per entry +OPTION(leveldb_max_open_files, OPT_INT) // leveldb max open files +OPTION(leveldb_compression, OPT_BOOL) // leveldb uses compression +OPTION(leveldb_paranoid, OPT_BOOL) // leveldb paranoid flag +OPTION(leveldb_log, OPT_STR) // enable leveldb log file +OPTION(leveldb_compact_on_mount, OPT_BOOL) + +OPTION(kinetic_host, OPT_STR) // hostname or ip address of a kinetic drive to use +OPTION(kinetic_port, OPT_INT) // port number of the kinetic drive +OPTION(kinetic_user_id, OPT_INT) // kinetic user to authenticate as +OPTION(kinetic_hmac_key, OPT_STR) // kinetic key to authenticate with +OPTION(kinetic_use_ssl, OPT_BOOL) // whether to secure kinetic traffic with TLS + + +OPTION(rocksdb_separate_wal_dir, OPT_BOOL) // use $path.wal for wal +SAFE_OPTION(rocksdb_db_paths, OPT_STR) // path,size( path,size)* +OPTION(rocksdb_log_to_ceph_log, OPT_BOOL) // log to ceph log +OPTION(rocksdb_cache_size, OPT_U64) // rocksdb cache size (unless set by bluestore/etc) +OPTION(rocksdb_cache_row_ratio, OPT_FLOAT) // ratio of cache for row (vs block) +OPTION(rocksdb_cache_shard_bits, OPT_INT) // rocksdb block cache shard bits, 4 bit -> 16 shards +OPTION(rocksdb_cache_type, OPT_STR) // 'lru' or 'clock' +OPTION(rocksdb_block_size, OPT_INT) // default rocksdb block size +OPTION(rocksdb_perf, OPT_BOOL) // Enabling this will have 5-10% impact on performance for the stats collection +OPTION(rocksdb_collect_compaction_stats, OPT_BOOL) //For rocksdb, this behavior will be an overhead of 5%~10%, collected only rocksdb_perf is enabled. +OPTION(rocksdb_collect_extended_stats, OPT_BOOL) //For rocksdb, this behavior will be an overhead of 5%~10%, collected only rocksdb_perf is enabled. +OPTION(rocksdb_collect_memory_stats, OPT_BOOL) //For rocksdb, this behavior will be an overhead of 5%~10%, collected only rocksdb_perf is enabled. +OPTION(rocksdb_enable_rmrange, OPT_BOOL) // see https://github.com/facebook/rocksdb/blob/master/include/rocksdb/db.h#L253 + +// rocksdb options that will be used for omap(if omap_backend is rocksdb) +OPTION(filestore_rocksdb_options, OPT_STR) +// rocksdb options that will be used in monstore +OPTION(mon_rocksdb_options, OPT_STR) + +/** + * osd_*_priority adjust the relative priority of client io, recovery io, + * snaptrim io, etc + * + * osd_*_priority determines the ratio of available io between client and + * recovery. Each option may be set between + * 1..63. 
+ */ +OPTION(osd_client_op_priority, OPT_U32) +OPTION(osd_recovery_op_priority, OPT_U32) + +OPTION(osd_snap_trim_priority, OPT_U32) +OPTION(osd_snap_trim_cost, OPT_U32) // set default cost equal to 1MB io + +OPTION(osd_scrub_priority, OPT_U32) +// set default cost equal to 50MB io +OPTION(osd_scrub_cost, OPT_U32) +// set requested scrub priority higher than scrub priority to make the +// requested scrubs jump the queue of scheduled scrubs +OPTION(osd_requested_scrub_priority, OPT_U32) + +OPTION(osd_recovery_priority, OPT_U32) +// set default cost equal to 20MB io +OPTION(osd_recovery_cost, OPT_U32) + +/** + * osd_recovery_op_warn_multiple scales the normal warning threshhold, + * osd_op_complaint_time, so that slow recovery ops won't cause noise + */ +OPTION(osd_recovery_op_warn_multiple, OPT_U32) + +// Max time to wait between notifying mon of shutdown and shutting down +OPTION(osd_mon_shutdown_timeout, OPT_DOUBLE) +OPTION(osd_shutdown_pgref_assert, OPT_BOOL) // crash if the OSD has stray PG refs on shutdown + +OPTION(osd_max_object_size, OPT_U64) // OSD's maximum object size +OPTION(osd_max_object_name_len, OPT_U32) // max rados object name len +OPTION(osd_max_object_namespace_len, OPT_U32) // max rados object namespace len +OPTION(osd_max_attr_name_len, OPT_U32) // max rados attr name len; cannot go higher than 100 chars for file system backends +OPTION(osd_max_attr_size, OPT_U64) + +OPTION(osd_max_omap_entries_per_request, OPT_U64) +OPTION(osd_max_omap_bytes_per_request, OPT_U64) + +OPTION(osd_objectstore, OPT_STR) // ObjectStore backend type +OPTION(osd_objectstore_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled +OPTION(osd_objectstore_fuse, OPT_BOOL) + +OPTION(osd_bench_small_size_max_iops, OPT_U32) // 100 IOPS +OPTION(osd_bench_large_size_max_throughput, OPT_U64) // 100 MB/s +OPTION(osd_bench_max_block_size, OPT_U64) // cap the block size at 64MB +OPTION(osd_bench_duration, OPT_U32) // duration of 'osd bench', capped at 30s to avoid triggering timeouts + +OPTION(osd_blkin_trace_all, OPT_BOOL) // create a blkin trace for all osd requests +OPTION(osdc_blkin_trace_all, OPT_BOOL) // create a blkin trace for all objecter requests + +OPTION(osd_discard_disconnected_ops, OPT_BOOL) + +OPTION(memstore_device_bytes, OPT_U64) +OPTION(memstore_page_set, OPT_BOOL) +OPTION(memstore_page_size, OPT_U64) + +OPTION(bdev_debug_inflight_ios, OPT_BOOL) +OPTION(bdev_inject_crash, OPT_INT) // if N>0, then ~ 1/N IOs will complete before we crash on flush. +OPTION(bdev_inject_crash_flush_delay, OPT_INT) // wait N more seconds on flush +OPTION(bdev_aio, OPT_BOOL) +OPTION(bdev_aio_poll_ms, OPT_INT) // milliseconds +OPTION(bdev_aio_max_queue_depth, OPT_INT) +OPTION(bdev_aio_reap_max, OPT_INT) +OPTION(bdev_block_size, OPT_INT) +OPTION(bdev_debug_aio, OPT_BOOL) +OPTION(bdev_debug_aio_suicide_timeout, OPT_FLOAT) + +// if yes, osd will unbind all NVMe devices from kernel driver and bind them +// to the uio_pci_generic driver. The purpose is to prevent the case where +// NVMe driver is loaded while osd is running. 
+OPTION(bdev_nvme_unbind_from_kernel, OPT_BOOL) +OPTION(bdev_nvme_retry_count, OPT_INT) // -1 means by default which is 4 + +OPTION(objectstore_blackhole, OPT_BOOL) + +OPTION(bluefs_alloc_size, OPT_U64) +OPTION(bluefs_max_prefetch, OPT_U64) +OPTION(bluefs_min_log_runway, OPT_U64) // alloc when we get this low +OPTION(bluefs_max_log_runway, OPT_U64) // alloc this much at a time +OPTION(bluefs_log_compact_min_ratio, OPT_FLOAT) // before we consider +OPTION(bluefs_log_compact_min_size, OPT_U64) // before we consider +OPTION(bluefs_min_flush_size, OPT_U64) // ignore flush until its this big +OPTION(bluefs_compact_log_sync, OPT_BOOL) // sync or async log compaction? +OPTION(bluefs_buffered_io, OPT_BOOL) +OPTION(bluefs_sync_write, OPT_BOOL) +OPTION(bluefs_allocator, OPT_STR) // stupid | bitmap +OPTION(bluefs_preextend_wal_files, OPT_BOOL) // this *requires* that rocksdb has recycling enabled + +OPTION(bluestore_bluefs, OPT_BOOL) +OPTION(bluestore_bluefs_env_mirror, OPT_BOOL) // mirror to normal Env for debug +OPTION(bluestore_bluefs_min, OPT_U64) // 1gb +OPTION(bluestore_bluefs_min_ratio, OPT_FLOAT) // min fs free / total free +OPTION(bluestore_bluefs_max_ratio, OPT_FLOAT) // max fs free / total free +OPTION(bluestore_bluefs_gift_ratio, OPT_FLOAT) // how much to add at a time +OPTION(bluestore_bluefs_reclaim_ratio, OPT_FLOAT) // how much to reclaim at a time +OPTION(bluestore_bluefs_balance_interval, OPT_FLOAT) // how often (sec) to balance free space between bluefs and bluestore +// If you want to use spdk driver, you need to specify NVMe serial number here +// with "spdk:" prefix. +// Users can use 'lspci -vvv -d 8086:0953 | grep "Device Serial Number"' to +// get the serial number of Intel(R) Fultondale NVMe controllers. +// Example: +// bluestore_block_path = spdk:55cd2e404bd73932 +// If you want to run multiple SPDK instances per node, you must specify the +// amount of dpdk memory size in MB each instance will use, to make sure each +// instance uses its own dpdk memory +OPTION(bluestore_spdk_mem, OPT_U32) +// A hexadecimal bit mask of the cores to run on. Note the core numbering can change between platforms and should be determined beforehand. +OPTION(bluestore_spdk_coremask, OPT_STR) +// Specify the maximal I/Os to be batched completed while checking queue pair completions. +// Default value 0 means that let SPDK nvme library determine the value. +OPTION(bluestore_spdk_max_io_completion, OPT_U32) +OPTION(bluestore_block_path, OPT_STR) +OPTION(bluestore_block_size, OPT_U64) // 10gb for testing +OPTION(bluestore_block_create, OPT_BOOL) +OPTION(bluestore_block_db_path, OPT_STR) +OPTION(bluestore_block_db_size, OPT_U64) // rocksdb ssts (hot/warm) +OPTION(bluestore_block_db_create, OPT_BOOL) +OPTION(bluestore_block_wal_path, OPT_STR) +OPTION(bluestore_block_wal_size, OPT_U64) // rocksdb wal +OPTION(bluestore_block_wal_create, OPT_BOOL) +OPTION(bluestore_block_preallocate_file, OPT_BOOL) //whether preallocate space if block/db_path/wal_path is file rather that block device. 
+OPTION(bluestore_csum_type, OPT_STR) // none|xxhash32|xxhash64|crc32c|crc32c_16|crc32c_8 +OPTION(bluestore_csum_min_block, OPT_U32) +OPTION(bluestore_csum_max_block, OPT_U32) +OPTION(bluestore_min_alloc_size, OPT_U32) +OPTION(bluestore_min_alloc_size_hdd, OPT_U32) +OPTION(bluestore_min_alloc_size_ssd, OPT_U32) +OPTION(bluestore_max_alloc_size, OPT_U32) +OPTION(bluestore_prefer_deferred_size, OPT_U32) +OPTION(bluestore_prefer_deferred_size_hdd, OPT_U32) +OPTION(bluestore_prefer_deferred_size_ssd, OPT_U32) +OPTION(bluestore_compression_mode, OPT_STR) // force|aggressive|passive|none +OPTION(bluestore_compression_algorithm, OPT_STR) +OPTION(bluestore_compression_min_blob_size, OPT_U32) +OPTION(bluestore_compression_min_blob_size_hdd, OPT_U32) +OPTION(bluestore_compression_min_blob_size_ssd, OPT_U32) +OPTION(bluestore_compression_max_blob_size, OPT_U32) +OPTION(bluestore_compression_max_blob_size_hdd, OPT_U32) +OPTION(bluestore_compression_max_blob_size_ssd, OPT_U32) +/* + * Specifies minimum expected amount of saved allocation units + * per single blob to enable compressed blobs garbage collection + * + */ +OPTION(bluestore_gc_enable_blob_threshold, OPT_INT) +/* + * Specifies minimum expected amount of saved allocation units + * per all blobsb to enable compressed blobs garbage collection + * + */ +OPTION(bluestore_gc_enable_total_threshold, OPT_INT) + +OPTION(bluestore_max_blob_size, OPT_U32) +OPTION(bluestore_max_blob_size_hdd, OPT_U32) +OPTION(bluestore_max_blob_size_ssd, OPT_U32) +/* + * Require the net gain of compression at least to be at this ratio, + * otherwise we don't compress. + * And ask for compressing at least 12.5%(1/8) off, by default. + */ +OPTION(bluestore_compression_required_ratio, OPT_DOUBLE) +OPTION(bluestore_extent_map_shard_max_size, OPT_U32) +OPTION(bluestore_extent_map_shard_target_size, OPT_U32) +OPTION(bluestore_extent_map_shard_min_size, OPT_U32) +OPTION(bluestore_extent_map_shard_target_size_slop, OPT_DOUBLE) +OPTION(bluestore_extent_map_inline_shard_prealloc_size, OPT_U32) +OPTION(bluestore_cache_trim_interval, OPT_DOUBLE) +OPTION(bluestore_cache_trim_max_skip_pinned, OPT_U32) // skip this many onodes pinned in cache before we give up +OPTION(bluestore_cache_type, OPT_STR) // lru, 2q +OPTION(bluestore_2q_cache_kin_ratio, OPT_DOUBLE) // kin page slot size / max page slot size +OPTION(bluestore_2q_cache_kout_ratio, OPT_DOUBLE) // number of kout page slot / total number of page slot +OPTION(bluestore_cache_size, OPT_U64) +OPTION(bluestore_cache_size_hdd, OPT_U64) +OPTION(bluestore_cache_size_ssd, OPT_U64) +OPTION(bluestore_cache_meta_ratio, OPT_DOUBLE) +OPTION(bluestore_cache_kv_ratio, OPT_DOUBLE) +OPTION(bluestore_cache_kv_max, OPT_U64) // limit the maximum amount of cache for the kv store +OPTION(bluestore_kvbackend, OPT_STR) +OPTION(bluestore_allocator, OPT_STR) // stupid | bitmap +OPTION(bluestore_freelist_blocks_per_key, OPT_INT) +OPTION(bluestore_bitmapallocator_blocks_per_zone, OPT_INT) // must be power of 2 aligned, e.g., 512, 1024, 2048... +OPTION(bluestore_bitmapallocator_span_size, OPT_INT) // must be power of 2 aligned, e.g., 512, 1024, 2048... 
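One reading of bluestore_compression_required_ratio above: a compressed blob is kept only when it is no larger than ratio times the raw length, so a ratio of 0.875, implied by the "12.5% (1/8) off" wording, requires at least a one-eighth size reduction. The acceptance test below is an illustrative sketch under that assumption, not BlueStore's code path, and the default value is inferred from the comment rather than quoted from the source.

    #include <cstdint>
    #include <cstdio>

    // Keep the compressed blob only if compressed_len <= required_ratio * raw_len.
    bool compression_worth_keeping(uint64_t raw_len,
                                   uint64_t compressed_len,
                                   double required_ratio = 0.875) {  // default assumed
      return static_cast<double>(compressed_len) <=
             static_cast<double>(raw_len) * required_ratio;
    }

    int main() {
      // 64K -> 60K saves only 6.25%, below the 12.5% bar, so it is rejected.
      std::printf("64K -> 60K kept? %d\n", compression_worth_keeping(65536, 61440) ? 1 : 0);
      // 64K -> 32K saves 50%, so it is kept.
      std::printf("64K -> 32K kept? %d\n", compression_worth_keeping(65536, 32768) ? 1 : 0);
      return 0;
    }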
+OPTION(bluestore_max_deferred_txc, OPT_U64) +OPTION(bluestore_rocksdb_options, OPT_STR) +OPTION(bluestore_fsck_on_mount, OPT_BOOL) +OPTION(bluestore_fsck_on_mount_deep, OPT_BOOL) +OPTION(bluestore_fsck_on_umount, OPT_BOOL) +OPTION(bluestore_fsck_on_umount_deep, OPT_BOOL) +OPTION(bluestore_fsck_on_mkfs, OPT_BOOL) +OPTION(bluestore_fsck_on_mkfs_deep, OPT_BOOL) +OPTION(bluestore_sync_submit_transaction, OPT_BOOL) // submit kv txn in queueing thread (not kv_sync_thread) +OPTION(bluestore_throttle_bytes, OPT_U64) +OPTION(bluestore_throttle_deferred_bytes, OPT_U64) +OPTION(bluestore_throttle_cost_per_io_hdd, OPT_U64) +OPTION(bluestore_throttle_cost_per_io_ssd, OPT_U64) +OPTION(bluestore_throttle_cost_per_io, OPT_U64) +OPTION(bluestore_deferred_batch_ops, OPT_U64) +OPTION(bluestore_deferred_batch_ops_hdd, OPT_U64) +OPTION(bluestore_deferred_batch_ops_ssd, OPT_U64) +OPTION(bluestore_nid_prealloc, OPT_INT) +OPTION(bluestore_blobid_prealloc, OPT_U64) +OPTION(bluestore_clone_cow, OPT_BOOL) // do copy-on-write for clones +OPTION(bluestore_default_buffered_read, OPT_BOOL) +OPTION(bluestore_default_buffered_write, OPT_BOOL) +OPTION(bluestore_debug_misc, OPT_BOOL) +OPTION(bluestore_debug_no_reuse_blocks, OPT_BOOL) +OPTION(bluestore_debug_small_allocations, OPT_INT) +OPTION(bluestore_debug_freelist, OPT_BOOL) +OPTION(bluestore_debug_prefill, OPT_FLOAT) +OPTION(bluestore_debug_prefragment_max, OPT_INT) +OPTION(bluestore_debug_inject_read_err, OPT_BOOL) +OPTION(bluestore_debug_randomize_serial_transaction, OPT_INT) +OPTION(bluestore_debug_omit_block_device_write, OPT_BOOL) +OPTION(bluestore_debug_fsck_abort, OPT_BOOL) +OPTION(bluestore_debug_omit_kv_commit, OPT_BOOL) +OPTION(bluestore_debug_permit_any_bdev_label, OPT_BOOL) +OPTION(bluestore_shard_finishers, OPT_BOOL) +OPTION(bluestore_debug_random_read_err, OPT_DOUBLE) + +OPTION(kstore_max_ops, OPT_U64) +OPTION(kstore_max_bytes, OPT_U64) +OPTION(kstore_backend, OPT_STR) +OPTION(kstore_rocksdb_options, OPT_STR) +OPTION(kstore_fsck_on_mount, OPT_BOOL) +OPTION(kstore_fsck_on_mount_deep, OPT_BOOL) +OPTION(kstore_nid_prealloc, OPT_U64) +OPTION(kstore_sync_transaction, OPT_BOOL) +OPTION(kstore_sync_submit_transaction, OPT_BOOL) +OPTION(kstore_onode_map_size, OPT_U64) +OPTION(kstore_default_stripe_size, OPT_INT) + +OPTION(filestore_omap_backend, OPT_STR) +OPTION(filestore_omap_backend_path, OPT_STR) + +/// filestore wb throttle limits +OPTION(filestore_wbthrottle_enable, OPT_BOOL) +OPTION(filestore_wbthrottle_btrfs_bytes_start_flusher, OPT_U64) +OPTION(filestore_wbthrottle_btrfs_bytes_hard_limit, OPT_U64) +OPTION(filestore_wbthrottle_btrfs_ios_start_flusher, OPT_U64) +OPTION(filestore_wbthrottle_btrfs_ios_hard_limit, OPT_U64) +OPTION(filestore_wbthrottle_btrfs_inodes_start_flusher, OPT_U64) +OPTION(filestore_wbthrottle_xfs_bytes_start_flusher, OPT_U64) +OPTION(filestore_wbthrottle_xfs_bytes_hard_limit, OPT_U64) +OPTION(filestore_wbthrottle_xfs_ios_start_flusher, OPT_U64) +OPTION(filestore_wbthrottle_xfs_ios_hard_limit, OPT_U64) +OPTION(filestore_wbthrottle_xfs_inodes_start_flusher, OPT_U64) + +/// These must be less than the fd limit +OPTION(filestore_wbthrottle_btrfs_inodes_hard_limit, OPT_U64) +OPTION(filestore_wbthrottle_xfs_inodes_hard_limit, OPT_U64) + +//Introduce a O_DSYNC write in the filestore +OPTION(filestore_odsync_write, OPT_BOOL) + +// Tests index failure paths +OPTION(filestore_index_retry_probability, OPT_DOUBLE) + +// Allow object read error injection +OPTION(filestore_debug_inject_read_err, OPT_BOOL) +OPTION(filestore_debug_random_read_err, 
OPT_DOUBLE) + +OPTION(filestore_debug_omap_check, OPT_BOOL) // Expensive debugging check on sync +OPTION(filestore_omap_header_cache_size, OPT_INT) + +// Use omap for xattrs for attrs over +// filestore_max_inline_xattr_size or +OPTION(filestore_max_inline_xattr_size, OPT_U32) //Override +OPTION(filestore_max_inline_xattr_size_xfs, OPT_U32) +OPTION(filestore_max_inline_xattr_size_btrfs, OPT_U32) +OPTION(filestore_max_inline_xattr_size_other, OPT_U32) + +// for more than filestore_max_inline_xattrs attrs +OPTION(filestore_max_inline_xattrs, OPT_U32) //Override +OPTION(filestore_max_inline_xattrs_xfs, OPT_U32) +OPTION(filestore_max_inline_xattrs_btrfs, OPT_U32) +OPTION(filestore_max_inline_xattrs_other, OPT_U32) + +// max xattr value size +OPTION(filestore_max_xattr_value_size, OPT_U32) //Override +OPTION(filestore_max_xattr_value_size_xfs, OPT_U32) +OPTION(filestore_max_xattr_value_size_btrfs, OPT_U32) +// ext4 allows 4k xattrs total including some smallish extra fields and the +// keys. We're allowing 2 512 inline attrs in addition some some filestore +// replay attrs. After accounting for those, we still need to fit up to +// two attrs of this value. That means we need this value to be around 1k +// to be safe. This is hacky, but it's not worth complicating the code +// to work around ext4's total xattr limit. +OPTION(filestore_max_xattr_value_size_other, OPT_U32) + +OPTION(filestore_sloppy_crc, OPT_BOOL) // track sloppy crcs +OPTION(filestore_sloppy_crc_block_size, OPT_INT) + +OPTION(filestore_max_alloc_hint_size, OPT_U64) // bytes + +OPTION(filestore_max_sync_interval, OPT_DOUBLE) // seconds +OPTION(filestore_min_sync_interval, OPT_DOUBLE) // seconds +OPTION(filestore_btrfs_snap, OPT_BOOL) +OPTION(filestore_btrfs_clone_range, OPT_BOOL) +OPTION(filestore_zfs_snap, OPT_BOOL) // zfsonlinux is still unstable +OPTION(filestore_fsync_flushes_journal_data, OPT_BOOL) +OPTION(filestore_fiemap, OPT_BOOL) // (try to) use fiemap +OPTION(filestore_punch_hole, OPT_BOOL) +OPTION(filestore_seek_data_hole, OPT_BOOL) // (try to) use seek_data/hole +OPTION(filestore_splice, OPT_BOOL) +OPTION(filestore_fadvise, OPT_BOOL) +//collect device partition information for management application to use +OPTION(filestore_collect_device_partition_information, OPT_BOOL) + +// (try to) use extsize for alloc hint NOTE: extsize seems to trigger +// data corruption in xfs prior to kernel 3.5. filestore will +// implicity disable this if it cannot confirm the kernel is newer +// than that. +// NOTE: This option involves a tradeoff: When disabled, fragmentation is +// worse, but large sequential writes are faster. When enabled, large +// sequential writes are slower, but fragmentation is reduced. +OPTION(filestore_xfs_extsize, OPT_BOOL) + +OPTION(filestore_journal_parallel, OPT_BOOL) +OPTION(filestore_journal_writeahead, OPT_BOOL) +OPTION(filestore_journal_trailing, OPT_BOOL) +OPTION(filestore_queue_max_ops, OPT_U64) +OPTION(filestore_queue_max_bytes, OPT_U64) + +OPTION(filestore_caller_concurrency, OPT_INT) + +/// Expected filestore throughput in B/s +OPTION(filestore_expected_throughput_bytes, OPT_DOUBLE) +/// Expected filestore throughput in ops/s +OPTION(filestore_expected_throughput_ops, OPT_DOUBLE) + +/// Filestore max delay multiple. Defaults to 0 (disabled) +OPTION(filestore_queue_max_delay_multiple, OPT_DOUBLE) +/// Filestore high delay multiple. 
Defaults to 0 (disabled)
+OPTION(filestore_queue_high_delay_multiple, OPT_DOUBLE)
+
+/// Use the above to inject delays intended to keep the op queue between low and high
+OPTION(filestore_queue_low_threshhold, OPT_DOUBLE)
+OPTION(filestore_queue_high_threshhold, OPT_DOUBLE)
+
+OPTION(filestore_op_threads, OPT_INT)
+OPTION(filestore_op_thread_timeout, OPT_INT)
+OPTION(filestore_op_thread_suicide_timeout, OPT_INT)
+OPTION(filestore_commit_timeout, OPT_FLOAT)
+OPTION(filestore_fiemap_threshold, OPT_INT)
+OPTION(filestore_merge_threshold, OPT_INT)
+OPTION(filestore_split_multiple, OPT_INT)
+OPTION(filestore_split_rand_factor, OPT_U32) // randomize the split threshold by adding 16 * [0, rand_factor)
+OPTION(filestore_update_to, OPT_INT)
+OPTION(filestore_blackhole, OPT_BOOL) // drop any new transactions on the floor
+OPTION(filestore_fd_cache_size, OPT_INT) // FD lru size
+OPTION(filestore_fd_cache_shards, OPT_INT) // FD number of shards
+OPTION(filestore_ondisk_finisher_threads, OPT_INT)
+OPTION(filestore_apply_finisher_threads, OPT_INT)
+OPTION(filestore_dump_file, OPT_STR) // file onto which store transaction dumps are written
+OPTION(filestore_kill_at, OPT_INT) // inject a failure at the n'th opportunity
+OPTION(filestore_inject_stall, OPT_INT) // artificially stall for N seconds in op queue thread
+OPTION(filestore_fail_eio, OPT_BOOL) // fail/crash on EIO
+OPTION(filestore_debug_verify_split, OPT_BOOL)
+OPTION(journal_dio, OPT_BOOL)
+OPTION(journal_aio, OPT_BOOL)
+OPTION(journal_force_aio, OPT_BOOL)
+OPTION(journal_block_size, OPT_INT)
+
+// max bytes to search ahead in journal searching for corruption
+OPTION(journal_max_corrupt_search, OPT_U64)
+OPTION(journal_block_align, OPT_BOOL)
+OPTION(journal_write_header_frequency, OPT_U64)
+OPTION(journal_max_write_bytes, OPT_INT)
+OPTION(journal_max_write_entries, OPT_INT)
+
+/// Target range for journal fullness
+OPTION(journal_throttle_low_threshhold, OPT_DOUBLE)
+OPTION(journal_throttle_high_threshhold, OPT_DOUBLE)
+
+/// Multiple over expected at high_threshhold. Defaults to 0 (disabled).
+OPTION(journal_throttle_high_multiple, OPT_DOUBLE)
+/// Multiple over expected at max. Defaults to 0 (disabled).
+OPTION(journal_throttle_max_multiple, OPT_DOUBLE)
+
+OPTION(journal_align_min_size, OPT_INT) // align data payloads >= this.
+OPTION(journal_replay_from, OPT_INT)
+OPTION(journal_zero_on_create, OPT_BOOL)
+OPTION(journal_ignore_corruption, OPT_BOOL) // assume journal is not corrupt
+OPTION(journal_discard, OPT_BOOL) // when using an SSD as the journal device, whether to issue discard for unused journal data
+
+OPTION(fio_dir, OPT_STR) // fio data directory for fio-objectstore
+
+OPTION(rados_mon_op_timeout, OPT_DOUBLE) // how many seconds to wait for a response from the monitor before returning an error from a rados operation. 0 means no limit.
+OPTION(rados_osd_op_timeout, OPT_DOUBLE) // how many seconds to wait for a response from osds before returning an error from a rados operation. 0 means no limit.
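The filestore_queue_low_threshhold/high_threshhold options, together with the expected-throughput and delay-multiple options above, describe injecting delays so the op queue stays between two watermarks. The sketch below shows one plausible linear ramp between those watermarks; injected_delay_s and the ramp shape are assumptions for illustration, not the actual FileStore throttle implementation.

// Illustrative delay ramp, not FileStore's real code: no delay below the
// low threshold, a delay capped at max_multiple times the expected per-op
// time at or above the high threshold, and a linear ramp in between.
#include <iostream>

static double injected_delay_s(double queue_fullness,   // current / max
                               double low, double high,
                               double expected_op_time_s,
                               double max_multiple)      // 0 disables
{
  if (max_multiple <= 0.0 || queue_fullness <= low)
    return 0.0;
  if (queue_fullness >= high)
    return expected_op_time_s * max_multiple;
  double t = (queue_fullness - low) / (high - low);
  return expected_op_time_s * max_multiple * t;
}

int main()
{
  std::cout << injected_delay_s(0.7, 0.3, 0.9, 0.01, 10.0) << " s" << std::endl;
  return 0;
}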
+OPTION(rados_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled + +OPTION(rbd_op_threads, OPT_INT) +OPTION(rbd_op_thread_timeout, OPT_INT) +OPTION(rbd_non_blocking_aio, OPT_BOOL) // process AIO ops from a worker thread to prevent blocking +OPTION(rbd_cache, OPT_BOOL) // whether to enable caching (writeback unless rbd_cache_max_dirty is 0) +OPTION(rbd_cache_writethrough_until_flush, OPT_BOOL) // whether to make writeback caching writethrough until flush is called, to be sure the user of librbd will send flushs so that writeback is safe +OPTION(rbd_cache_size, OPT_LONGLONG) // cache size in bytes +OPTION(rbd_cache_max_dirty, OPT_LONGLONG) // dirty limit in bytes - set to 0 for write-through caching +OPTION(rbd_cache_target_dirty, OPT_LONGLONG) // target dirty limit in bytes +OPTION(rbd_cache_max_dirty_age, OPT_FLOAT) // seconds in cache before writeback starts +OPTION(rbd_cache_max_dirty_object, OPT_INT) // dirty limit for objects - set to 0 for auto calculate from rbd_cache_size +OPTION(rbd_cache_block_writes_upfront, OPT_BOOL) // whether to block writes to the cache before the aio_write call completes (true)) +OPTION(rbd_concurrent_management_ops, OPT_INT) // how many operations can be in flight for a management operation like deleting or resizing an image +OPTION(rbd_balance_snap_reads, OPT_BOOL) +OPTION(rbd_localize_snap_reads, OPT_BOOL) +OPTION(rbd_balance_parent_reads, OPT_BOOL) +OPTION(rbd_localize_parent_reads, OPT_BOOL) +OPTION(rbd_readahead_trigger_requests, OPT_INT) // number of sequential requests necessary to trigger readahead +OPTION(rbd_readahead_max_bytes, OPT_LONGLONG) // set to 0 to disable readahead +OPTION(rbd_readahead_disable_after_bytes, OPT_LONGLONG) // how many bytes are read in total before readahead is disabled +OPTION(rbd_clone_copy_on_read, OPT_BOOL) +OPTION(rbd_blacklist_on_break_lock, OPT_BOOL) // whether to blacklist clients whose lock was broken +OPTION(rbd_blacklist_expire_seconds, OPT_INT) // number of seconds to blacklist - set to 0 for OSD default +OPTION(rbd_request_timed_out_seconds, OPT_INT) // number of seconds before maint request times out +OPTION(rbd_skip_partial_discard, OPT_BOOL) // when trying to discard a range inside an object, set to true to skip zeroing the range. +OPTION(rbd_enable_alloc_hint, OPT_BOOL) // when writing a object, it will issue a hint to osd backend to indicate the expected size object need +OPTION(rbd_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled +OPTION(rbd_blkin_trace_all, OPT_BOOL) // create a blkin trace for all RBD requests +OPTION(rbd_validate_pool, OPT_BOOL) // true if empty pools should be validated for RBD compatibility +OPTION(rbd_validate_names, OPT_BOOL) // true if image specs should be validated +OPTION(rbd_auto_exclusive_lock_until_manual_request, OPT_BOOL) // whether to automatically acquire/release exclusive lock until it is explicitly requested, i.e. before we know the user of librbd is properly using the lock API +OPTION(rbd_mirroring_resync_after_disconnect, OPT_BOOL) // automatically start image resync after mirroring is disconnected due to being laggy +OPTION(rbd_mirroring_replay_delay, OPT_INT) // time-delay in seconds for rbd-mirror asynchronous replication + +OPTION(rbd_default_pool, OPT_STR) // default pool for storing images + +/* + * The following options change the behavior for librbd's image creation methods that + * don't require all of the parameters. 
These are provided so that older programs + * can take advantage of newer features without being rewritten to use new versions + * of the image creation functions. + * + * rbd_create()/RBD::create() are affected by all of these options. + * + * rbd_create2()/RBD::create2() and rbd_clone()/RBD::clone() are affected by: + * - rbd_default_order + * - rbd_default_stripe_count + * - rbd_default_stripe_size + * + * rbd_create3()/RBD::create3() and rbd_clone2/RBD::clone2() are only + * affected by rbd_default_order. + */ +OPTION(rbd_default_format, OPT_INT) +OPTION(rbd_default_order, OPT_INT) +OPTION(rbd_default_stripe_count, OPT_U64) // changing requires stripingv2 feature +OPTION(rbd_default_stripe_unit, OPT_U64) // changing to non-object size requires stripingv2 feature +OPTION(rbd_default_data_pool, OPT_STR) // optional default pool for storing image data blocks + +/** + * RBD features are only applicable for v2 images. This setting accepts either + * an integer bitmask value or comma-delimited string of RBD feature names. + * This setting is always internally stored as an integer bitmask value. The + * mapping between feature bitmask value and feature name is as follows: + * + * +1 -> layering + * +2 -> striping + * +4 -> exclusive-lock + * +8 -> object-map + * +16 -> fast-diff + * +32 -> deep-flatten + * +64 -> journaling + * +128 -> data-pool + */ +SAFE_OPTION(rbd_default_features, OPT_STR) + +OPTION(rbd_default_map_options, OPT_STR) // default rbd map -o / --options + +/** + * RBD journal options. + */ +OPTION(rbd_journal_order, OPT_U32) // bits to shift to compute journal object max size, between 12 and 64 +OPTION(rbd_journal_splay_width, OPT_U32) // number of active journal objects +OPTION(rbd_journal_commit_age, OPT_DOUBLE) // commit time interval, seconds +OPTION(rbd_journal_object_flush_interval, OPT_INT) // maximum number of pending commits per journal object +OPTION(rbd_journal_object_flush_bytes, OPT_INT) // maximum number of pending bytes per journal object +OPTION(rbd_journal_object_flush_age, OPT_DOUBLE) // maximum age (in seconds) for pending commits +OPTION(rbd_journal_pool, OPT_STR) // pool for journal objects +OPTION(rbd_journal_max_payload_bytes, OPT_U32) // maximum journal payload size before splitting +OPTION(rbd_journal_max_concurrent_object_sets, OPT_INT) // maximum number of object sets a journal client can be behind before it is automatically unregistered + +/** + * RBD Mirror options + */ +OPTION(rbd_mirror_journal_commit_age, OPT_DOUBLE) // commit time interval, seconds +OPTION(rbd_mirror_journal_poll_age, OPT_DOUBLE) // maximum age (in seconds) between successive journal polls +OPTION(rbd_mirror_journal_max_fetch_bytes, OPT_U32) // maximum bytes to read from each journal data object per fetch +OPTION(rbd_mirror_sync_point_update_age, OPT_DOUBLE) // number of seconds between each update of the image sync point object number +OPTION(rbd_mirror_concurrent_image_syncs, OPT_U32) // maximum number of image syncs in parallel +OPTION(rbd_mirror_pool_replayers_refresh_interval, OPT_INT) // interval to refresh peers in rbd-mirror daemon +OPTION(rbd_mirror_delete_retry_interval, OPT_DOUBLE) // interval to check and retry the failed requests in deleter +OPTION(rbd_mirror_image_state_check_interval, OPT_INT) // interval to get images from pool watcher and set sources in replayer +OPTION(rbd_mirror_leader_heartbeat_interval, OPT_INT) // interval (in seconds) between mirror leader heartbeats +OPTION(rbd_mirror_leader_max_missed_heartbeats, OPT_INT) // number of missed heartbeats 
for non-lock owner to attempt to acquire lock +OPTION(rbd_mirror_leader_max_acquire_attempts_before_break, OPT_INT) // number of failed attempts to acquire lock after missing heartbeats before breaking lock + +OPTION(nss_db_path, OPT_STR) // path to nss db + + +OPTION(rgw_max_chunk_size, OPT_INT) +OPTION(rgw_put_obj_min_window_size, OPT_INT) +OPTION(rgw_put_obj_max_window_size, OPT_INT) +OPTION(rgw_max_put_size, OPT_U64) +OPTION(rgw_max_put_param_size, OPT_U64) // max input size for PUT requests accepting json/xml params + +/** + * override max bucket index shards in zone configuration (if not zero) + * + * Represents the number of shards for the bucket index object, a value of zero + * indicates there is no sharding. By default (no sharding, the name of the object + * is '.dir.{marker}', with sharding, the name is '.dir.{markder}.{sharding_id}', + * sharding_id is zero-based value. It is not recommended to set a too large value + * (e.g. thousand) as it increases the cost for bucket listing. + */ +OPTION(rgw_override_bucket_index_max_shards, OPT_U32) + +/** + * Represents the maximum AIO pending requests for the bucket index object shards. + */ +OPTION(rgw_bucket_index_max_aio, OPT_U32) + +/** + * whether or not the quota/gc threads should be started + */ +OPTION(rgw_enable_quota_threads, OPT_BOOL) +OPTION(rgw_enable_gc_threads, OPT_BOOL) +OPTION(rgw_enable_lc_threads, OPT_BOOL) + + +OPTION(rgw_data, OPT_STR) +OPTION(rgw_enable_apis, OPT_STR) +OPTION(rgw_cache_enabled, OPT_BOOL) // rgw cache enabled +OPTION(rgw_cache_lru_size, OPT_INT) // num of entries in rgw cache +OPTION(rgw_socket_path, OPT_STR) // path to unix domain socket, if not specified, rgw will not run as external fcgi +OPTION(rgw_host, OPT_STR) // host for radosgw, can be an IP, default is 0.0.0.0 +OPTION(rgw_port, OPT_STR) // port to listen, format as "8080" "5000", if not specified, rgw will not run external fcgi +OPTION(rgw_dns_name, OPT_STR) // hostname suffix on buckets +OPTION(rgw_dns_s3website_name, OPT_STR) // hostname suffix on buckets for s3-website endpoint +OPTION(rgw_content_length_compat, OPT_BOOL) // Check both HTTP_CONTENT_LENGTH and CONTENT_LENGTH in fcgi env +OPTION(rgw_lifecycle_work_time, OPT_STR) //job process lc at 00:00-06:00s +OPTION(rgw_lc_lock_max_time, OPT_INT) // total run time for a single lc processor work +OPTION(rgw_lc_max_objs, OPT_INT) +OPTION(rgw_lc_debug_interval, OPT_INT) // Debug run interval, in seconds +OPTION(rgw_script_uri, OPT_STR) // alternative value for SCRIPT_URI if not set in request +OPTION(rgw_request_uri, OPT_STR) // alternative value for REQUEST_URI if not set in request +OPTION(rgw_swift_url, OPT_STR) // the swift url, being published by the internal swift auth +OPTION(rgw_swift_url_prefix, OPT_STR) // entry point for which a url is considered a swift url +OPTION(rgw_swift_auth_url, OPT_STR) // default URL to go and verify tokens for v1 auth (if not using internal swift auth) +OPTION(rgw_swift_auth_entry, OPT_STR) // entry point for which a url is considered a swift auth url +OPTION(rgw_swift_tenant_name, OPT_STR) // tenant name to use for swift access +OPTION(rgw_swift_account_in_url, OPT_BOOL) // assume that URL always contain the account (aka tenant) part +OPTION(rgw_swift_enforce_content_length, OPT_BOOL) // enforce generation of Content-Length even in cost of performance or scalability +OPTION(rgw_keystone_url, OPT_STR) // url for keystone server +OPTION(rgw_keystone_admin_token, OPT_STR) // keystone admin token (shared secret) +OPTION(rgw_keystone_admin_user, OPT_STR) 
// keystone admin user name +OPTION(rgw_keystone_admin_password, OPT_STR) // keystone admin user password +OPTION(rgw_keystone_admin_tenant, OPT_STR) // keystone admin user tenant (for keystone v2.0) +OPTION(rgw_keystone_admin_project, OPT_STR) // keystone admin user project (for keystone v3) +OPTION(rgw_keystone_admin_domain, OPT_STR) // keystone admin user domain +OPTION(rgw_keystone_barbican_user, OPT_STR) // keystone user to access barbican secrets +OPTION(rgw_keystone_barbican_password, OPT_STR) // keystone password for barbican user +OPTION(rgw_keystone_barbican_tenant, OPT_STR) // keystone barbican user tenant (for keystone v2.0) +OPTION(rgw_keystone_barbican_project, OPT_STR) // keystone barbican user project (for keystone v3) +OPTION(rgw_keystone_barbican_domain, OPT_STR) // keystone barbican user domain +OPTION(rgw_keystone_api_version, OPT_INT) // Version of Keystone API to use (2 or 3) +OPTION(rgw_keystone_accepted_roles, OPT_STR) // roles required to serve requests +OPTION(rgw_keystone_accepted_admin_roles, OPT_STR) // list of roles allowing an user to gain admin privileges +OPTION(rgw_keystone_token_cache_size, OPT_INT) // max number of entries in keystone token cache +OPTION(rgw_keystone_revocation_interval, OPT_INT) // seconds between tokens revocation check +OPTION(rgw_keystone_verify_ssl, OPT_BOOL) // should we try to verify keystone's ssl +OPTION(rgw_keystone_implicit_tenants, OPT_BOOL) // create new users in their own tenants of the same name +OPTION(rgw_cross_domain_policy, OPT_STR) +OPTION(rgw_healthcheck_disabling_path, OPT_STR) // path that existence causes the healthcheck to respond 503 +OPTION(rgw_s3_auth_use_rados, OPT_BOOL) // should we try to use the internal credentials for s3? +OPTION(rgw_s3_auth_use_keystone, OPT_BOOL) // should we try to use keystone for s3? +OPTION(rgw_s3_auth_aws4_force_boto2_compat, OPT_BOOL) // force aws4 auth boto2 compatibility +OPTION(rgw_barbican_url, OPT_STR) // url for barbican server + +/* OpenLDAP-style LDAP parameter strings */ +/* rgw_ldap_uri space-separated list of LDAP servers in URI format */ +OPTION(rgw_ldap_uri, OPT_STR) +/* rgw_ldap_binddn LDAP entry RGW will bind with (user match) */ +OPTION(rgw_ldap_binddn, OPT_STR) +/* rgw_ldap_searchdn LDAP search base (basedn) */ +OPTION(rgw_ldap_searchdn, OPT_STR) +/* rgw_ldap_dnattr LDAP attribute containing RGW user names (to form binddns)*/ +OPTION(rgw_ldap_dnattr, OPT_STR) +/* rgw_ldap_secret file containing credentials for rgw_ldap_binddn */ +OPTION(rgw_ldap_secret, OPT_STR) +/* rgw_s3_auth_use_ldap use LDAP for RGW auth? */ +OPTION(rgw_s3_auth_use_ldap, OPT_BOOL) +/* rgw_ldap_searchfilter LDAP search filter */ +OPTION(rgw_ldap_searchfilter, OPT_STR) + +OPTION(rgw_admin_entry, OPT_STR) // entry point for which a url is considered an admin request +OPTION(rgw_enforce_swift_acls, OPT_BOOL) +OPTION(rgw_swift_token_expiration, OPT_INT) // time in seconds for swift token expiration +OPTION(rgw_print_continue, OPT_BOOL) // enable if 100-Continue works +OPTION(rgw_print_prohibited_content_length, OPT_BOOL) // violate RFC 7230 and send Content-Length in 204 and 304 +OPTION(rgw_remote_addr_param, OPT_STR) // e.g. 
X-Forwarded-For, if you have a reverse proxy +OPTION(rgw_op_thread_timeout, OPT_INT) +OPTION(rgw_op_thread_suicide_timeout, OPT_INT) +OPTION(rgw_thread_pool_size, OPT_INT) +OPTION(rgw_num_control_oids, OPT_INT) +OPTION(rgw_num_rados_handles, OPT_U32) +OPTION(rgw_verify_ssl, OPT_BOOL) // should http_client try to verify ssl when sent https request + +/* The following are tunables for caches of RGW NFS (and other file + * client) objects. + * + * The file handle cache is a partitioned hash table + * (fhcache_partitions), each with a closed hash part and backing + * b-tree mapping. The number of partions is expected to be a small + * prime, the cache size something larger but less than 5K, the total + * size of the cache is n_part * cache_size. + */ +OPTION(rgw_nfs_lru_lanes, OPT_INT) +OPTION(rgw_nfs_lru_lane_hiwat, OPT_INT) +OPTION(rgw_nfs_fhcache_partitions, OPT_INT) +OPTION(rgw_nfs_fhcache_size, OPT_INT) /* 3*2017=6051 */ +OPTION(rgw_nfs_namespace_expire_secs, OPT_INT) /* namespace invalidate + * timer */ +OPTION(rgw_nfs_max_gc, OPT_INT) /* max gc events per cycle */ +OPTION(rgw_nfs_write_completion_interval_s, OPT_INT) /* stateless (V3) + * commit + * delay */ + +OPTION(rgw_zone, OPT_STR) // zone name +OPTION(rgw_zone_root_pool, OPT_STR) // pool where zone specific info is stored +OPTION(rgw_default_zone_info_oid, OPT_STR) // oid where default zone info is stored +OPTION(rgw_region, OPT_STR) // region name +OPTION(rgw_region_root_pool, OPT_STR) // pool where all region info is stored +OPTION(rgw_default_region_info_oid, OPT_STR) // oid where default region info is stored +OPTION(rgw_zonegroup, OPT_STR) // zone group name +OPTION(rgw_zonegroup_root_pool, OPT_STR) // pool where all zone group info is stored +OPTION(rgw_default_zonegroup_info_oid, OPT_STR) // oid where default zone group info is stored +OPTION(rgw_realm, OPT_STR) // realm name +OPTION(rgw_realm_root_pool, OPT_STR) // pool where all realm info is stored +OPTION(rgw_default_realm_info_oid, OPT_STR) // oid where default realm info is stored +OPTION(rgw_period_root_pool, OPT_STR) // pool where all period info is stored +OPTION(rgw_period_latest_epoch_info_oid, OPT_STR) // oid where current period info is stored +OPTION(rgw_log_nonexistent_bucket, OPT_BOOL) +OPTION(rgw_log_object_name, OPT_STR) // man date to see codes (a subset are supported) +OPTION(rgw_log_object_name_utc, OPT_BOOL) +OPTION(rgw_usage_max_shards, OPT_INT) +OPTION(rgw_usage_max_user_shards, OPT_INT) +OPTION(rgw_enable_ops_log, OPT_BOOL) // enable logging every rgw operation +OPTION(rgw_enable_usage_log, OPT_BOOL) // enable logging bandwidth usage +OPTION(rgw_ops_log_rados, OPT_BOOL) // whether ops log should go to rados +OPTION(rgw_ops_log_socket_path, OPT_STR) // path to unix domain socket where ops log can go +OPTION(rgw_ops_log_data_backlog, OPT_INT) // max data backlog for ops log +OPTION(rgw_fcgi_socket_backlog, OPT_INT) // socket backlog for fcgi +OPTION(rgw_usage_log_flush_threshold, OPT_INT) // threshold to flush pending log data +OPTION(rgw_usage_log_tick_interval, OPT_INT) // flush pending log data every X seconds +OPTION(rgw_intent_log_object_name, OPT_STR) // man date to see codes (a subset are supported) +OPTION(rgw_intent_log_object_name_utc, OPT_BOOL) +OPTION(rgw_init_timeout, OPT_INT) // time in seconds +OPTION(rgw_mime_types_file, OPT_STR) +OPTION(rgw_gc_max_objs, OPT_INT) +OPTION(rgw_gc_obj_min_wait, OPT_INT) // wait time before object may be handled by gc +OPTION(rgw_gc_processor_max_time, OPT_INT) // total run time for a single gc processor 
work +OPTION(rgw_gc_processor_period, OPT_INT) // gc processor cycle time +OPTION(rgw_s3_success_create_obj_status, OPT_INT) // alternative success status response for create-obj (0 - default) +OPTION(rgw_resolve_cname, OPT_BOOL) // should rgw try to resolve hostname as a dns cname record +OPTION(rgw_obj_stripe_size, OPT_INT) +OPTION(rgw_extended_http_attrs, OPT_STR) // list of extended attrs that can be set on objects (beyond the default) +OPTION(rgw_exit_timeout_secs, OPT_INT) // how many seconds to wait for process to go down before exiting unconditionally +OPTION(rgw_get_obj_window_size, OPT_INT) // window size in bytes for single get obj request +OPTION(rgw_get_obj_max_req_size, OPT_INT) // max length of a single get obj rados op +OPTION(rgw_relaxed_s3_bucket_names, OPT_BOOL) // enable relaxed bucket name rules for US region buckets +OPTION(rgw_defer_to_bucket_acls, OPT_STR) // if the user has bucket perms) +OPTION(rgw_list_buckets_max_chunk, OPT_INT) // max buckets to retrieve in a single op when listing user buckets +OPTION(rgw_md_log_max_shards, OPT_INT) // max shards for metadata log +OPTION(rgw_num_zone_opstate_shards, OPT_INT) // max shards for keeping inter-region copy progress info +OPTION(rgw_opstate_ratelimit_sec, OPT_INT) // min time between opstate updates on a single upload (0 for disabling ratelimit) +OPTION(rgw_curl_wait_timeout_ms, OPT_INT) // timeout for certain curl calls +OPTION(rgw_copy_obj_progress, OPT_BOOL) // should dump progress during long copy operations? +OPTION(rgw_copy_obj_progress_every_bytes, OPT_INT) // min bytes between copy progress output +OPTION(rgw_obj_tombstone_cache_size, OPT_INT) // how many objects in tombstone cache, which is used in multi-zone sync to keep + // track of removed objects' mtime + +OPTION(rgw_data_log_window, OPT_INT) // data log entries window (in seconds) +OPTION(rgw_data_log_changes_size, OPT_INT) // number of in-memory entries to hold for data changes log +OPTION(rgw_data_log_num_shards, OPT_INT) // number of objects to keep data changes log on +OPTION(rgw_data_log_obj_prefix, OPT_STR) // +OPTION(rgw_replica_log_obj_prefix, OPT_STR) // + +OPTION(rgw_bucket_quota_ttl, OPT_INT) // time for cached bucket stats to be cached within rgw instance +OPTION(rgw_bucket_quota_soft_threshold, OPT_DOUBLE) // threshold from which we don't rely on cached info for quota decisions +OPTION(rgw_bucket_quota_cache_size, OPT_INT) // number of entries in bucket quota cache +OPTION(rgw_bucket_default_quota_max_objects, OPT_INT) // number of objects allowed +OPTION(rgw_bucket_default_quota_max_size, OPT_LONGLONG) // Max size of object in bytes + +OPTION(rgw_expose_bucket, OPT_BOOL) // Return the bucket name in the 'Bucket' response header + +OPTION(rgw_frontends, OPT_STR) // rgw front ends + +OPTION(rgw_user_quota_bucket_sync_interval, OPT_INT) // time period for accumulating modified buckets before syncing stats +OPTION(rgw_user_quota_sync_interval, OPT_INT) // time period for accumulating modified buckets before syncing entire user stats +OPTION(rgw_user_quota_sync_idle_users, OPT_BOOL) // whether stats for idle users be fully synced +OPTION(rgw_user_quota_sync_wait_time, OPT_INT) // min time between two full stats sync for non-idle users +OPTION(rgw_user_default_quota_max_objects, OPT_INT) // number of objects allowed +OPTION(rgw_user_default_quota_max_size, OPT_LONGLONG) // Max size of object in bytes + +OPTION(rgw_multipart_min_part_size, OPT_INT) // min size for each part (except for last one) in multipart upload 
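rgw_bucket_quota_soft_threshold above is the fraction of a bucket quota beyond which cached stats are no longer trusted for quota decisions. A hypothetical sketch of that decision follows; need_fresh_stats and the 0.95 example value are illustrative, not RGW's actual quota cache logic.

// Illustrative only: once cached usage reaches soft_threshold of the
// configured quota, fetch fresh stats before admitting more writes.
#include <cstdint>
#include <iostream>

static bool need_fresh_stats(uint64_t cached_used_bytes,
                             uint64_t quota_max_bytes,
                             double soft_threshold)
{
  if (quota_max_bytes == 0)
    return false;  // no quota configured, cached info is fine
  double used_fraction =
      static_cast<double>(cached_used_bytes) / quota_max_bytes;
  return used_fraction >= soft_threshold;
}

int main()
{
  std::cout << need_fresh_stats(960, 1000, 0.95) << std::endl;  // 1: re-check
  std::cout << need_fresh_stats(500, 1000, 0.95) << std::endl;  // 0: cache ok
  return 0;
}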
+OPTION(rgw_multipart_part_upload_limit, OPT_INT) // parts limit in multipart upload + +OPTION(rgw_max_slo_entries, OPT_INT) // default number of max entries in slo + +OPTION(rgw_olh_pending_timeout_sec, OPT_INT) // time until we retire a pending olh change +OPTION(rgw_user_max_buckets, OPT_INT) // global option to set max buckets count for all user + +OPTION(rgw_objexp_gc_interval, OPT_U32) // maximum time between round of expired objects garbage collecting +OPTION(rgw_objexp_time_step, OPT_U32) // number of seconds for rounding the timestamps +OPTION(rgw_objexp_hints_num_shards, OPT_U32) // maximum number of parts in which the hint index is stored in +OPTION(rgw_objexp_chunk_size, OPT_U32) // maximum number of entries in a single operation when processing objexp data + +OPTION(rgw_enable_static_website, OPT_BOOL) // enable static website feature +OPTION(rgw_log_http_headers, OPT_STR) // list of HTTP headers to log when seen, ignores case (e.g., http_x_forwarded_for + +OPTION(rgw_num_async_rados_threads, OPT_INT) // num of threads to use for async rados operations +OPTION(rgw_md_notify_interval_msec, OPT_INT) // metadata changes notification interval to followers +OPTION(rgw_run_sync_thread, OPT_BOOL) // whether radosgw (not radosgw-admin) spawns the sync thread +OPTION(rgw_sync_lease_period, OPT_INT) // time in second for lease that rgw takes on a specific log (or log shard) +OPTION(rgw_sync_log_trim_interval, OPT_INT) // time in seconds between attempts to trim sync logs + +OPTION(rgw_sync_data_inject_err_probability, OPT_DOUBLE) // range [0, 1] +OPTION(rgw_sync_meta_inject_err_probability, OPT_DOUBLE) // range [0, 1] + + +OPTION(rgw_period_push_interval, OPT_DOUBLE) // seconds to wait before retrying "period push" +OPTION(rgw_period_push_interval_max, OPT_DOUBLE) // maximum interval after exponential backoff + +OPTION(rgw_safe_max_objects_per_shard, OPT_INT) // safe max loading +OPTION(rgw_shard_warning_threshold, OPT_DOUBLE) // pct of safe max + // at which to warn + +OPTION(rgw_swift_versioning_enabled, OPT_BOOL) // whether swift object versioning feature is enabled + +OPTION(mgr_module_path, OPT_STR) // where to load python modules from +OPTION(mgr_initial_modules, OPT_STR) // Which modules to load +OPTION(mgr_data, OPT_STR) // where to find keyring etc +OPTION(mgr_tick_period, OPT_INT) // How frequently to tick +OPTION(mgr_stats_period, OPT_INT) // How frequently clients send stats +OPTION(mgr_client_bytes, OPT_U64) // bytes from clients +OPTION(mgr_client_messages, OPT_U64) // messages from clients +OPTION(mgr_osd_bytes, OPT_U64) // bytes from osds +OPTION(mgr_osd_messages, OPT_U64) // messages from osds +OPTION(mgr_mds_bytes, OPT_U64) // bytes from mdss +OPTION(mgr_mds_messages, OPT_U64) // messages from mdss +OPTION(mgr_mon_bytes, OPT_U64) // bytes from mons +OPTION(mgr_mon_messages, OPT_U64) // messages from mons + +OPTION(mgr_connect_retry_interval, OPT_DOUBLE) +OPTION(mgr_service_beacon_grace, OPT_DOUBLE) + +OPTION(mon_mgr_digest_period, OPT_INT) // How frequently to send digests +OPTION(mon_mgr_beacon_grace, OPT_INT) // How long to wait to failover +OPTION(mon_mgr_inactive_grace, OPT_INT) // How long before health WARN -> ERR +OPTION(mon_mgr_mkfs_grace, OPT_INT) // How long before we complain about MGR_DOWN +OPTION(rgw_crypt_require_ssl, OPT_BOOL) // requests including encryption key headers must be sent over ssl +OPTION(rgw_crypt_default_encryption_key, OPT_STR) // base64 encoded key for encryption of rgw objects +OPTION(rgw_crypt_s3_kms_encryption_keys, OPT_STR) // extra 
keys that may be used for aws:kms
+                                           // defined as map "key1=YmluCmJvb3N0CmJvb3N0LQ== key2=b3V0CnNyYwpUZXN0aW5nCg=="
+OPTION(rgw_crypt_suppress_logs, OPT_BOOL) // suppress logs that might print customer key
+OPTION(rgw_list_bucket_min_readahead, OPT_INT) // minimum number of entries to read from rados for bucket listing
+
+OPTION(rgw_rest_getusage_op_compat, OPT_BOOL) // dump description of total stats for s3 GetUsage API
+
+OPTION(mutex_perf_counter, OPT_BOOL) // enable/disable mutex perf counter
+OPTION(throttler_perf_counter, OPT_BOOL) // enable/disable throttler perf counter
+
+/* The following are tunables for torrent data */
+OPTION(rgw_torrent_flag, OPT_BOOL) // produce torrent function flag
+OPTION(rgw_torrent_tracker, OPT_STR) // torrent fields announce and announce list
+OPTION(rgw_torrent_createby, OPT_STR) // torrent field created by
+OPTION(rgw_torrent_comment, OPT_STR) // torrent field comment
+OPTION(rgw_torrent_encoding, OPT_STR) // torrent field encoding
+OPTION(rgw_torrent_origin, OPT_STR) // torrent origin
+OPTION(rgw_torrent_sha_unit, OPT_INT) // torrent field piece length 512K
+
+OPTION(event_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
+
+// This will be set to true when it is safe to start threads.
+// Once it is true, it will never change.
+OPTION(internal_safe_to_start_threads, OPT_BOOL)
+
+OPTION(debug_deliberately_leak_memory, OPT_BOOL)
+
+OPTION(rgw_swift_custom_header, OPT_STR) // option to enable swift custom headers
+
+OPTION(rgw_swift_need_stats, OPT_BOOL) // option to enable stats on bucket listing for swift
+
+/* resharding tunables */
+OPTION(rgw_reshard_num_logs, OPT_INT)
+OPTION(rgw_reshard_bucket_lock_duration, OPT_INT) // duration of lock on bucket obj during resharding
+OPTION(rgw_dynamic_resharding, OPT_BOOL)
+OPTION(rgw_max_objs_per_shard, OPT_INT)
+OPTION(rgw_reshard_thread_interval, OPT_U32) // maximum time between rounds of reshard thread processing
+
+OPTION(rgw_acl_grants_max_num, OPT_INT) // According to AWS S3 (http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html), an ACL can have up to 100 grants.
diff -Nru ceph-12.1.1/src/common/options.cc ceph-12.1.2/src/common/options.cc
--- ceph-12.1.1/src/common/options.cc	1970-01-01 00:00:00.000000000 +0000
+++ ceph-12.1.2/src/common/options.cc	2017-08-01 17:55:40.000000000 +0000
@@ -0,0 +1,5673 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "acconfig.h"
+#include "options.h"
+#include "common/Formatter.h"
+
+// Helpers for validators
+#include "include/stringify.h"
+#include
+#include
+#include
+
+
+void Option::dump_value(const char *field_name,
+                        const Option::value_t &v, Formatter *f) const
+{
+  if (boost::get<boost::blank>(&v)) {
+    // This should be nil but Formatter doesn't allow it.
+    f->dump_string(field_name, "");
+  } else if (type == TYPE_UINT) {
+    f->dump_unsigned(field_name, boost::get<uint64_t>(v));
+  } else if (type == TYPE_INT) {
+    f->dump_int(field_name, boost::get<int64_t>(v));
+  } else if (type == TYPE_STR) {
+    f->dump_string(field_name, boost::get<std::string>(v));
+  } else if (type == TYPE_FLOAT) {
+    f->dump_float(field_name, boost::get<double>(v));
+  } else if (type == TYPE_BOOL) {
+    f->dump_bool(field_name, boost::get<bool>(v));
+  } else {
+    f->dump_stream(field_name) << v;
+  }
+}
+
+int Option::pre_validate(std::string *new_value, std::string *err) const
+{
+  if (validator) {
+    return validator(new_value, err);
+  } else {
+    return 0;
+  }
+}
+
+int Option::validate(const Option::value_t &new_value, std::string *err) const
+{
+  // Generic validation: min
+  if (!boost::get<boost::blank>(&(min))) {
+    if (new_value < min) {
+      std::ostringstream oss;
+      oss << "Value '" << new_value << "' is below minimum " << min;
+      *err = oss.str();
+      return -EINVAL;
+    }
+  }
+
+  // Generic validation: max
+  if (!boost::get<boost::blank>(&(max))) {
+    if (new_value > max) {
+      std::ostringstream oss;
+      oss << "Value '" << new_value << "' exceeds maximum " << max;
+      *err = oss.str();
+      return -EINVAL;
+    }
+  }
+
+  // Generic validation: enum
+  if (!enum_allowed.empty() && type == Option::TYPE_STR) {
+    auto found = std::find(enum_allowed.begin(), enum_allowed.end(),
+                           boost::get<std::string>(new_value));
+    if (found == enum_allowed.end()) {
+      std::ostringstream oss;
+      oss << "'" << new_value << "' is not one of the permitted "
+             "values: " << joinify(enum_allowed.begin(),
+                                   enum_allowed.end(),
+                                   std::string(", "));
+      *err = oss.str();
+      return -EINVAL;
+    }
+  }
+
+  return 0;
+}
+
+void Option::dump(Formatter *f) const
+{
+  f->open_object_section("option");
+  f->dump_string("name", name);
+
+  f->dump_string("type", type_to_str(type));
+  std::string level_str;
+
+  f->dump_string("level", level_to_str(level));
+
+  f->dump_string("desc", desc);
+  f->dump_string("long_desc", long_desc);
+
+  dump_value("default", value, f);
+  dump_value("daemon_default", daemon_value, f);
+
+  f->open_array_section("tags");
+  for (const auto t : tags) {
+    f->dump_string("tag", t);
+  }
+  f->close_section();
+
+  f->open_array_section("services");
+  for (const auto s : services) {
+    f->dump_string("service", s);
+  }
+  f->close_section();
+
+  f->open_array_section("see_also");
+  for (const auto sa : see_also) {
+    f->dump_string("see_also", sa);
+  }
+  f->close_section();
+
+  if (type == TYPE_STR) {
+    f->open_array_section("enum_values");
+    for (const auto &ea : enum_allowed) {
+      f->dump_string("enum_value", ea);
+    }
+    f->close_section();
+  }
+
+  dump_value("min", min, f);
+  dump_value("max", max, f);
+
+  f->close_section();
+}
+
+
+std::vector