diff -Nru ceph-12.2.11/admin/doc-requirements.txt ceph-12.2.12/admin/doc-requirements.txt --- ceph-12.2.11/admin/doc-requirements.txt 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/admin/doc-requirements.txt 2019-04-11 12:33:50.000000000 +0000 @@ -1,3 +1,3 @@ Sphinx == 1.6.3 -e git+https://github.com/ceph/sphinx-ditaa.git@py3#egg=sphinx-ditaa --e git+https://github.com/michaeljones/breathe#egg=breathe +breathe == 4.11.1 diff -Nru ceph-12.2.11/alpine/APKBUILD ceph-12.2.12/alpine/APKBUILD --- ceph-12.2.11/alpine/APKBUILD 2019-01-30 15:55:46.000000000 +0000 +++ ceph-12.2.12/alpine/APKBUILD 2019-04-11 12:36:34.000000000 +0000 @@ -1,7 +1,7 @@ # Contributor: John Coyle # Maintainer: John Coyle pkgname=ceph -pkgver=12.2.11 +pkgver=12.2.12 pkgrel=0 pkgdesc="Ceph is a distributed object store and file system" pkgusers="ceph" @@ -63,7 +63,7 @@ xmlstarlet yasm " -source="ceph-12.2.11.tar.bz2" +source="ceph-12.2.12.tar.bz2" subpackages=" $pkgname-base $pkgname-common @@ -116,7 +116,7 @@ _udevrulesdir=/etc/udev/rules.d _python_sitelib=/usr/lib/python2.7/site-packages -builddir=$srcdir/ceph-12.2.11 +builddir=$srcdir/ceph-12.2.12 build() { export CEPH_BUILD_VIRTUALENV=$builddir diff -Nru ceph-12.2.11/ceph.spec ceph-12.2.12/ceph.spec --- ceph-12.2.11/ceph.spec 2019-01-30 15:55:46.000000000 +0000 +++ ceph-12.2.12/ceph.spec 2019-04-11 12:36:34.000000000 +0000 @@ -61,7 +61,7 @@ # main package definition ################################################################################# Name: ceph -Version: 12.2.11 +Version: 12.2.12 Release: 0%{?dist} %if 0%{?fedora} || 0%{?rhel} Epoch: 2 @@ -77,7 +77,7 @@ Group: System/Filesystems %endif URL: http://ceph.com/ -Source0: http://ceph.com/download/ceph-12.2.11.tar.bz2 +Source0: http://ceph.com/download/ceph-12.2.12.tar.bz2 %if 0%{?suse_version} %if 0%{?is_opensuse} ExclusiveArch: x86_64 aarch64 ppc64 ppc64le @@ -382,7 +382,7 @@ %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = 
%{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} Requires: librados2 = %{_epoch_prefix}%{version}-%{release} %description -n rbd-mirror Daemon for mirroring RBD images between Ceph clusters, streaming @@ -403,7 +403,7 @@ %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} %if 0%{with selinux} Requires: ceph-selinux = %{_epoch_prefix}%{version}-%{release} %endif @@ -788,7 +788,7 @@ # common ################################################################################# %prep -%autosetup -p1 -n ceph-12.2.11 +%autosetup -p1 -n ceph-12.2.12 %build %if 0%{with cephfs_java} diff -Nru ceph-12.2.11/ceph.spec.in ceph-12.2.12/ceph.spec.in --- ceph-12.2.11/ceph.spec.in 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/ceph.spec.in 2019-04-11 12:33:50.000000000 +0000 @@ -382,7 +382,7 @@ %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} Requires: librados2 = %{_epoch_prefix}%{version}-%{release} %description -n rbd-mirror Daemon for mirroring RBD images between Ceph clusters, streaming @@ -403,7 +403,7 @@ %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} %if 0%{with selinux} Requires: ceph-selinux = %{_epoch_prefix}%{version}-%{release} %endif diff -Nru ceph-12.2.11/CMakeLists.txt ceph-12.2.12/CMakeLists.txt --- ceph-12.2.11/CMakeLists.txt 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/CMakeLists.txt 2019-04-11 12:33:50.000000000 +0000 @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 2.8.11) project(ceph) -set(VERSION 12.2.11) +set(VERSION 12.2.12) if(POLICY CMP0046) # Tweak policies (this one disables "missing" dependency warning) diff -Nru 
ceph-12.2.11/debian/changelog ceph-12.2.12/debian/changelog --- ceph-12.2.11/debian/changelog 2019-04-26 12:17:04.000000000 +0000 +++ ceph-12.2.12/debian/changelog 2019-06-12 10:05:49.000000000 +0000 @@ -1,3 +1,11 @@ +ceph (12.2.12-0ubuntu0.18.04.1) bionic; urgency=medium + + * d/copyright: Exclude cruft from upstream tarballs. + * New upstream point release (LP: #1829716). + * d/p/s390x-link.patch: Drop, included upstream. + + -- James Page Wed, 12 Jun 2019 11:05:49 +0100 + ceph (12.2.11-0ubuntu0.18.04.2) bionic; urgency=medium * d/control: Use openssl1.0 at build and runtime as diff -Nru ceph-12.2.11/debian/copyright ceph-12.2.12/debian/copyright --- ceph-12.2.11/debian/copyright 2019-02-11 11:06:34.000000000 +0000 +++ ceph-12.2.12/debian/copyright 2019-06-12 10:04:50.000000000 +0000 @@ -2,7 +2,14 @@ Upstream-Name: ceph Upstream-Contact: Sage Weil Source: http://ceph.com/ -Files-Excluded: debian +Files-Excluded: + debian + src/civetweb/examples/websocket_client/ssl/server.key.orig + src/civetweb/resources/cert/client.key.orig + src/civetweb/resources/cert/server.key.orig + src/erasure-code/jerasure/jerasure/Examples/makefile.orig + src/erasure-code/jerasure/jerasure/include/config.h.in~ + src/erasure-code/jerasure/jerasure/makefile.orig Files: * Copyright: 2004-2014 Sage Weil diff -Nru ceph-12.2.11/debian/patches/s390x-link.patch ceph-12.2.12/debian/patches/s390x-link.patch --- ceph-12.2.11/debian/patches/s390x-link.patch 2019-02-13 17:11:15.000000000 +0000 +++ ceph-12.2.12/debian/patches/s390x-link.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ -Descrption: Fix linking issues on s390x -Origin: https://github.com/ceph/ceph/pull/21380 - ---- a/src/rgw/CMakeLists.txt -+++ b/src/rgw/CMakeLists.txt -@@ -177,9 +177,7 @@ endif (WITH_RADOSGW_BEAST_FRONTEND) - - add_library(radosgw_a STATIC ${radosgw_srcs} - $) --if (WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL) -- target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES}) --endif() 
-+target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES}) - - add_executable(radosgw rgw_main.cc) - target_link_libraries(radosgw radosgw_a librados diff -Nru ceph-12.2.11/debian/patches/series ceph-12.2.12/debian/patches/series --- ceph-12.2.11/debian/patches/series 2019-02-13 17:11:15.000000000 +0000 +++ ceph-12.2.12/debian/patches/series 2019-06-12 10:04:50.000000000 +0000 @@ -7,4 +7,3 @@ # Ubuntu: FTBFS on armhf armhf-ftbfs.patch misc-32-bit-fixes.patch -s390x-link.patch diff -Nru ceph-12.2.11/doc/api/libcephfs-java.rst ceph-12.2.12/doc/api/libcephfs-java.rst --- ceph-12.2.11/doc/api/libcephfs-java.rst 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/doc/api/libcephfs-java.rst 2019-04-11 12:33:50.000000000 +0000 @@ -2,8 +2,17 @@ Libcephfs (JavaDoc) =================== +.. warning:: + + CephFS Java bindings are no longer tested by CI. They may not work properly + or corrupt data. + + Developers interested in reviving these bindings by fixing and writing tests + are encouraged to contribute! + .. The admin/build-docs script runs Ant to build the JavaDoc files, and copies them to api/libcephfs-java/javadoc/. + View the auto-generated `JavaDoc pages for the CephFS Java bindings `_. diff -Nru ceph-12.2.11/doc/ceph-volume/simple/scan.rst ceph-12.2.12/doc/ceph-volume/simple/scan.rst --- ceph-12.2.11/doc/ceph-volume/simple/scan.rst 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/doc/ceph-volume/simple/scan.rst 2019-04-11 12:33:50.000000000 +0000 @@ -9,6 +9,7 @@ The command has the ability to inspect a running OSD, by inspecting the directory where the OSD data is stored, or by consuming the data partition. +The command can also scan all running OSDs if no path or device is provided. Once scanned, information will (by default) persist the metadata as JSON in a file in ``/etc/ceph/osd``. This ``JSON`` file will use the naming convention @@ -31,6 +32,16 @@ .. 
_ceph-volume-simple-scan-directory: +Running OSDs scan +----------------- +Using this command without providing an OSD directory or device will scan the +directories of any currently running OSDs. If a running OSD was not created +by ceph-disk it will be ignored and not scanned. + +To scan all running ceph-disk OSDs, the command would look like:: + + ceph-volume simple scan + Directory scan -------------- The directory scan will capture OSD file contents from interesting files. There diff -Nru ceph-12.2.11/doc/man/8/ceph.rst ceph-12.2.12/doc/man/8/ceph.rst --- ceph-12.2.11/doc/man/8/ceph.rst 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/doc/man/8/ceph.rst 2019-04-11 12:33:50.000000000 +0000 @@ -1465,6 +1465,16 @@ reply to outfile. Only specific monitor commands (e.g. osd getmap) return a payload. +.. option:: --setuser user + + will apply the appropriate user ownership to the file specified by + the option '-o'. + +.. option:: --setgroup group + + will apply the appropriate group ownership to the file specified by + the option '-o'. + .. option:: -c ceph.conf, --conf=ceph.conf Use ceph.conf configuration file instead of the default diff -Nru ceph-12.2.11/doc/man/8/ceph-volume.rst ceph-12.2.12/doc/man/8/ceph-volume.rst --- ceph-12.2.11/doc/man/8/ceph-volume.rst 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/doc/man/8/ceph-volume.rst 2019-04-11 12:33:50.000000000 +0000 @@ -280,6 +280,10 @@ Optionally, the JSON blob can be sent to stdout for further inspection. 
+Usage on all running OSDs:: + + ceph-volume simple scan + Usage on data devices:: ceph-volume simple scan @@ -295,7 +299,7 @@ * [--stdout] Send the JSON blob to stdout * [--force] If the JSON file exists at destination, overwrite it -Required Positional arguments: +Optional Positional arguments: * Actual data partition or a path to the running OSD diff -Nru ceph-12.2.11/doc/man/8/rbdmap.rst ceph-12.2.12/doc/man/8/rbdmap.rst --- ceph-12.2.11/doc/man/8/rbdmap.rst 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/doc/man/8/rbdmap.rst 2019-04-11 12:33:50.000000000 +0000 @@ -46,6 +46,8 @@ rbd map POOLNAME/IMAGENAME --PARAM1 VAL1 --PARAM2 VAL2 (See the ``rbd`` manpage for a full list of possible options.) +For parameters and values which contain commas or equality signs, a simple +apostrophe can be used to prevent replacing them. When run as ``rbdmap map``, the script parses the configuration file, and for each RBD image specified attempts to first map the image (using the ``rbd map`` @@ -77,11 +79,12 @@ Examples ======== -Example ``/etc/ceph/rbdmap`` for two RBD images called "bar1" and "bar2", both -in pool "foopool":: +Example ``/etc/ceph/rbdmap`` for three RBD images called "bar1", "bar2" and "bar3", +which are in pool "foopool":: foopool/bar1 id=admin,keyring=/etc/ceph/ceph.client.admin.keyring foopool/bar2 id=admin,keyring=/etc/ceph/ceph.client.admin.keyring + foopool/bar3 id=admin,keyring=/etc/ceph/ceph.client.admin.keyring,options='lock_on_read,queue_depth=1024' Each line in the file contains two strings: the image spec and the options to be passed to ``rbd map``. 
These two lines get transformed into the following @@ -89,12 +92,14 @@ rbd map foopool/bar1 --id admin --keyring /etc/ceph/ceph.client.admin.keyring rbd map foopool/bar2 --id admin --keyring /etc/ceph/ceph.client.admin.keyring + rbd map foopool/bar3 --id admin --keyring /etc/ceph/ceph.client.admin.keyring --options lock_on_read,queue_depth=1024 If the images had XFS filesystems on them, the corresponding ``/etc/fstab`` entries might look like this:: /dev/rbd/foopool/bar1 /mnt/bar1 xfs noauto 0 0 /dev/rbd/foopool/bar2 /mnt/bar2 xfs noauto 0 0 + /dev/rbd/foopool/bar3 /mnt/bar3 xfs noauto 0 0 After creating the images and populating the ``/etc/ceph/rbdmap`` file, making the images get automatically mapped and mounted at boot is just a matter of diff -Nru ceph-12.2.11/doc/rados/configuration/mon-config-ref.rst ceph-12.2.12/doc/rados/configuration/mon-config-ref.rst --- ceph-12.2.11/doc/rados/configuration/mon-config-ref.rst 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/doc/rados/configuration/mon-config-ref.rst 2019-04-11 12:33:50.000000000 +0000 @@ -1193,7 +1193,7 @@ will be splitted on all OSDs serving that pool. We want to avoid extreme multipliers on PG splits. :Type: Integer -:Default: 300 +:Default: 32 ``mon session timeout`` diff -Nru ceph-12.2.11/doc/rados/configuration/osd-config-ref.rst ceph-12.2.12/doc/rados/configuration/osd-config-ref.rst --- ceph-12.2.11/doc/rados/configuration/osd-config-ref.rst 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/doc/rados/configuration/osd-config-ref.rst 2019-04-11 12:33:50.000000000 +0000 @@ -850,30 +850,14 @@ :Description: The number of OSD maps to keep cached. :Type: 32-bit Integer -:Default: ``500`` - - -``osd map cache bl size`` - -:Description: The size of the in-memory OSD map cache in OSD daemons. -:Type: 32-bit Integer :Default: ``50`` -``osd map cache bl inc size`` - -:Description: The size of the in-memory OSD map cache incrementals in - OSD daemons. 
- -:Type: 32-bit Integer -:Default: ``100`` - - ``osd map message max`` :Description: The maximum map entries allowed per MOSDMap message. :Type: 32-bit Integer -:Default: ``100`` +:Default: ``40`` diff -Nru ceph-12.2.11/doc/rados/operations/health-checks.rst ceph-12.2.12/doc/rados/operations/health-checks.rst --- ceph-12.2.11/doc/rados/operations/health-checks.rst 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/doc/rados/operations/health-checks.rst 2019-04-11 12:33:50.000000000 +0000 @@ -514,7 +514,7 @@ ____________________ One or more PGs has not been deep scrubbed recently. PGs are normally -scrubbed every ``osd_deep_mon_scrub_interval`` seconds, and this warning +scrubbed every ``osd_deep_scrub_interval`` seconds, and this warning triggers when ``mon_warn_not_deep_scrubbed`` such intervals have elapsed without a scrub. diff -Nru ceph-12.2.11/doc/rados/troubleshooting/troubleshooting-mon.rst ceph-12.2.12/doc/rados/troubleshooting/troubleshooting-mon.rst --- ceph-12.2.11/doc/rados/troubleshooting/troubleshooting-mon.rst 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/doc/rados/troubleshooting/troubleshooting-mon.rst 2019-04-11 12:33:50.000000000 +0000 @@ -460,12 +460,12 @@ using ``ceph-monstore-tool``. But the MDS keyrings and other keyrings are missing in the recovered monitor store. You might need to re-add them manually. -- **pg settings**: the ``full ratio`` and ``nearfull ratio`` settings configured using - ``ceph pg set_full_ratio`` and ``ceph pg set_nearfull_ratio`` will be lost. +- **creating pools**: If any RADOS pools were in the process of being creating, that state is lost. The recovery tool assumes that all pools have been created. If there are PGs that are stuck in the 'unknown' after the recovery for a partially created pool, you can force creation of the *empty* PG with the ``ceph osd force-create-pg`` command. Note that this will create an *empty* PG, so only do this if you know the pool is empty. - **MDS Maps**: the MDS maps are lost. 
+ Everything Failed! Now What? ============================= diff -Nru ceph-12.2.11/doc/radosgw/index.rst ceph-12.2.12/doc/radosgw/index.rst --- ceph-12.2.11/doc/radosgw/index.rst 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/doc/radosgw/index.rst 2019-04-11 12:33:50.000000000 +0000 @@ -40,6 +40,7 @@ Manual Install w/Civetweb <../../install/install-ceph-gateway> HTTP Frontends + Pool Placement Multisite Configuration Configuring Pools Config Reference diff -Nru ceph-12.2.11/doc/radosgw/placement.rst ceph-12.2.12/doc/radosgw/placement.rst --- ceph-12.2.11/doc/radosgw/placement.rst 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/doc/radosgw/placement.rst 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,180 @@ +============== +Pool Placement +============== + +.. contents:: + +Placement Targets +================= + +.. versionadded:: Jewel + +Placement targets control which `Pools`_ are associated with a particular +bucket. A bucket's placement target is selected on creation, and cannot be +modified. The ``radosgw-admin bucket stats`` command will display its +``placement_rule``. + +The zonegroup configuration contains a list of placement targets with an +initial target named ``default-placement``. The zone configuration then maps +each zonegroup placement target name onto its local storage. This zone +placement information includes the ``index_pool`` name for the bucket index, +the ``data_extra_pool`` name for metadata about incomplete multipart uploads, +and a ``data_pool`` name for object data. + +Zonegroup/Zone Configuration +============================ + +Placement configuration is performed with ``radosgw-admin`` commands on +the zonegroups and zones. + +The zonegroup placement configuration can be queried with: + +:: + + $ radosgw-admin zonegroup get + { + "id": "ab01123f-e0df-4f29-9d71-b44888d67cd5", + "name": "default", + "api_name": "default", + ... 
+ "placement_targets": [ + { + "name": "default-placement", + "tags": [], + } + ], + "default_placement": "default-placement", + ... + } + +The zone placement configuration can be queried with: + +:: + + $ radosgw-admin zone get + { + "id": "557cdcee-3aae-4e9e-85c7-2f86f5eddb1f", + "name": "default", + "domain_root": "default.rgw.meta:root", + ... + "placement_pools": [ + { + "key": "default-placement", + "val": { + "index_pool": "default.rgw.buckets.index", + "data_pool": "default.rgw.buckets.data", + "data_extra_pool": "default.rgw.buckets.non-ec", + "index_type": 0 + } + } + ], + ... + } + +.. note:: If you have not done any previous `Multisite Configuration`_, + a ``default`` zone and zonegroup are created for you, and changes + to the zone/zonegroup will not take effect until the Ceph Object + Gateways are restarted. If you have created a realm for multisite, + the zone/zonegroup changes will take effect once the changes are + committed with ``radosgw-admin period update --commit``. + +Adding a Placement Target +------------------------- + +To create a new placement target named ``temporary``, start by adding it to +the zonegroup: + +:: + + $ radosgw-admin zonegroup placement add \ + --rgw-zonegroup default \ + --placement-id temporary + +Then provide the zone placement info for that target: + +:: + + $ radosgw-admin zone placement add \ + --rgw-zone default \ + --placement-id temporary \ + --data-pool default.rgw.temporary.data \ + --index-pool default.rgw.temporary.index \ + --data-extra-pool default.rgw.temporary.non-ec \ + --compression lz4 + +Customizing Placement +===================== + +Default Placement +----------------- + +By default, new buckets will use the zonegroup's ``default_placement`` target. 
+This zonegroup setting can be changed with: + +:: + + $ radosgw-admin zonegroup placement default \ + --rgw-zonegroup default \ + --placement-id new-placement + +User Placement +-------------- + +A Ceph Object Gateway user can override the zonegroup's default placement +target by setting a non-empty ``default_placement`` field in the user info. + +:: + + $ radosgw-admin user info --uid testid + { + ... + "default_placement": "", + "placement_tags": [], + ... + } + +If a zonegroup's placement target contains any ``tags``, users will be unable +to create buckets with that placement target unless their user info contains +at least one matching tag in its ``placement_tags`` field. This can be useful +to restrict access to certain types of storage. + +The ``radosgw-admin`` command cannot modify these fields directly, so the json +format must be edited manually: + +:: + + $ radosgw-admin metadata get user: > user.json + $ vi user.json + $ radosgw-admin metadata put user: < user.json + +S3 Bucket Placement +------------------- + +When creating a bucket with the S3 protocol, a placement target can be +provided as part of the LocationConstraint to override the default placement +targets from the user and zonegroup. + +Normally, the LocationConstraint must match the zonegroup's ``api_name``: + +:: + + default + +A custom placement target can be added to the ``api_name`` following a colon: + +:: + + default:new-placement + +Swift Bucket Placement +---------------------- + +When creating a bucket with the Swift protocol, a placement target can be +provided in the HTTP header ``X-Storage-Policy``: + +:: + + X-Storage-Policy: new-placement + +.. _`Pools`: ../pools +.. 
_`Multisite Configuration`: ../multisite diff -Nru ceph-12.2.11/doc/radosgw/s3/authentication.rst ceph-12.2.12/doc/radosgw/s3/authentication.rst --- ceph-12.2.11/doc/radosgw/s3/authentication.rst 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/doc/radosgw/s3/authentication.rst 2019-04-11 12:33:50.000000000 +0000 @@ -71,5 +71,126 @@ | ``FULL_CONTROL`` | Grantee has full permissions for object in the bucket. | Grantee can read or write to the object ACL. | +------------------+--------------------------------------------------------+----------------------------------------------+ +Internally, S3 operations are mapped to ACL permissions thus: + ++---------------------------------------+---------------+ +| Operation | Permission | ++=======================================+===============+ +| ``s3:GetObject`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectTorrent`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectVersion`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectVersionTorrent`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectTagging`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectVersionTagging`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:ListAllMyBuckets`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:ListBucket`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:ListBucketMultipartUploads`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:ListBucketVersions`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:ListMultipartUploadParts`` | ``READ`` | ++---------------------------------------+---------------+ +| ``s3:AbortMultipartUpload`` | ``WRITE`` | 
++---------------------------------------+---------------+ +| ``s3:CreateBucket`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:DeleteBucket`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:DeleteObject`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:s3DeleteObjectVersion`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:PutObject`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:PutObjectTagging`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:PutObjectVersionTagging`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:DeleteObjectTagging`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:DeleteObjectVersionTagging`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:RestoreObject`` | ``WRITE`` | ++---------------------------------------+---------------+ +| ``s3:GetAccelerateConfiguration`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketAcl`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketCORS`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketLocation`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketLogging`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketNotification`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketPolicy`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketRequestPayment`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketTagging`` | ``READ_ACP`` | 
++---------------------------------------+---------------+ +| ``s3:GetBucketVersioning`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetBucketWebsite`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetLifecycleConfiguration`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectAcl`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetObjectVersionAcl`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:GetReplicationConfiguration`` | ``READ_ACP`` | ++---------------------------------------+---------------+ +| ``s3:DeleteBucketPolicy`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:DeleteBucketWebsite`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:DeleteReplicationConfiguration`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutAccelerateConfiguration`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketAcl`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketCORS`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketLogging`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketNotification`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketPolicy`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketRequestPayment`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutBucketTagging`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutPutBucketVersioning`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| 
``s3:PutBucketWebsite`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutLifecycleConfiguration`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutObjectAcl`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutObjectVersionAcl`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ +| ``s3:PutReplicationConfiguration`` | ``WRITE_ACP`` | ++---------------------------------------+---------------+ + +Some mappings, (e.g. ``s3:CreateBucket`` to ``WRITE``) are not +applicable to S3 operation, but are required to allow Swift and S3 to +access the same resources when things like Swift user ACLs are in +play. This is one of the many reasons that you should use S3 bucket +policies rather than S3 ACLs when possible. + + .. _RFC 2104: http://www.ietf.org/rfc/rfc2104.txt .. _HMAC: http://en.wikipedia.org/wiki/HMAC diff -Nru ceph-12.2.11/PendingReleaseNotes ceph-12.2.12/PendingReleaseNotes --- ceph-12.2.11/PendingReleaseNotes 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/PendingReleaseNotes 2019-04-11 12:33:50.000000000 +0000 @@ -1,5 +1,12 @@ ->= 12.2.11 ----------- +12.2.12 +------- +* In 12.2.9 and earlier releases, keyring caps were not checked for validity, + so the caps string could be anything. As of 12.2.10, caps strings are + validated and providing a keyring with an invalid caps string to, e.g., + "ceph auth add" will result in an error. + +12.2.11 +------- * `cephfs-journal-tool` makes rank argument (--rank) mandatory. Rank is of format `filesystem:rank`, where `filesystem` is the cephfs filesystem and `rank` is the MDS rank on which the operation is to be executed. To @@ -8,6 +15,26 @@ suffixed dump files. Importing journal information from dump files is disallowed if operation is targetted for all ranks. +* The MDS cache trimming is now throttled. Dropping the MDS cache + via the `ceph tell mds. 
cache drop` command or large reductions in the + cache size will no longer cause service unavailability. + +* The CephFS MDS behavior with recalling caps has been significantly improved + to not attempt recalling too many caps at once, leading to instability. + MDS with a large cache (64GB+) should be more stable. + +* MDS now provides a config option "mds_max_caps_per_client" (default: 1M) to + limit the number of caps a client session may hold. Long running client + sessions with a large number of caps have been a source of instability in the + MDS when all of these caps need to be processed during certain session + events. It is recommended to not unnecessarily increase this value. + +* The MDS config mds_recall_state_timeout has been removed. Late client recall + warnings are now generated based on the number of caps the MDS has recalled + which have not been released. The new configs mds_recall_warning_threshold + (default: 32K) and mds_recall_warning_decay_rate (default: 60s) sets the + threshold for this warning. 
+ >= 12.1.2 --------- * When running 'df' on a CephFS filesystem comprising exactly one data pool, diff -Nru ceph-12.2.11/qa/cephfs/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/cephfs/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/cephfs/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/cephfs/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/cephfs/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/cephfs/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/cephfs/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/cephfs/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - 
fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/cephfs/tasks/cfuse_workunit_suites_fsstress.yaml ceph-12.2.12/qa/cephfs/tasks/cfuse_workunit_suites_fsstress.yaml --- ceph-12.2.11/qa/cephfs/tasks/cfuse_workunit_suites_fsstress.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/cephfs/tasks/cfuse_workunit_suites_fsstress.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -1,5 +1,6 @@ tasks: - workunit: + timeout: 6h clients: all: - suites/fsstress.sh diff -Nru ceph-12.2.11/qa/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 
+ bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/objectstore/bluestore.yaml ceph-12.2.12/qa/objectstore/bluestore.yaml --- ceph-12.2.11/qa/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 
100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/objectstore_cephfs/bluestore-bitmap.yaml ceph-12.2.12/qa/objectstore_cephfs/bluestore-bitmap.yaml --- ceph-12.2.11/qa/objectstore_cephfs/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/objectstore_cephfs/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 
+ mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/objectstore_cephfs/bluestore.yaml ceph-12.2.12/qa/objectstore_cephfs/bluestore.yaml --- ceph-12.2.11/qa/objectstore_cephfs/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/objectstore_cephfs/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/overrides/short_pg_log.yaml ceph-12.2.12/qa/overrides/short_pg_log.yaml --- ceph-12.2.11/qa/overrides/short_pg_log.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/overrides/short_pg_log.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -2,5 +2,5 @@ ceph: conf: global: - osd_min_pg_log_entries: 300 - osd_max_pg_log_entries: 600 + osd_min_pg_log_entries: 1 + osd_max_pg_log_entries: 2 diff -Nru ceph-12.2.11/qa/packages/packages.yaml ceph-12.2.12/qa/packages/packages.yaml --- 
ceph-12.2.11/qa/packages/packages.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/packages/packages.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -11,8 +11,6 @@ - python-ceph - libcephfs2 - libcephfs-dev - - libcephfs-java - - libcephfs-jni - librados2 - librbd1 - rbd-fuse @@ -40,8 +38,6 @@ - ceph - ceph-mgr - ceph-fuse - - cephfs-java - - libcephfs_jni1 - libcephfs2 - libcephfs-devel - librados2 diff -Nru ceph-12.2.11/qa/run-standalone.sh ceph-12.2.12/qa/run-standalone.sh --- ceph-12.2.11/qa/run-standalone.sh 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/run-standalone.sh 2019-04-11 12:33:50.000000000 +0000 @@ -36,6 +36,8 @@ PATH=$(pwd)/bin:$PATH +export LD_LIBRARY_PATH="$(pwd)/lib" + # TODO: Use getops dryrun=false if [[ "$1" = "--dry-run" ]]; then diff -Nru ceph-12.2.11/qa/standalone/ceph-helpers.sh ceph-12.2.12/qa/standalone/ceph-helpers.sh --- ceph-12.2.11/qa/standalone/ceph-helpers.sh 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/standalone/ceph-helpers.sh 2019-04-11 12:33:50.000000000 +0000 @@ -19,7 +19,7 @@ # TIMEOUT=300 PG_NUM=4 -TMPDIR=${TMPDIR:-/tmp} +TMPDIR=${TMPDIR:-${CEPH_BUILD_DIR}} CEPH_BUILD_VIRTUALENV=${TMPDIR} TESTDIR=${TESTDIR:-${TMPDIR}} @@ -389,6 +389,17 @@ teardown $dir || return 1 } +# +# return a random TCP port which is not used yet +# +# please note, there could be racing if we use this function for +# a free port, and then try to bind on this port. 
+# +function get_unused_port() { + local ip=127.0.0.1 + python3 -c "import socket; s=socket.socket(); s.bind(('$ip', 0)); print(s.getsockname()[1]); s.close()" +} + ####################################################################### ## @@ -1411,6 +1422,7 @@ # @return 0 if the cluster is clean, 1 otherwise # function wait_for_clean() { + local cmd=$1 local num_active_clean=-1 local cur_active_clean local -a delays=($(get_timeout_delays $TIMEOUT .1)) @@ -1436,6 +1448,8 @@ ceph report return 1 fi + # eval is a no-op if cmd is empty + eval $cmd sleep ${delays[$loop]} loop+=1 done diff -Nru ceph-12.2.11/qa/standalone/osd/osd-backfill-prio.sh ceph-12.2.12/qa/standalone/osd/osd-backfill-prio.sh --- ceph-12.2.11/qa/standalone/osd/osd-backfill-prio.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/standalone/osd/osd-backfill-prio.sh 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,504 @@ +#!/usr/bin/env bash +# +# Copyright (C) 2019 Red Hat +# +# Author: David Zafman +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + # Fix port???? 
+ export CEPH_MON="127.0.0.1:7114" # git grep '\<7114\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON --osd_max_backfills=1 --debug_reserver=20 " + CEPH_ARGS+="--osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10 " + export objects=50 + export poolprefix=test + export FORCE_PRIO="254" # See OSD_BACKFILL_PRIORITY_FORCED + export DEGRADED_PRIO="140" # See OSD_BACKFILL_DEGRADED_PRIORITY_BASE + export NORMAL_PRIO="100" # See OSD_BACKFILL_PRIORITY_BASE + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + + +function TEST_backfill_priority() { + local dir=$1 + local pools=10 + local OSDS=5 + # size 2 -> 1 means degraded by 1, so add 1 to base prio + local degraded_prio=$(expr $DEGRADED_PRIO + 1) + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + export CEPH_ARGS + + for osd in $(seq 0 $(expr $OSDS - 1)) + do + run_osd $dir $osd || return 1 + done + + for p in $(seq 1 $pools) + do + create_pool "${poolprefix}$p" 1 1 + ceph osd pool set "${poolprefix}$p" size 2 + done + sleep 5 + + wait_for_clean || return 1 + + ceph pg dump pgs + + # Find 3 pools with a pg with the same primaries but second + # replica on another osd. 
+ local PG1 + local POOLNUM1 + local pool1 + local chk_osd1_1 + local chk_osd1_2 + + local PG2 + local POOLNUM2 + local pool2 + local chk_osd2 + + local PG3 + local POOLNUM3 + local pool3 + + for p in $(seq 1 $pools) + do + ceph pg map ${p}.0 --format=json | jq '.acting[]' > $dir/acting + local test_osd1=$(head -1 $dir/acting) + local test_osd2=$(tail -1 $dir/acting) + if [ -z "$PG1" ]; + then + PG1="${p}.0" + POOLNUM1=$p + pool1="${poolprefix}$p" + chk_osd1_1=$test_osd1 + chk_osd1_2=$test_osd2 + elif [ -z "$PG2" -a $chk_osd1_1 = $test_osd1 -a $chk_osd1_2 != $test_osd2 ]; + then + PG2="${p}.0" + POOLNUM2=$p + pool2="${poolprefix}$p" + chk_osd2=$test_osd2 + elif [ -n "$PG2" -a $chk_osd1_1 = $test_osd1 -a $chk_osd1_2 != $test_osd2 -a "$chk_osd2" != $test_osd2 ]; + then + PG3="${p}.0" + POOLNUM3=$p + pool3="${poolprefix}$p" + break + fi + done + rm -f $dir/acting + + if [ "$pool2" = "" -o "pool3" = "" ]; + then + echo "Failure to find appropirate PGs" + return 1 + fi + + for p in $(seq 1 $pools) + do + if [ $p != $POOLNUM1 -a $p != $POOLNUM2 -a $p != $POOLNUM3 ]; + then + delete_pool ${poolprefix}$p + fi + done + + ceph osd pool set $pool2 size 1 + ceph osd pool set $pool3 size 1 + wait_for_clean || return 1 + + dd if=/dev/urandom of=$dir/data bs=1M count=10 + p=1 + for pname in $pool1 $pool2 $pool3 + do + for i in $(seq 1 $objects) + do + rados -p ${pname} put obj${i}-p${p} $dir/data + done + p=$(expr $p + 1) + done + + local otherosd=$(get_not_primary $pool1 obj1-p1) + + ceph pg dump pgs + ERRORS=0 + + ceph osd set nobackfill + ceph osd set noout + + # Get a pg to want to backfill and quickly force it + # to be preempted. + ceph osd pool set $pool3 size 2 + sleep 2 + + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + + # 3. 
Item is in progress, adjust priority with no higher priority waiting + while(ceph pg force-backfill $PG3 2>&1 | grep -q "doesn't require backfilling") + do + sleep 2 + done + flush_pg_stats || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + + ceph osd out osd.$chk_osd1_2 + sleep 2 + flush_pg_stats || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + ceph pg dump pgs + + ceph osd pool set $pool2 size 2 + sleep 2 + flush_pg_stats || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + ceph pg dump pgs + + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG1}\")).prio") + if [ "$PRIO" != "$NORMAL_PRIO" ]; + then + echo "The normal PG ${PG1} doesn't have prio $NORMAL_PRIO queued waiting" + ERRORS=$(expr $ERRORS + 1) + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG3} ]; + then + echo "The force-backfill PG $PG3 didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $FORCE_PRIO ]; + then + echo "The force-backfill PG ${PG3} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # 1. 
Item is queued, re-queue with new priority + while(ceph pg force-backfill $PG2 2>&1 | grep -q "doesn't require backfilling") + do + sleep 2 + done + sleep 2 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG2}\")).prio") + if [ "$PRIO" != "$FORCE_PRIO" ]; + then + echo "The second force-backfill PG ${PG2} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + flush_pg_stats || return 1 + + # 4. Item is in progress, if higher priority items waiting prempt item + ceph pg cancel-force-backfill $PG3 || return 1 + sleep 2 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG3}\")).prio") + if [ "$PRIO" != "$degraded_prio" ]; + then + echo "After cancel-force-backfill PG ${PG3} doesn't have prio $degraded_prio" + ERRORS=$(expr $ERRORS + 1) + fi + + eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG2} ]; + then + echo "The force-recovery PG $PG2 didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $FORCE_PRIO ]; + then + echo "The first force-recovery PG ${PG2} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + ceph pg cancel-force-backfill $PG2 || return 1 + sleep 5 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + + # 2. 
Item is queued, re-queue and preempt because new priority higher than an in progress item + flush_pg_stats || return 1 + ceph pg force-backfill $PG3 || return 1 + sleep 2 + + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG2}\")).prio") + if [ "$PRIO" != "$degraded_prio" ]; + then + echo "After cancel-force-backfill PG ${PG2} doesn't have prio $degraded_prio" + ERRORS=$(expr $ERRORS + 1) + fi + + eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG3} ]; + then + echo "The force-backfill PG $PG3 didn't get promoted to an in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $FORCE_PRIO ]; + then + echo "The force-backfill PG ${PG2} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + ceph osd unset noout + ceph osd unset nobackfill + + wait_for_clean "CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations" || return 1 + + ceph pg dump pgs + + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_pgstate_history + + if [ $ERRORS != "0" ]; + then + echo "$ERRORS error(s) found" + else + echo TEST PASSED + fi + + delete_pool $pool1 + delete_pool $pool2 + delete_pool $pool3 + kill_daemons $dir || return 1 + return $ERRORS +} + +# +# Show that pool recovery_priority is added to the backfill priority +# +# Create 2 pools with 2 OSDs with different primarys +# pool 1 with recovery_priority 1 +# pool 2 with recovery_priority 2 +# +# Start backfill by changing the pool sizes from 1 to 2 +# Use dump_reservations to verify priorities +function TEST_backfill_pool_priority() { + local dir=$1 + local pools=3 # Don't assume the first 2 pools are exact what we want + local OSDS=2 + + run_mon $dir a || return 1 + run_mgr 
$dir x || return 1 + export CEPH_ARGS + + for osd in $(seq 0 $(expr $OSDS - 1)) + do + run_osd $dir $osd || return 1 + done + + for p in $(seq 1 $pools) + do + create_pool "${poolprefix}$p" 1 1 + ceph osd pool set "${poolprefix}$p" size 2 + done + sleep 5 + + wait_for_clean || return 1 + + ceph pg dump pgs + + # Find 2 pools with different primaries which + # means the replica must be on another osd. + local PG1 + local POOLNUM1 + local pool1 + local chk_osd1_1 + local chk_osd1_2 + + local PG2 + local POOLNUM2 + local pool2 + local chk_osd2_1 + local chk_osd2_2 + + for p in $(seq 1 $pools) + do + ceph pg map ${p}.0 --format=json | jq '.acting[]' > $dir/acting + local test_osd1=$(head -1 $dir/acting) + local test_osd2=$(tail -1 $dir/acting) + if [ -z "$PG1" ]; + then + PG1="${p}.0" + POOLNUM1=$p + pool1="${poolprefix}$p" + chk_osd1_1=$test_osd1 + chk_osd1_2=$test_osd2 + elif [ $chk_osd1_1 != $test_osd1 ]; + then + PG2="${p}.0" + POOLNUM2=$p + pool2="${poolprefix}$p" + chk_osd2_1=$test_osd1 + chk_osd2_2=$test_osd2 + break + fi + done + rm -f $dir/acting + + if [ "$pool2" = "" ]; + then + echo "Failure to find appropirate PGs" + return 1 + fi + + for p in $(seq 1 $pools) + do + if [ $p != $POOLNUM1 -a $p != $POOLNUM2 ]; + then + delete_pool ${poolprefix}$p + fi + done + + pool1_extra_prio=1 + pool2_extra_prio=2 + # size 2 -> 1 means degraded by 1, so add 1 to base prio + pool1_prio=$(expr $DEGRADED_PRIO + 1 + $pool1_extra_prio) + pool2_prio=$(expr $DEGRADED_PRIO + 1 + $pool2_extra_prio) + + ceph osd pool set $pool1 size 1 + ceph osd pool set $pool1 recovery_priority $pool1_extra_prio + ceph osd pool set $pool2 size 1 + ceph osd pool set $pool2 recovery_priority $pool2_extra_prio + wait_for_clean || return 1 + + dd if=/dev/urandom of=$dir/data bs=1M count=10 + p=1 + for pname in $pool1 $pool2 + do + for i in $(seq 1 $objects) + do + rados -p ${pname} put obj${i}-p${p} $dir/data + done + p=$(expr $p + 1) + done + + local otherosd=$(get_not_primary $pool1 obj1-p1) + + 
ceph pg dump pgs + ERRORS=0 + + ceph osd pool set $pool1 size 2 + ceph osd pool set $pool2 size 2 + sleep 5 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/dump.${chk_osd1_1}.out + echo osd.${chk_osd1_1} + cat $dir/dump.${chk_osd1_1}.out + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_2}) dump_reservations > $dir/dump.${chk_osd1_2}.out + echo osd.${chk_osd1_2} + cat $dir/dump.${chk_osd1_2}.out + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd1_1}.out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG1} ]; + then + echo "The primary PG ${PG1} didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd1_1}.out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool1_prio ]; + then + echo "The primary PG ${PG1} doesn't have prio $pool1_prio" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd1_2}.out | jq '.remote_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG1} ]; + then + echo "The primary PG ${PG1} didn't become the in progress item on remote" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd1_2}.out | jq '.remote_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool1_prio ]; + then + echo "The primary PG ${PG1} doesn't have prio $pool1_prio on remote" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd2_1}.out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG2} ]; + then + echo "The primary PG ${PG2} didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd2_1}.out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool2_prio ]; + then + echo "The primary PG ${PG2} doesn't have prio $pool2_prio" + ERRORS=$(expr 
$ERRORS + 1) + fi + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd2_2}.out | jq '.remote_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG2} ]; + then + echo "The primary PG $PG2 didn't become the in progress item on remote" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd2_2}.out | jq '.remote_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool2_prio ]; + then + echo "The primary PG ${PG2} doesn't have prio $pool2_prio on remote" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + wait_for_clean || return 1 + + if [ $ERRORS != "0" ]; + then + echo "$ERRORS error(s) found" + else + echo TEST PASSED + fi + + delete_pool $pool1 + delete_pool $pool2 + kill_daemons $dir || return 1 + return $ERRORS +} + +main osd-backfill-prio "$@" + +# Local Variables: +# compile-command: "make -j4 && ../qa/run-standalone.sh osd-backfill-prio.sh" +# End: diff -Nru ceph-12.2.11/qa/standalone/osd/osd-markdown.sh ceph-12.2.12/qa/standalone/osd/osd-markdown.sh --- ceph-12.2.11/qa/standalone/osd/osd-markdown.sh 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/standalone/osd/osd-markdown.sh 2019-04-11 12:33:50.000000000 +0000 @@ -45,7 +45,10 @@ ceph osd tree ceph osd tree | grep osd.0 |grep up || return 1 # mark the OSD down. - ceph osd down 0 + # override any dup setting in the environment to ensure we do this + # exactly once (modulo messenger failures, at least; we can't *actually* + # provide exactly-once semantics for mon commands). 
+ CEPH_CLI_TEST_DUP_COMMAND=0 ceph osd down 0 sleep $sleeptime done } diff -Nru ceph-12.2.11/qa/standalone/osd/osd-recovery-prio.sh ceph-12.2.12/qa/standalone/osd/osd-recovery-prio.sh --- ceph-12.2.11/qa/standalone/osd/osd-recovery-prio.sh 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/standalone/osd/osd-recovery-prio.sh 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,500 @@ +#!/usr/bin/env bash +# +# Copyright (C) 2019 Red Hat +# +# Author: David Zafman +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + # Fix port???? 
+ export CEPH_MON="127.0.0.1:7114" # git grep '\<7114\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON --osd_max_backfills=1 --debug_reserver=20" + export objects=200 + export poolprefix=test + export FORCE_PRIO="255" # See OSD_RECOVERY_PRIORITY_FORCED + export NORMAL_PRIO="180" # See OSD_RECOVERY_PRIORITY_BASE + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + + +function TEST_recovery_priority() { + local dir=$1 + local pools=10 + local OSDS=5 + + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + export CEPH_ARGS + + for osd in $(seq 0 $(expr $OSDS - 1)) + do + run_osd $dir $osd || return 1 + done + + for p in $(seq 1 $pools) + do + create_pool "${poolprefix}$p" 1 1 + ceph osd pool set "${poolprefix}$p" size 2 + done + sleep 5 + + wait_for_clean || return 1 + + ceph pg dump pgs + + # Find 3 pools with a pg with the same primaries but second + # replica on another osd. 
+ local PG1 + local POOLNUM1 + local pool1 + local chk_osd1_1 + local chk_osd1_2 + + local PG2 + local POOLNUM2 + local pool2 + local chk_osd2 + + local PG3 + local POOLNUM3 + local pool3 + + for p in $(seq 1 $pools) + do + ceph pg map ${p}.0 --format=json | jq '.acting[]' > $dir/acting + local test_osd1=$(head -1 $dir/acting) + local test_osd2=$(tail -1 $dir/acting) + if [ -z "$PG1" ]; + then + PG1="${p}.0" + POOLNUM1=$p + pool1="${poolprefix}$p" + chk_osd1_1=$test_osd1 + chk_osd1_2=$test_osd2 + elif [ -z "$PG2" -a $chk_osd1_1 = $test_osd1 -a $chk_osd1_2 != $test_osd2 ]; + then + PG2="${p}.0" + POOLNUM2=$p + pool2="${poolprefix}$p" + chk_osd2=$test_osd2 + elif [ -n "$PG2" -a $chk_osd1_1 = $test_osd1 -a $chk_osd1_2 != $test_osd2 -a "$chk_osd2" != $test_osd2 ]; + then + PG3="${p}.0" + POOLNUM3=$p + pool3="${poolprefix}$p" + break + fi + done + rm -f $dir/acting + + if [ "$pool2" = "" -o "pool3" = "" ]; + then + echo "Failure to find appropirate PGs" + return 1 + fi + + for p in $(seq 1 $pools) + do + if [ $p != $POOLNUM1 -a $p != $POOLNUM2 -a $p != $POOLNUM3 ]; + then + delete_pool ${poolprefix}$p + fi + done + + ceph osd pool set $pool2 size 1 + ceph osd pool set $pool3 size 1 + wait_for_clean || return 1 + + dd if=/dev/urandom of=$dir/data bs=1M count=10 + p=1 + for pname in $pool1 $pool2 $pool3 + do + for i in $(seq 1 $objects) + do + rados -p ${pname} put obj${i}-p${p} $dir/data + done + p=$(expr $p + 1) + done + + local otherosd=$(get_not_primary $pool1 obj1-p1) + + ceph pg dump pgs + ERRORS=0 + + ceph osd set norecover + ceph osd set noout + + # Get a pg to want to recover and quickly force it + # to be preempted. + ceph osd pool set $pool3 size 2 + sleep 2 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + + # 3. 
Item is in progress, adjust priority with no higher priority waiting + while(ceph pg force-recovery $PG3 2>&1 | grep -q "doesn't require recovery") + do + sleep 2 + done + flush_pg_stats || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + + ceph osd out osd.$chk_osd1_2 + sleep 2 + flush_pg_stats || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + ceph pg dump pgs + + ceph osd pool set $pool2 size 2 + sleep 2 + flush_pg_stats || return 1 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + ceph pg dump pgs + + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG1}\")).prio") + if [ "$PRIO" != "$NORMAL_PRIO" ]; + then + echo "The normal PG ${PG1} doesn't have prio $NORMAL_PRIO queued waiting" + ERRORS=$(expr $ERRORS + 1) + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG3} ]; + then + echo "The first force-recovery PG $PG3 didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $FORCE_PRIO ]; + then + echo "The first force-recovery PG ${PG3} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # 1. 
Item is queued, re-queue with new priority + while(ceph pg force-recovery $PG2 2>&1 | grep -q "doesn't require recovery") + do + sleep 2 + done + sleep 2 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG2}\")).prio") + if [ "$PRIO" != "$FORCE_PRIO" ]; + then + echo "The second force-recovery PG ${PG2} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + flush_pg_stats || return 1 + + # 4. Item is in progress, if higher priority items waiting prempt item + #ceph osd unset norecover + ceph pg cancel-force-recovery $PG3 || return 1 + sleep 2 + #ceph osd set norecover + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG3}\")).prio") + if [ "$PRIO" != "$NORMAL_PRIO" ]; + then + echo "After cancel-recovery PG ${PG3} doesn't have prio $NORMAL_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + + eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG2} ]; + then + echo "The force-recovery PG $PG2 didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $FORCE_PRIO ]; + then + echo "The first force-recovery PG ${PG2} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + ceph pg cancel-force-recovery $PG2 || return 1 + sleep 5 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations || return 1 + + # 2. 
Item is queued, re-queue and preempt because new priority higher than an in progress item + flush_pg_stats || return 1 + ceph pg force-recovery $PG3 || return 1 + sleep 2 + + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/out || return 1 + cat $dir/out + PRIO=$(cat $dir/out | jq "(.local_reservations.queues[].items[] | select(.item == \"${PG2}\")).prio") + if [ "$PRIO" != "$NORMAL_PRIO" ]; + then + echo "After cancel-force-recovery PG ${PG3} doesn't have prio $NORMAL_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + + eval ITEM=$(cat $dir/out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG3} ]; + then + echo "The force-recovery PG $PG3 didn't get promoted to an in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $FORCE_PRIO ]; + then + echo "The force-recovery PG ${PG2} doesn't have prio $FORCE_PRIO" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + ceph osd unset noout + ceph osd unset norecover + + wait_for_clean "CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations" || return 1 + + ceph pg dump pgs + + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_pgstate_history + + if [ $ERRORS != "0" ]; + then + echo "$ERRORS error(s) found" + else + echo TEST PASSED + fi + + delete_pool $pool1 + delete_pool $pool2 + delete_pool $pool3 + kill_daemons $dir || return 1 + return $ERRORS +} + +# +# Show that pool recovery_priority is added to recovery priority +# +# Create 2 pools with 2 OSDs with different primarys +# pool 1 with recovery_priority 1 +# pool 2 with recovery_priority 2 +# +# Start recovery by changing the pool sizes from 1 to 2 +# Use dump_reservations to verify priorities +function TEST_recovery_pool_priority() { + local dir=$1 + local pools=3 # Don't assume the first 2 pools are exact what we want + local OSDS=2 + + run_mon $dir a || return 1 + run_mgr $dir x || 
return 1 + export CEPH_ARGS + + for osd in $(seq 0 $(expr $OSDS - 1)) + do + run_osd $dir $osd || return 1 + done + + for p in $(seq 1 $pools) + do + create_pool "${poolprefix}$p" 1 1 + ceph osd pool set "${poolprefix}$p" size 2 + done + sleep 5 + + wait_for_clean || return 1 + + ceph pg dump pgs + + # Find 2 pools with different primaries which + # means the replica must be on another osd. + local PG1 + local POOLNUM1 + local pool1 + local chk_osd1_1 + local chk_osd1_2 + + local PG2 + local POOLNUM2 + local pool2 + local chk_osd2_1 + local chk_osd2_2 + + for p in $(seq 1 $pools) + do + ceph pg map ${p}.0 --format=json | jq '.acting[]' > $dir/acting + local test_osd1=$(head -1 $dir/acting) + local test_osd2=$(tail -1 $dir/acting) + if [ -z "$PG1" ]; + then + PG1="${p}.0" + POOLNUM1=$p + pool1="${poolprefix}$p" + chk_osd1_1=$test_osd1 + chk_osd1_2=$test_osd2 + elif [ $chk_osd1_1 != $test_osd1 ]; + then + PG2="${p}.0" + POOLNUM2=$p + pool2="${poolprefix}$p" + chk_osd2_1=$test_osd1 + chk_osd2_2=$test_osd2 + break + fi + done + rm -f $dir/acting + + if [ "$pool2" = "" ]; + then + echo "Failure to find appropirate PGs" + return 1 + fi + + for p in $(seq 1 $pools) + do + if [ $p != $POOLNUM1 -a $p != $POOLNUM2 ]; + then + delete_pool ${poolprefix}$p + fi + done + + pool1_extra_prio=1 + pool2_extra_prio=2 + pool1_prio=$(expr $NORMAL_PRIO + $pool1_extra_prio) + pool2_prio=$(expr $NORMAL_PRIO + $pool2_extra_prio) + + ceph osd pool set $pool1 size 1 + ceph osd pool set $pool1 recovery_priority $pool1_extra_prio + ceph osd pool set $pool2 size 1 + ceph osd pool set $pool2 recovery_priority $pool2_extra_prio + wait_for_clean || return 1 + + dd if=/dev/urandom of=$dir/data bs=1M count=10 + p=1 + for pname in $pool1 $pool2 + do + for i in $(seq 1 $objects) + do + rados -p ${pname} put obj${i}-p${p} $dir/data + done + p=$(expr $p + 1) + done + + local otherosd=$(get_not_primary $pool1 obj1-p1) + + ceph pg dump pgs + ERRORS=0 + + ceph osd pool set $pool1 size 2 + ceph osd pool set 
$pool2 size 2 + sleep 10 + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_1}) dump_reservations > $dir/dump.${chk_osd1_1}.out + echo osd.${chk_osd1_1} + cat $dir/dump.${chk_osd1_1}.out + CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${chk_osd1_2}) dump_reservations > $dir/dump.${chk_osd1_2}.out + echo osd.${chk_osd1_2} + cat $dir/dump.${chk_osd1_2}.out + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd1_1}.out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG1} ]; + then + echo "The primary PG for $pool1 didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd1_1}.out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool1_prio ]; + then + echo "The primary PG ${PG1} doesn't have prio $pool1_prio" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd1_2}.out | jq '.remote_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG1} ]; + then + echo "The primary PG for $pool1 didn't become the in progress item on remote" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd1_2}.out | jq '.remote_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool1_prio ]; + then + echo "The primary PG ${PG1} doesn't have prio $pool1_prio on remote" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # Using eval will strip double-quotes from item + eval ITEM=$(cat $dir/dump.${chk_osd2_1}.out | jq '.local_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG2} ]; + then + echo "The primary PG for $pool2 didn't become the in progress item" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd2_1}.out | jq '.local_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool2_prio ]; + then + echo "The primary PG ${PG2} doesn't have prio $pool2_prio" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + # Using eval will strip double-quotes from 
item + eval ITEM=$(cat $dir/dump.${chk_osd2_2}.out | jq '.remote_reservations.in_progress[0].item') + if [ "$ITEM" != ${PG2} ]; + then + echo "The primary PG $PG2 didn't become the in progress item on remote" + ERRORS=$(expr $ERRORS + 1) + else + PRIO=$(cat $dir/dump.${chk_osd2_2}.out | jq '.remote_reservations.in_progress[0].prio') + if [ "$PRIO" != $pool2_prio ]; + then + echo "The primary PG ${PG2} doesn't have prio $pool2_prio on remote" + ERRORS=$(expr $ERRORS + 1) + fi + fi + + wait_for_clean || return 1 + + if [ $ERRORS != "0" ]; + then + echo "$ERRORS error(s) found" + else + echo TEST PASSED + fi + + delete_pool $pool1 + delete_pool $pool2 + kill_daemons $dir || return 1 + return $ERRORS +} + +main osd-recovery-prio "$@" + +# Local Variables: +# compile-command: "make -j4 && ../qa/run-standalone.sh osd-recovery-prio.sh" +# End: diff -Nru ceph-12.2.11/qa/standalone/scrub/osd-scrub-repair.sh ceph-12.2.12/qa/standalone/scrub/osd-scrub-repair.sh --- ceph-12.2.11/qa/standalone/scrub/osd-scrub-repair.sh 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/standalone/scrub/osd-scrub-repair.sh 2019-04-11 12:33:50.000000000 +0000 @@ -194,7 +194,7 @@ local pool_name=$1 local allow_overwrites=$2 - ceph osd erasure-code-profile set myprofile crush-failure-domain=osd $3 $4 $5 $6 $7 || return 1 + ceph osd erasure-code-profile set myprofile crush-failure-domain=osd "$@" || return 1 create_pool "$poolname" 1 1 erasure myprofile || return 1 @@ -5245,7 +5245,7 @@ # Can't upgrade with this set ceph osd set nodeep-scrub # Let map change propagate to OSDs - flush pg_stats + flush_pg_stats sleep 5 # Fake a schedule scrub @@ -5274,6 +5274,91 @@ rados list-inconsistent-obj $pg | jq '.' 
| grep -qv $objname || return 1 } +function TEST_scrub_warning() { + local dir=$1 + local poolname=psr_pool + local objname=POBJ + local scrubs=5 + local deep_scrubs=5 + local i1_day=86400 + local i7_days=$(calc $i1_day \* 7) + local i14_days=$(calc $i1_day \* 14) + local overdue=$i1_day + local conf_overdue_seconds=$(calc $i7_days + $overdue ) + local pool_overdue_seconds=$(calc $i14_days + $overdue ) + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_mgr $dir x --mon_warn_not_scrubbed=${overdue} --mon_warn_not_deep_scrubbed=${overdue} || return 1 + run_osd $dir 0 $ceph_osd_args --osd_scrub_backoff_ratio=0 || return 1 + + for i in $(seq 1 $(expr $scrubs + $deep_scrubs)) + do + create_pool $poolname-$i 1 1 || return 1 + wait_for_clean || return 1 + if [ $i = "1" ]; + then + ceph osd pool set $poolname-$i scrub_max_interval $i14_days + fi + if [ $i = $(expr $scrubs + 1) ]; + then + ceph osd pool set $poolname-$i deep_scrub_interval $i14_days + fi + done + + # Only 1 osd + local primary=0 + + ceph osd set noscrub || return 1 + ceph osd set nodeep-scrub || return 1 + ceph config set global osd_scrub_interval_randomize_ratio 0 + ceph config set global osd_deep_scrub_randomize_ratio 0 + ceph config set global osd_scrub_max_interval ${i7_days} + ceph config set global osd_deep_scrub_interval ${i7_days} + + # Fake schedule scrubs + for i in $(seq 1 $scrubs) + do + if [ $i = "1" ]; + then + overdue_seconds=$pool_overdue_seconds + else + overdue_seconds=$conf_overdue_seconds + fi + CEPH_ARGS='' ceph daemon $(get_asok_path osd.${primary}) \ + trigger_scrub ${i}.0 $(expr ${overdue_seconds} + ${i}00) || return 1 + done + # Fake schedule deep scrubs + for i in $(seq $(expr $scrubs + 1) $(expr $scrubs + $deep_scrubs)) + do + if [ $i = "$(expr $scrubs + 1)" ]; + then + overdue_seconds=$pool_overdue_seconds + else + overdue_seconds=$conf_overdue_seconds + fi + CEPH_ARGS='' ceph daemon $(get_asok_path osd.${primary}) \ + trigger_deep_scrub 
${i}.0 $(expr ${overdue_seconds} + ${i}00) || return 1 + done + flush_pg_stats + + ceph health + ceph health detail + ceph health | grep -q "$deep_scrubs pgs not deep-scrubbed in time" || return 1 + ceph health | grep -q "$scrubs pgs not scrubbed in time" || return 1 + COUNT=$(ceph health detail | grep "not scrubbed since" | wc -l) + if [ "$COUNT" != $scrubs ]; then + ceph health detail | grep "not scrubbed since" + return 1 + fi + COUNT=$(ceph health detail | grep "not deep-scrubbed since" | wc -l) + if [ "$COUNT" != $deep_scrubs ]; then + ceph health detail | grep "not deep-scrubbed since" + return 1 + fi + return 0 +} + # # Corrupt snapset in replicated pool # diff -Nru ceph-12.2.11/qa/standalone/special/ceph_objectstore_tool.py ceph-12.2.12/qa/standalone/special/ceph_objectstore_tool.py --- ceph-12.2.11/qa/standalone/special/ceph_objectstore_tool.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/standalone/special/ceph_objectstore_tool.py 2019-04-11 12:33:50.000000000 +0000 @@ -686,8 +686,8 @@ EC_NAME = "ECobject" if len(argv) > 0 and argv[0] == 'large': PG_COUNT = 12 - NUM_REP_OBJECTS = 800 - NUM_CLONED_REP_OBJECTS = 100 + NUM_REP_OBJECTS = 200 + NUM_CLONED_REP_OBJECTS = 50 NUM_EC_OBJECTS = 12 NUM_NSPACES = 4 # Larger data sets for first object per namespace @@ -1470,7 +1470,7 @@ for basename in db[nspace].keys(): file = os.path.join(DATADIR, nspace + "-" + basename + "__head") JSON = db[nspace][basename]['json'] - GETNAME = "/tmp/getbytes.{pid}".format(pid=pid) + jsondict = json.loads(JSON) for pg in OBJREPPGS: OSDS = get_osds(pg, OSDDIR) for osd in OSDS: @@ -1481,12 +1481,33 @@ continue if int(basename.split(REP_NAME)[1]) > int(NUM_CLONED_REP_OBJECTS): continue + logging.debug("REPobject " + JSON) cmd = (CFSD_PREFIX + " '{json}' dump | grep '\"snap\": 1,' > /dev/null").format(osd=osd, json=JSON) logging.debug(cmd) ret = call(cmd, shell=True) if ret != 0: logging.error("Invalid dump for {json}".format(json=JSON)) ERRORS += 1 + if 'shard_id' in 
jsondict[1]: + logging.debug("ECobject " + JSON) + for pg in OBJECPGS: + OSDS = get_osds(pg, OSDDIR) + jsondict = json.loads(JSON) + for osd in OSDS: + DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg)))) + fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f)) + and f.split("_")[0] == basename and f.split("_")[4] == nspace] + if not fnames: + continue + if int(basename.split(EC_NAME)[1]) > int(NUM_EC_OBJECTS): + continue + # Fix shard_id since we only have one json instance for each object + jsondict[1]['shard_id'] = int(pg.split('s')[1]) + cmd = (CFSD_PREFIX + " '{json}' dump | grep '\"hinfo\": [{{]' > /dev/null").format(osd=osd, json=json.dumps((pg, jsondict[1]))) + logging.debug(cmd) + ret = call(cmd, shell=True) + if ret != 0: + logging.error("Invalid dump for {json}".format(json=JSON)) print("Test list-attrs get-attr") ATTRFILE = r"/tmp/attrs.{pid}".format(pid=pid) @@ -1497,16 +1518,16 @@ JSON = db[nspace][basename]['json'] jsondict = json.loads(JSON) - if 'shard_id' in jsondict: + if 'shard_id' in jsondict[1]: logging.debug("ECobject " + JSON) found = 0 for pg in OBJECPGS: OSDS = get_osds(pg, OSDDIR) # Fix shard_id since we only have one json instance for each object - jsondict['shard_id'] = int(pg.split('s')[1]) - JSON = json.dumps(jsondict) + jsondict[1]['shard_id'] = int(pg.split('s')[1]) + JSON = json.dumps((pg, jsondict[1])) for osd in OSDS: - cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-attr hinfo_key").format(osd=osd, pg=pg, json=JSON) + cmd = (CFSD_PREFIX + " '{json}' get-attr hinfo_key").format(osd=osd, json=JSON) logging.debug("TRY: " + cmd) try: out = check_output(cmd, shell=True, stderr=subprocess.STDOUT) @@ -1522,12 +1543,12 @@ for pg in ALLPGS: # Make sure rep obj with rep pg or ec obj with ec pg - if ('shard_id' in jsondict) != (pg.find('s') > 0): + if ('shard_id' in jsondict[1]) != (pg.find('s') > 0): continue - if 'shard_id' in jsondict: + if 'shard_id' in jsondict[1]: # 
Fix shard_id since we only have one json instance for each object - jsondict['shard_id'] = int(pg.split('s')[1]) - JSON = json.dumps(jsondict) + jsondict[1]['shard_id'] = int(pg.split('s')[1]) + JSON = json.dumps((pg, jsondict[1])) OSDS = get_osds(pg, OSDDIR) for osd in OSDS: DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg)))) @@ -1536,7 +1557,7 @@ if not fnames: continue afd = open(ATTRFILE, "wb") - cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' list-attrs").format(osd=osd, pg=pg, json=JSON) + cmd = (CFSD_PREFIX + " '{json}' list-attrs").format(osd=osd, json=JSON) logging.debug(cmd) ret = call(cmd, shell=True, stdout=afd) afd.close() @@ -1556,7 +1577,7 @@ continue exp = values.pop(key) vfd = open(VALFILE, "wb") - cmd = (CFSD_PREFIX + "--pgid {pg} '{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key="_" + key) + cmd = (CFSD_PREFIX + " '{json}' get-attr {key}").format(osd=osd, json=JSON, key="_" + key) logging.debug(cmd) ret = call(cmd, shell=True, stdout=vfd) vfd.close() diff -Nru ceph-12.2.11/qa/suites/big/rados-thrash/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/big/rados-thrash/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/big/rados-thrash/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/big/rados-thrash/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/big/rados-thrash/objectstore/bluestore-stupid.yaml 
ceph-12.2.12/qa/suites/big/rados-thrash/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/big/rados-thrash/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/big/rados-thrash/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/big/rados-thrash/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/big/rados-thrash/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/big/rados-thrash/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/big/rados-thrash/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true 
- # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/ceph-deploy/basic/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/ceph-deploy/basic/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/ceph-deploy/basic/objectstore/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/ceph-deploy/basic/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + 
bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/ceph-deploy/basic/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/ceph-deploy/basic/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/ceph-deploy/basic/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/ceph-deploy/basic/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/fs/32bits/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/fs/32bits/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/fs/32bits/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/32bits/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 
12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/fs/32bits/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/fs/32bits/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/fs/32bits/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/32bits/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full 
ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/fs/32bits/tasks/cfuse_workunit_suites_fsstress.yaml ceph-12.2.12/qa/suites/fs/32bits/tasks/cfuse_workunit_suites_fsstress.yaml --- ceph-12.2.11/qa/suites/fs/32bits/tasks/cfuse_workunit_suites_fsstress.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/32bits/tasks/cfuse_workunit_suites_fsstress.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -1,5 +1,6 @@ tasks: - workunit: + timeout: 6h clients: all: - suites/fsstress.sh diff -Nru ceph-12.2.11/qa/suites/fs/basic_functional/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/fs/basic_functional/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/fs/basic_functional/objectstore/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/basic_functional/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic 
across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/fs/basic_functional/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/fs/basic_functional/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/fs/basic_functional/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/basic_functional/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru 
ceph-12.2.11/qa/suites/fs/basic_functional/tasks/libcephfs_java.yaml ceph-12.2.12/qa/suites/fs/basic_functional/tasks/libcephfs_java.yaml --- ceph-12.2.11/qa/suites/fs/basic_functional/tasks/libcephfs_java.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/basic_functional/tasks/libcephfs_java.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,14 +0,0 @@ - -os_type: ubuntu -os_version: "14.04" - -overrides: - ceph-fuse: - disabled: true - kclient: - disabled: true -tasks: -- workunit: - clients: - client.0: - - libcephfs-java/test.sh diff -Nru ceph-12.2.11/qa/suites/fs/basic_workload/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/fs/basic_workload/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/fs/basic_workload/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/basic_workload/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + 
mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/fs/basic_workload/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/fs/basic_workload/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/fs/basic_workload/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/basic_workload/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/fs/basic_workload/tasks/cfuse_workunit_suites_fsstress.yaml ceph-12.2.12/qa/suites/fs/basic_workload/tasks/cfuse_workunit_suites_fsstress.yaml --- ceph-12.2.11/qa/suites/fs/basic_workload/tasks/cfuse_workunit_suites_fsstress.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/basic_workload/tasks/cfuse_workunit_suites_fsstress.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -1,5 +1,6 @@ tasks: - workunit: + timeout: 6h clients: all: - 
suites/fsstress.sh diff -Nru ceph-12.2.11/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/bugs/client_trim_caps/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 
@@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml ceph-12.2.12/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml --- ceph-12.2.11/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -10,7 +10,6 @@ tasks: - exec: mon.a: - - "ceph tell mds.* config set mds_max_ratio_caps_per_client 1" - "ceph tell mds.* config set mds_min_caps_per_client 1" - background_exec: mon.a: diff -Nru ceph-12.2.11/qa/suites/fs/multiclient/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/fs/multiclient/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/fs/multiclient/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/multiclient/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + 
bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/fs/multiclient/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/fs/multiclient/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/fs/multiclient/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/multiclient/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the 
log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/fs/multifs/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/fs/multifs/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/fs/multifs/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/multifs/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + 
bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/fs/multifs/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/fs/multifs/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/fs/multifs/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/multifs/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/fs/permission/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/fs/permission/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/fs/permission/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/permission/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + 
debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/fs/permission/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/fs/permission/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/fs/permission/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/permission/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore 
block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/fs/snaps/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/fs/snaps/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/fs/snaps/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/snaps/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/fs/snaps/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/fs/snaps/objectstore-ec/bluestore.yaml --- 
ceph-12.2.11/qa/suites/fs/snaps/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/snaps/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/fs/thrash/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/fs/thrash/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/fs/thrash/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/thrash/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full 
ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/fs/thrash/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/fs/thrash/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/fs/thrash/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/thrash/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd 
full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/fs/thrash/tasks/cfuse_workunit_suites_fsstress.yaml ceph-12.2.12/qa/suites/fs/thrash/tasks/cfuse_workunit_suites_fsstress.yaml --- ceph-12.2.11/qa/suites/fs/thrash/tasks/cfuse_workunit_suites_fsstress.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/thrash/tasks/cfuse_workunit_suites_fsstress.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -1,5 +1,6 @@ tasks: - workunit: + timeout: 6h clients: all: - suites/fsstress.sh diff -Nru ceph-12.2.11/qa/suites/fs/traceless/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/fs/traceless/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/fs/traceless/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/traceless/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd 
backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/fs/traceless/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/fs/traceless/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/fs/traceless/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/traceless/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/fs/traceless/tasks/cfuse_workunit_suites_fsstress.yaml ceph-12.2.12/qa/suites/fs/traceless/tasks/cfuse_workunit_suites_fsstress.yaml --- ceph-12.2.11/qa/suites/fs/traceless/tasks/cfuse_workunit_suites_fsstress.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/traceless/tasks/cfuse_workunit_suites_fsstress.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -1,5 +1,6 @@ tasks: - workunit: + timeout: 6h clients: all: - suites/fsstress.sh 
diff -Nru ceph-12.2.11/qa/suites/fs/verify/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/fs/verify/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/fs/verify/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/verify/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/fs/verify/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/fs/verify/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/fs/verify/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/verify/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - 
osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/fs/verify/tasks/cfuse_workunit_suites_fsstress.yaml ceph-12.2.12/qa/suites/fs/verify/tasks/cfuse_workunit_suites_fsstress.yaml --- ceph-12.2.11/qa/suites/fs/verify/tasks/cfuse_workunit_suites_fsstress.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/verify/tasks/cfuse_workunit_suites_fsstress.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -1,5 +1,6 @@ tasks: - workunit: + timeout: 6h clients: all: - suites/fsstress.sh diff -Nru ceph-12.2.11/qa/suites/fs/verify/validater/valgrind.yaml ceph-12.2.12/qa/suites/fs/verify/validater/valgrind.yaml --- ceph-12.2.11/qa/suites/fs/verify/validater/valgrind.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/fs/verify/validater/valgrind.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -1,21 +1,20 @@ # see http://tracker.ceph.com/issues/20360 and http://tracker.ceph.com/issues/18126 os_type: centos -# Valgrind makes everything slow, so ignore slow requests -overrides: - ceph: - log-whitelist: - - slow requests are blocked - overrides: install: ceph: flavor: notcmalloc debuginfo: true ceph: + # 
Valgrind makes everything slow, so ignore slow requests and extend heartbeat grace + log-whitelist: + - slow requests are blocked conf: global: osd heartbeat grace: 40 + mds: + mds heartbeat grace: 60 mon: mon osd crush smoke test: false valgrind: diff -Nru ceph-12.2.11/qa/suites/kcephfs/cephfs/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/kcephfs/cephfs/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/kcephfs/cephfs/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/kcephfs/cephfs/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/kcephfs/cephfs/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/kcephfs/cephfs/objectstore-ec/bluestore.yaml --- 
ceph-12.2.11/qa/suites/kcephfs/cephfs/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/kcephfs/cephfs/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/kcephfs/mixed-clients/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/kcephfs/mixed-clients/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/kcephfs/mixed-clients/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/kcephfs/mixed-clients/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios 
since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/kcephfs/mixed-clients/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/kcephfs/mixed-clients/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/kcephfs/mixed-clients/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/kcephfs/mixed-clients/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - 
bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/kcephfs/recovery/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/kcephfs/recovery/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/kcephfs/recovery/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/kcephfs/recovery/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/kcephfs/recovery/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/kcephfs/recovery/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/kcephfs/recovery/objectstore-ec/bluestore.yaml 
2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/kcephfs/recovery/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/kcephfs/thrash/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/kcephfs/thrash/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/kcephfs/thrash/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/kcephfs/thrash/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd 
backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/kcephfs/thrash/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/kcephfs/thrash/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/kcephfs/thrash/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/kcephfs/thrash/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd 
full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/mixed-clients/basic/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/mixed-clients/basic/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/mixed-clients/basic/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/mixed-clients/basic/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/mixed-clients/basic/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/mixed-clients/basic/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/mixed-clients/basic/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/mixed-clients/basic/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs 
+ bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/mixed-clients/basic/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/mixed-clients/basic/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/mixed-clients/basic/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/mixed-clients/basic/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/multimds/basic/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/multimds/basic/objectstore-ec/bluestore-bitmap.yaml --- 
ceph-12.2.11/qa/suites/multimds/basic/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/multimds/basic/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/multimds/basic/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/multimds/basic/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/multimds/basic/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/multimds/basic/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug 
rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/multimds/basic/tasks/cfuse_workunit_suites_fsstress.yaml ceph-12.2.12/qa/suites/multimds/basic/tasks/cfuse_workunit_suites_fsstress.yaml --- ceph-12.2.11/qa/suites/multimds/basic/tasks/cfuse_workunit_suites_fsstress.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/multimds/basic/tasks/cfuse_workunit_suites_fsstress.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -1,5 +1,6 @@ tasks: - workunit: + timeout: 6h clients: all: - suites/fsstress.sh diff -Nru ceph-12.2.11/qa/suites/multimds/thrash/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/multimds/thrash/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/multimds/thrash/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/multimds/thrash/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full 
ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/multimds/thrash/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/multimds/thrash/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/multimds/thrash/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/multimds/thrash/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on 
mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/multimds/thrash/tasks/cfuse_workunit_suites_fsstress.yaml ceph-12.2.12/qa/suites/multimds/thrash/tasks/cfuse_workunit_suites_fsstress.yaml --- ceph-12.2.11/qa/suites/multimds/thrash/tasks/cfuse_workunit_suites_fsstress.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/multimds/thrash/tasks/cfuse_workunit_suites_fsstress.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -1,5 +1,6 @@ tasks: - workunit: + timeout: 6h clients: all: - suites/fsstress.sh diff -Nru ceph-12.2.11/qa/suites/multimds/verify/objectstore-ec/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/multimds/verify/objectstore-ec/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/multimds/verify/objectstore-ec/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/multimds/verify/objectstore-ec/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on 
mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/multimds/verify/objectstore-ec/bluestore.yaml ceph-12.2.12/qa/suites/multimds/verify/objectstore-ec/bluestore.yaml --- ceph-12.2.11/qa/suites/multimds/verify/objectstore-ec/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/multimds/verify/objectstore-ec/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/multimds/verify/tasks/cfuse_workunit_suites_fsstress.yaml ceph-12.2.12/qa/suites/multimds/verify/tasks/cfuse_workunit_suites_fsstress.yaml --- ceph-12.2.11/qa/suites/multimds/verify/tasks/cfuse_workunit_suites_fsstress.yaml 2019-01-30 15:51:26.000000000 +0000 +++ 
ceph-12.2.12/qa/suites/multimds/verify/tasks/cfuse_workunit_suites_fsstress.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -1,5 +1,6 @@ tasks: - workunit: + timeout: 6h clients: all: - suites/fsstress.sh diff -Nru ceph-12.2.11/qa/suites/multimds/verify/validater/valgrind.yaml ceph-12.2.12/qa/suites/multimds/verify/validater/valgrind.yaml --- ceph-12.2.11/qa/suites/multimds/verify/validater/valgrind.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/multimds/verify/validater/valgrind.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -1,21 +1,20 @@ # see http://tracker.ceph.com/issues/20360 and http://tracker.ceph.com/issues/18126 os_type: centos -# Valgrind makes everything slow, so ignore slow requests -overrides: - ceph: - log-whitelist: - - slow requests are blocked - overrides: install: ceph: flavor: notcmalloc debuginfo: true ceph: + # Valgrind makes everything slow, so ignore slow requests and extend heartbeat grace + log-whitelist: + - slow requests are blocked conf: global: osd heartbeat grace: 40 + mds: + mds heartbeat grace: 60 mon: mon osd crush smoke test: false valgrind: diff -Nru ceph-12.2.11/qa/suites/powercycle/osd/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/powercycle/osd/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/powercycle/osd/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/powercycle/osd/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/powercycle/osd/objectstore/bluestore-stupid.yaml 
ceph-12.2.12/qa/suites/powercycle/osd/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/powercycle/osd/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/powercycle/osd/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/powercycle/osd/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/powercycle/osd/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/powercycle/osd/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/powercycle/osd/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the 
full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/powercycle/osd/whitelist_health.yaml ceph-12.2.12/qa/suites/powercycle/osd/whitelist_health.yaml --- ceph-12.2.11/qa/suites/powercycle/osd/whitelist_health.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/powercycle/osd/whitelist_health.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -3,4 +3,5 @@ log-whitelist: - \(MDS_TRIM\) - \(MDS_SLOW_REQUEST\) + - MDS_SLOW_METADATA_IO - Behind on trimming diff -Nru ceph-12.2.11/qa/suites/rados/basic/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/basic/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/basic/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/basic/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru 
ceph-12.2.11/qa/suites/rados/basic/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rados/basic/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rados/basic/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/basic/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rados/basic/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/basic/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/basic/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/basic/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 
10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rados/basic-luminous/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/basic-luminous/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/basic-luminous/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/basic-luminous/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rados/basic-luminous/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rados/basic-luminous/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rados/basic-luminous/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/basic-luminous/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 
+1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rados/basic-luminous/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/basic-luminous/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/basic-luminous/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/basic-luminous/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore 
bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rados/mgr/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/mgr/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/mgr/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/mgr/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rados/mgr/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rados/mgr/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rados/mgr/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/mgr/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd 
backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rados/mgr/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/mgr/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/mgr/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/mgr/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru 
ceph-12.2.11/qa/suites/rados/monthrash/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/monthrash/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/monthrash/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/monthrash/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rados/monthrash/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rados/monthrash/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rados/monthrash/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/monthrash/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug 
rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rados/monthrash/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/monthrash/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/monthrash/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/monthrash/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml ceph-12.2.12/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml --- ceph-12.2.11/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -11,6 +11,7 @@ - 
\(PG_ - \(POOL_APP_NOT_ENABLED\) - \(SMALLER_PGP_NUM\) + - slow request conf: global: debug objecter: 20 diff -Nru ceph-12.2.11/qa/suites/rados/multimon/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/multimon/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/multimon/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/multimon/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rados/multimon/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rados/multimon/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rados/multimon/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/multimon/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd 
objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rados/multimon/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/multimon/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/multimon/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/multimon/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rados/rest/rest_test.yaml ceph-12.2.12/qa/suites/rados/rest/rest_test.yaml --- ceph-12.2.11/qa/suites/rados/rest/rest_test.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/rest/rest_test.yaml 2019-04-11 12:33:50.000000000 +0000 @@ 
-32,6 +32,7 @@ - \(SLOW_OPS\) - \(TOO_FEW_PGS\) - but it is still running + - slow request conf: client.rest0: debug ms: 1 diff -Nru ceph-12.2.11/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml ceph-12.2.12/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml --- ceph-12.2.11/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -22,6 +22,7 @@ - \(PG_ - \(OBJECT_ - \(REQUEST_SLOW\) + - slow request conf: osd: osd min pg log entries: 5 diff -Nru ceph-12.2.11/qa/suites/rados/singleton/all/osd-recovery.yaml ceph-12.2.12/qa/suites/rados/singleton/all/osd-recovery.yaml --- ceph-12.2.11/qa/suites/rados/singleton/all/osd-recovery.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/singleton/all/osd-recovery.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,8 +20,8 @@ - \(OSD_ - \(PG_ - \(OBJECT_DEGRADED\) - - \(SLOW_OPS\) - \(REQUEST_SLOW\) + - slow request conf: osd: osd min pg log entries: 5 diff -Nru ceph-12.2.11/qa/suites/rados/singleton/all/thrash-eio.yaml ceph-12.2.12/qa/suites/rados/singleton/all/thrash-eio.yaml --- ceph-12.2.11/qa/suites/rados/singleton/all/thrash-eio.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/singleton/all/thrash-eio.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -30,6 +30,7 @@ - \(PG_ - \(OBJECT_MISPLACED\) - \(OSD_ + - slow request - thrashosds: op_delay: 30 clean_interval: 120 diff -Nru ceph-12.2.11/qa/suites/rados/singleton/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/singleton/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/singleton/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/singleton/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with 
failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rados/singleton/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rados/singleton/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rados/singleton/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/singleton/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rados/singleton/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/singleton/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/singleton/objectstore/bluestore.yaml 
2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/singleton/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/singleton-bluestore/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 
+ mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can 
fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rados/thrash/1-pg-log-overrides/short_pg_log.yaml ceph-12.2.12/qa/suites/rados/thrash/1-pg-log-overrides/short_pg_log.yaml --- ceph-12.2.11/qa/suites/rados/thrash/1-pg-log-overrides/short_pg_log.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash/1-pg-log-overrides/short_pg_log.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -2,5 +2,5 @@ ceph: conf: global: - osd_min_pg_log_entries: 300 - osd_max_pg_log_entries: 600 + osd_min_pg_log_entries: 1 + osd_max_pg_log_entries: 2 diff -Nru ceph-12.2.11/qa/suites/rados/thrash/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/thrash/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/thrash/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rados/thrash/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rados/thrash/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rados/thrash/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd 
objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rados/thrash/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/thrash/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/thrash/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 
20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/thrash-erasure-code/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rados/thrash-erasure-code/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd 
nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/thrash-erasure-code/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - 
diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: 
+ osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code-big/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore-bitmap.yaml --- 
ceph-12.2.11/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios 
since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code-isa/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code-overwrites/bluestore-bitmap.yaml 2019-04-11 
12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon 
osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: 
.5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-erasure-code-shec/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - 
bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rados/thrash-luminous/1-pg-log-overrides/short_pg_log.yaml ceph-12.2.12/qa/suites/rados/thrash-luminous/1-pg-log-overrides/short_pg_log.yaml --- ceph-12.2.11/qa/suites/rados/thrash-luminous/1-pg-log-overrides/short_pg_log.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-luminous/1-pg-log-overrides/short_pg_log.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -2,5 +2,5 @@ ceph: conf: global: - osd_min_pg_log_entries: 300 - osd_max_pg_log_entries: 600 + osd_min_pg_log_entries: 1 + osd_max_pg_log_entries: 2 diff -Nru ceph-12.2.11/qa/suites/rados/thrash-luminous/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/thrash-luminous/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/thrash-luminous/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-luminous/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rados/thrash-luminous/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rados/thrash-luminous/objectstore/bluestore-stupid.yaml --- 
ceph-12.2.11/qa/suites/rados/thrash-luminous/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-luminous/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rados/thrash-luminous/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/thrash-luminous/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/thrash-luminous/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/thrash-luminous/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up 
a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rados/verify/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rados/verify/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rados/verify/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/verify/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rados/verify/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rados/verify/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rados/verify/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/verify/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: 
bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rados/verify/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rados/verify/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rados/verify/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/verify/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug 
bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rados/verify/tasks/rados_api_tests.yaml ceph-12.2.12/qa/suites/rados/verify/tasks/rados_api_tests.yaml --- ceph-12.2.11/qa/suites/rados/verify/tasks/rados_api_tests.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rados/verify/tasks/rados_api_tests.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -10,6 +10,7 @@ - \(CACHE_POOL_NEAR_FULL\) - \(POOL_APP_NOT_ENABLED\) - \(PG_AVAILABILITY\) + - slow request conf: client: debug ms: 1 diff -Nru ceph-12.2.11/qa/suites/rbd/basic/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rbd/basic/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rbd/basic/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/basic/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rbd/basic/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rbd/basic/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rbd/basic/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/basic/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + 
conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rbd/basic/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rbd/basic/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rbd/basic/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/basic/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug 
bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rbd/basic/tasks/rbd_python_api_tests_old_format.yaml ceph-12.2.12/qa/suites/rbd/basic/tasks/rbd_python_api_tests_old_format.yaml --- ceph-12.2.11/qa/suites/rbd/basic/tasks/rbd_python_api_tests_old_format.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/basic/tasks/rbd_python_api_tests_old_format.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -2,6 +2,7 @@ ceph: log-whitelist: - \(REQUEST_SLOW\) + - slow request tasks: - workunit: clients: diff -Nru ceph-12.2.11/qa/suites/rbd/cli/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rbd/cli/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rbd/cli/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/cli/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rbd/cli/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rbd/cli/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rbd/cli/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/cli/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 
+ ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rbd/cli/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rbd/cli/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rbd/cli/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/cli/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 
- debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rbd/librbd/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rbd/librbd/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rbd/librbd/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/librbd/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rbd/librbd/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rbd/librbd/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rbd/librbd/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/librbd/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes 
are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rbd/librbd/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rbd/librbd/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rbd/librbd/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/librbd/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rbd/maintenance/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rbd/maintenance/objectstore/bluestore-bitmap.yaml --- 
ceph-12.2.11/qa/suites/rbd/maintenance/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/maintenance/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rbd/maintenance/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rbd/maintenance/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rbd/maintenance/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/maintenance/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd 
backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rbd/maintenance/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rbd/maintenance/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rbd/maintenance/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/maintenance/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rbd/mirror/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rbd/mirror/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rbd/mirror/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/mirror/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev 
enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rbd/mirror/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rbd/mirror/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rbd/mirror/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/mirror/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rbd/mirror/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rbd/mirror/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rbd/mirror/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/mirror/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 
+0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rbd/mirror-ha/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rbd/mirror-ha/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rbd/mirror-ha/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/mirror-ha/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rbd/mirror-ha/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rbd/mirror-ha/objectstore/bluestore-stupid.yaml --- 
ceph-12.2.11/qa/suites/rbd/mirror-ha/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/mirror-ha/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rbd/mirror-ha/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rbd/mirror-ha/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rbd/mirror-ha/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/mirror-ha/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 
- mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rbd/nbd/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rbd/nbd/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rbd/nbd/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/nbd/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rbd/nbd/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rbd/nbd/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rbd/nbd/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/nbd/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + 
debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rbd/nbd/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rbd/nbd/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rbd/nbd/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/nbd/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up 
a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rbd/openstack/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rbd/openstack/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rbd/openstack/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/openstack/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rbd/openstack/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rbd/openstack/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rbd/openstack/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/openstack/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: 
yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rbd/openstack/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rbd/openstack/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rbd/openstack/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/openstack/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rbd/qemu/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rbd/qemu/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rbd/qemu/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ 
ceph-12.2.12/qa/suites/rbd/qemu/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rbd/qemu/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rbd/qemu/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rbd/qemu/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/qemu/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rbd/qemu/objectstore/bluestore.yaml 
ceph-12.2.12/qa/suites/rbd/qemu/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rbd/qemu/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/qemu/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rbd/singleton/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rbd/singleton/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rbd/singleton/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/singleton/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd 
failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rbd/singleton/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rbd/singleton/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rbd/singleton/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/singleton/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rbd/singleton/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rbd/singleton/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rbd/singleton/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/singleton/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd 
objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rbd/singleton-bluestore/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rbd/singleton-bluestore/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rbd/singleton-bluestore/objectstore/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/singleton-bluestore/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + 
ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/rbd/singleton-bluestore/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rbd/singleton-bluestore/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rbd/singleton-bluestore/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/singleton-bluestore/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rbd/thrash/objectstore/bluestore-bitmap.yaml 
ceph-12.2.12/qa/suites/rbd/thrash/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rbd/thrash/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/thrash/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rbd/thrash/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rbd/thrash/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rbd/thrash/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/thrash/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + 
mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rbd/thrash/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rbd/thrash/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rbd/thrash/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/thrash/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rbd/valgrind/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rbd/valgrind/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rbd/valgrind/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/valgrind/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs 
env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rbd/valgrind/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rbd/valgrind/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rbd/valgrind/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/valgrind/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rbd/valgrind/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rbd/valgrind/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rbd/valgrind/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rbd/valgrind/objectstore/bluestore.yaml 
1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rgw/multifs/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rgw/multifs/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rgw/multifs/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rgw/multifs/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rgw/multifs/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rgw/multifs/objectstore/bluestore-stupid.yaml --- 
ceph-12.2.11/qa/suites/rgw/multifs/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rgw/multifs/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rgw/multifs/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rgw/multifs/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rgw/multifs/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rgw/multifs/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd 
backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rgw/multisite/overrides.yaml ceph-12.2.12/qa/suites/rgw/multisite/overrides.yaml --- ceph-12.2.11/qa/suites/rgw/multisite/overrides.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rgw/multisite/overrides.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -7,5 +7,7 @@ rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= rgw crypt require ssl: false rgw sync log trim interval: 0 + rgw md log max shards: 4 + rgw data log num shards: 4 rgw: compression type: random diff -Nru ceph-12.2.11/qa/suites/rgw/singleton/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rgw/singleton/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rgw/singleton/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rgw/singleton/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru 
ceph-12.2.11/qa/suites/rgw/singleton/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rgw/singleton/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rgw/singleton/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rgw/singleton/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rgw/singleton/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rgw/singleton/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rgw/singleton/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rgw/singleton/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - 
debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rgw/thrash/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rgw/thrash/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rgw/thrash/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rgw/thrash/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rgw/thrash/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rgw/thrash/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rgw/thrash/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rgw/thrash/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + 
bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rgw/thrash/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rgw/thrash/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rgw/thrash/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rgw/thrash/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: 
bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/rgw/verify/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/rgw/verify/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/rgw/verify/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rgw/verify/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/rgw/verify/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/rgw/verify/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/rgw/verify/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/rgw/verify/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this 
doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/rgw/verify/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/rgw/verify/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/rgw/verify/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/rgw/verify/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/samba/objectstore/bluestore-bitmap.yaml 
ceph-12.2.12/qa/suites/samba/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/samba/objectstore/bluestore-bitmap.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/samba/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,8 @@ osd failsafe full ratio: .95 # this doesn't work with failures bc the log writes are not atomic across the two backends # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true ceph-deploy: fs: xfs bluestore: yes @@ -36,4 +38,6 @@ mon osd backfillfull_ratio: .85 mon osd nearfull ratio: .8 osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true diff -Nru ceph-12.2.11/qa/suites/samba/objectstore/bluestore-stupid.yaml ceph-12.2.12/qa/suites/samba/objectstore/bluestore-stupid.yaml --- ceph-12.2.11/qa/suites/samba/objectstore/bluestore-stupid.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/samba/objectstore/bluestore-stupid.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,39 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: stupid + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd 
backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + diff -Nru ceph-12.2.11/qa/suites/samba/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/samba/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/samba/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/samba/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/smoke/basic/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug 
bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/smoke/basic/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/smoke/basic/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/smoke/basic/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/smoke/basic/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore 
block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/upgrade/kraken-x/parallel/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/upgrade/kraken-x/parallel/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/upgrade/kraken-x/parallel/objectstore/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/upgrade/kraken-x/parallel/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/upgrade/kraken-x/parallel/objectstore/bluestore.yaml 
ceph-12.2.12/qa/suites/upgrade/kraken-x/parallel/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/upgrade/kraken-x/parallel/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/upgrade/kraken-x/parallel/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug 
bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + diff -Nru ceph-12.2.11/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - 
osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/bluestore-bitmap.yaml ceph-12.2.12/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/bluestore-bitmap.yaml --- ceph-12.2.11/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/bluestore-bitmap.yaml 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/bluestore-bitmap.yaml 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,43 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 +# this doesn't work with failures bc the log writes are not atomic across the two backends +# bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true 
+ diff -Nru ceph-12.2.11/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/bluestore.yaml ceph-12.2.12/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/bluestore.yaml --- ceph-12.2.11/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/bluestore.yaml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore/bluestore.yaml 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -overrides: - thrashosds: - bdev_inject_crash: 2 - bdev_inject_crash_probability: .5 - ceph: - fs: xfs - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 -# this doesn't work with failures bc the log writes are not atomic across the two backends -# bluestore bluefs env mirror: true - ceph-deploy: - fs: xfs - bluestore: yes - conf: - osd: - osd objectstore: bluestore - bluestore block size: 96636764160 - debug bluestore: 20 - debug bluefs: 20 - debug rocksdb: 10 - bluestore fsck on mount: true - # lower the full ratios since we can fill up a 100gb osd so quickly - mon osd full ratio: .9 - mon osd backfillfull_ratio: .85 - mon osd nearfull ratio: .8 - osd failsafe full ratio: .95 - diff -Nru ceph-12.2.11/qa/tasks/cephfs/test_client_limits.py ceph-12.2.12/qa/tasks/cephfs/test_client_limits.py --- ceph-12.2.11/qa/tasks/cephfs/test_client_limits.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/tasks/cephfs/test_client_limits.py 2019-04-11 12:33:50.000000000 +0000 @@ -42,12 +42,14 @@ cache_size = open_files/2 self.set_conf('mds', 'mds cache size', cache_size) + self.set_conf('mds', 'mds_recall_max_caps', open_files/2) + self.set_conf('mds', 'mds_recall_warning_threshold', open_files) 
self.fs.mds_fail_restart() self.fs.wait_for_daemons() mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client")) + mds_recall_warning_decay_rate = self.fs.get_config("mds_recall_warning_decay_rate") self.assertTrue(open_files >= mds_min_caps_per_client) - mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client")) mount_a_client_id = self.mount_a.get_global_id() path = "subdir/mount_a" if use_subdir else "mount_a" @@ -64,13 +66,11 @@ # MDS should not be happy about that, as the client is failing to comply # with the SESSION_RECALL messages it is being sent - mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout")) - self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10) + self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2) # We can also test that the MDS health warning for oversized # cache is functioning as intended. - self.wait_for_health("MDS_CACHE_OVERSIZED", - mds_recall_state_timeout + 10) + self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2) # When the client closes the files, it should retain only as many caps as allowed # under the SESSION_RECALL policy @@ -84,14 +84,13 @@ # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message, # which depend on the caps outstanding, cache size and overall ratio - recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2)) def expected_caps(): num_caps = self.get_session(mount_a_client_id)['num_caps'] if num_caps < mds_min_caps_per_client: raise RuntimeError("client caps fell below min!") elif num_caps == mds_min_caps_per_client: return True - elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05: + elif num_caps < cache_size: return True else: return False @@ -237,3 +236,28 @@ def test_client_cache_size(self): self._test_client_cache_size(False) self._test_client_cache_size(True) + + def 
test_client_max_caps(self): + """ + That the MDS will not let a client sit above mds_max_caps_per_client caps. + """ + + mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client")) + mds_max_caps_per_client = 2*mds_min_caps_per_client + self.set_conf('mds', 'mds_max_caps_per_client', mds_max_caps_per_client) + self.fs.mds_fail_restart() + self.fs.wait_for_daemons() + + self.mount_a.create_n_files("foo/", 3*mds_max_caps_per_client, sync=True) + + mount_a_client_id = self.mount_a.get_global_id() + def expected_caps(): + num_caps = self.get_session(mount_a_client_id)['num_caps'] + if num_caps < mds_min_caps_per_client: + raise RuntimeError("client caps fell below min!") + elif num_caps <= mds_max_caps_per_client: + return True + else: + return False + + self.wait_until_true(expected_caps, timeout=60) diff -Nru ceph-12.2.11/qa/tasks/cephfs/test_misc.py ceph-12.2.12/qa/tasks/cephfs/test_misc.py --- ceph-12.2.11/qa/tasks/cephfs/test_misc.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/tasks/cephfs/test_misc.py 2019-04-11 12:33:50.000000000 +0000 @@ -52,6 +52,9 @@ self.assertGreaterEqual(rctime, t-10) def test_fs_new(self): + self.mount_a.umount_wait() + self.mount_b.umount_wait() + data_pool_name = self.fs.get_data_pool_name() self.fs.mds_stop() diff -Nru ceph-12.2.11/qa/tasks/radosbench.py ceph-12.2.12/qa/tasks/radosbench.py --- ceph-12.2.11/qa/tasks/radosbench.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/tasks/radosbench.py 2019-04-11 12:33:50.000000000 +0000 @@ -76,12 +76,12 @@ else: pool = manager.create_pool_with_unique_name(erasure_code_profile_name=profile_name) - osize = config.get('objectsize', 0) + osize = config.get('objectsize', 65536) if osize is 0: objectsize = [] else: objectsize = ['-o', str(osize)] - size = ['-b', str(config.get('size', 4<<20))] + size = ['-b', str(config.get('size', 65536))] # If doing a reading run then populate data if runtype != "write": proc = remote.run( diff -Nru 
ceph-12.2.11/qa/tasks/rgw_multi/tests.py ceph-12.2.12/qa/tasks/rgw_multi/tests.py --- ceph-12.2.11/qa/tasks/rgw_multi/tests.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/tasks/rgw_multi/tests.py 2019-04-11 12:33:50.000000000 +0000 @@ -96,6 +96,15 @@ def mdlog_autotrim(zone): zone.cluster.admin(['mdlog', 'autotrim']) +def datalog_list(zone, period = None): + cmd = ['datalog', 'list'] + (datalog_json, _) = zone.cluster.admin(cmd, read_only=True) + datalog_json = datalog_json.decode('utf-8') + return json.loads(datalog_json) + +def datalog_autotrim(zone): + zone.cluster.admin(['datalog', 'autotrim']) + def bilog_list(zone, bucket, args = None): cmd = ['bilog', 'list', '--bucket', bucket] + (args or []) bilog, _ = zone.cluster.admin(cmd, read_only=True) @@ -280,7 +289,7 @@ def data_source_log_status(source_zone): source_cluster = source_zone.cluster cmd = ['datalog', 'status'] + source_zone.zone_args() - datalog_status_json, retcode = source_cluster.rgw_admin(cmd, read_only=True) + datalog_status_json, retcode = source_cluster.admin(cmd, read_only=True) datalog_status = json.loads(datalog_status_json.decode('utf-8')) markers = {i: s['marker'] for i, s in enumerate(datalog_status)} @@ -345,7 +354,7 @@ return True -def zone_data_checkpoint(target_zone, source_zone_conn): +def zone_data_checkpoint(target_zone, source_zone): if target_zone == source_zone: return @@ -367,6 +376,13 @@ assert False, 'failed data checkpoint for target_zone=%s source_zone=%s' % \ (target_zone.name, source_zone.name) +def zonegroup_data_checkpoint(zonegroup_conns): + for source_conn in zonegroup_conns.rw_zones: + for target_conn in zonegroup_conns.zones: + if source_conn.zone == target_conn.zone: + continue + log.debug('data checkpoint: source=%s target=%s', source_conn.zone.name, target_conn.zone.name) + zone_data_checkpoint(target_conn.zone, source_conn.zone) def zone_bucket_checkpoint(target_zone, source_zone, bucket_name): if target_zone == source_zone: @@ -688,6 +704,90 @@ for 
_, bucket in zone_bucket: zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name) +def test_delete_marker_full_sync(): + zonegroup = realm.master_zonegroup() + zonegroup_conns = ZonegroupConns(zonegroup) + buckets, zone_bucket = create_bucket_per_zone(zonegroup_conns) + + # enable versioning + for _, bucket in zone_bucket: + bucket.configure_versioning(True) + zonegroup_meta_checkpoint(zonegroup) + + for zone, bucket in zone_bucket: + # upload an initial object + key1 = new_key(zone, bucket, 'obj') + key1.set_contents_from_string('') + + # create a delete marker + key2 = new_key(zone, bucket, 'obj') + key2.delete() + + # wait for full sync + for _, bucket in zone_bucket: + zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name) + +def test_suspended_delete_marker_full_sync(): + zonegroup = realm.master_zonegroup() + zonegroup_conns = ZonegroupConns(zonegroup) + buckets, zone_bucket = create_bucket_per_zone(zonegroup_conns) + + # enable/suspend versioning + for _, bucket in zone_bucket: + bucket.configure_versioning(True) + bucket.configure_versioning(False) + zonegroup_meta_checkpoint(zonegroup) + + for zone, bucket in zone_bucket: + # upload an initial object + key1 = new_key(zone, bucket, 'obj') + key1.set_contents_from_string('') + + # create a delete marker + key2 = new_key(zone, bucket, 'obj') + key2.delete() + + # wait for full sync + for _, bucket in zone_bucket: + zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name) + +def test_version_suspended_incremental_sync(): + zonegroup = realm.master_zonegroup() + zonegroup_conns = ZonegroupConns(zonegroup) + + zone = zonegroup_conns.rw_zones[0] + + # create a non-versioned bucket + bucket = zone.create_bucket(gen_bucket_name()) + log.debug('created bucket=%s', bucket.name) + zonegroup_meta_checkpoint(zonegroup) + + # upload an initial object + key1 = new_key(zone, bucket, 'obj') + key1.set_contents_from_string('') + log.debug('created initial version id=%s', key1.version_id) + 
zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name) + + # enable versioning + bucket.configure_versioning(True) + zonegroup_meta_checkpoint(zonegroup) + + # re-upload the object as a new version + key2 = new_key(zone, bucket, 'obj') + key2.set_contents_from_string('') + log.debug('created new version id=%s', key2.version_id) + zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name) + + # suspend versioning + bucket.configure_versioning(False) + zonegroup_meta_checkpoint(zonegroup) + + # re-upload the object as a 'null' version + key3 = new_key(zone, bucket, 'obj') + key3.set_contents_from_string('') + log.debug('created null version id=%s', key3.version_id) + zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name) + def test_bucket_versioning(): buckets, zone_bucket = create_bucket_per_zone_in_realm() @@ -822,6 +922,25 @@ mdlog = mdlog_list(zone, period) assert len(mdlog) == 0 +def test_datalog_autotrim(): + zonegroup = realm.master_zonegroup() + zonegroup_conns = ZonegroupConns(zonegroup) + buckets, zone_bucket = create_bucket_per_zone(zonegroup_conns) + + # upload an object to each zone to generate a datalog entry + for zone, bucket in zone_bucket: + k = new_key(zone, bucket.name, 'key') + k.set_contents_from_string('body') + + # wait for data sync to catch up + zonegroup_data_checkpoint(zonegroup_conns) + + # trim each datalog + for zone, _ in zone_bucket: + datalog_autotrim(zone.zone) + datalog = datalog_list(zone.zone) + assert len(datalog) == 0 + def test_zonegroup_remove(): zonegroup = realm.master_zonegroup() zonegroup_conns = ZonegroupConns(zonegroup) @@ -913,6 +1032,8 @@ for zone in zonegroup.zones: check_buckets_sync_status_obj_not_exist(zone, buckets) + zonegroup_data_checkpoint(zonegroup_conns) + def test_bucket_sync_enable_right_after_disable(): zonegroup = realm.master_zonegroup() zonegroup_conns = ZonegroupConns(zonegroup) @@ -943,6 +1064,8 @@ for bucket_name in buckets: zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name) + 
zonegroup_data_checkpoint(zonegroup_conns) + def test_bucket_sync_disable_enable(): zonegroup = realm.master_zonegroup() zonegroup_conns = ZonegroupConns(zonegroup) @@ -979,6 +1102,8 @@ for bucket_name in buckets: zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name) + zonegroup_data_checkpoint(zonegroup_conns) + def test_multipart_object_sync(): zonegroup = realm.master_zonegroup() zonegroup_conns = ZonegroupConns(zonegroup) diff -Nru ceph-12.2.11/qa/tasks/rgw_multi/zone_rados.py ceph-12.2.12/qa/tasks/rgw_multi/zone_rados.py --- ceph-12.2.11/qa/tasks/rgw_multi/zone_rados.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/tasks/rgw_multi/zone_rados.py 2019-04-11 12:33:50.000000000 +0000 @@ -1,4 +1,5 @@ import logging +from boto.s3.deletemarker import DeleteMarker try: from itertools import izip_longest as zip_longest @@ -16,6 +17,13 @@ assert k2 log.debug('comparing key name=%s', k1.name) eq(k1.name, k2.name) + eq(k1.version_id, k2.version_id) + eq(k1.is_latest, k2.is_latest) + eq(k1.last_modified, k2.last_modified) + if isinstance(k1, DeleteMarker): + assert isinstance(k2, DeleteMarker) + return + eq(k1.get_contents_as_string(), k2.get_contents_as_string()) eq(k1.metadata, k2.metadata) eq(k1.cache_control, k2.cache_control) @@ -24,16 +32,13 @@ eq(k1.content_disposition, k2.content_disposition) eq(k1.content_language, k2.content_language) eq(k1.etag, k2.etag) - eq(k1.last_modified, k2.last_modified) if check_extra: eq(k1.owner.id, k2.owner.id) eq(k1.owner.display_name, k2.owner.display_name) eq(k1.storage_class, k2.storage_class) eq(k1.size, k2.size) - eq(k1.version_id, k2.version_id) eq(k1.encrypted, k2.encrypted) - class RadosZone(Zone): def __init__(self, name, zonegroup = None, cluster = None, data = None, zone_id = None, gateways = None): super(RadosZone, self).__init__(name, zonegroup, cluster, data, zone_id, gateways) @@ -57,14 +62,17 @@ b1 = self.get_bucket(bucket_name) b2 = zone_conn.get_bucket(bucket_name) + b1_versions = b1.list_versions() 
log.debug('bucket1 objects:') - for o in b1.get_all_versions(): + for o in b1_versions: log.debug('o=%s', o.name) + + b2_versions = b2.list_versions() log.debug('bucket2 objects:') - for o in b2.get_all_versions(): + for o in b2_versions: log.debug('o=%s', o.name) - for k1, k2 in zip_longest(b1.get_all_versions(), b2.get_all_versions()): + for k1, k2 in zip_longest(b1_versions, b2_versions): if k1 is None: log.critical('key=%s is missing from zone=%s', k2.name, self.name) assert False @@ -74,11 +82,23 @@ check_object_eq(k1, k2) - # now get the keys through a HEAD operation, verify that the available data is the same - k1_head = b1.get_key(k1.name) - k2_head = b2.get_key(k2.name) - - check_object_eq(k1_head, k2_head, False) + if isinstance(k1, DeleteMarker): + # verify that HEAD sees a delete marker + assert b1.get_key(k1.name) is None + assert b2.get_key(k2.name) is None + else: + # now get the keys through a HEAD operation, verify that the available data is the same + k1_head = b1.get_key(k1.name, version_id=k1.version_id) + k2_head = b2.get_key(k2.name, version_id=k2.version_id) + check_object_eq(k1_head, k2_head, False) + + if k1.version_id: + # compare the olh to make sure they agree about the current version + k1_olh = b1.get_key(k1.name) + k2_olh = b2.get_key(k2.name) + # if there's a delete marker, HEAD will return None + if k1_olh or k2_olh: + check_object_eq(k1_olh, k2_olh, False) log.info('success, bucket identical: bucket=%s zones={%s, %s}', bucket_name, self.name, zone_conn.name) diff -Nru ceph-12.2.11/qa/valgrind.supp ceph-12.2.12/qa/valgrind.supp --- ceph-12.2.11/qa/valgrind.supp 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/qa/valgrind.supp 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,622 @@ +{ + older boost mersenne twister uses uninitialized memory for randomness + Memcheck:Cond + ... + fun:*Monitor::prepare_new_fingerprint* + ... +} +{ + older boost mersenne twister uses uninitialized memory for randomness + Memcheck:Value8 + ... 
+ fun:*Monitor::prepare_new_fingerprint* + ... +} +{ + apparent TLS leak in eglibc + Memcheck:Leak + fun:calloc + fun:_dl_allocate_tls + fun:pthread_create* + ... +} +{ + osd: ignore ec plugin loading (FIXME SOMEDAY) + Memcheck:Leak + ... + fun:*ErasureCodePluginRegistry*load* + ... +} +{ + osd: ignore ec plugin factory (FIXME SOMEDAY) + Memcheck:Leak + ... + fun:*ErasureCodePluginRegistry*factory* + ... +} +{ + tcmalloc: libboost_thread-mt.so.1.53 is linked with tcmalloc + Memcheck:Param + msync(start) + obj:/usr/lib64/libpthread-2.17.so + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + ... + fun:*tcmalloc*ThreadCache* + ... + obj:/usr/lib64/libboost_thread-mt.so.1.53.0 +} +{ + tcmalloc: msync heap allocation points to uninit bytes (centos 6.5) + Memcheck:Param + msync(start) + obj:/lib64/libpthread-2.12.so + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm +} +{ + tcmalloc: msync heap allocation points to unaddressible bytes (centos 6.5 #2) + Memcheck:Param + msync(start) + obj:/lib64/libpthread-2.12.so + obj:/usr/lib64/libunwind.so.7.0.0 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm +} +{ + tcmalloc: msync heap allocation points to uninit bytes (rhel7) + Memcheck:Param + msync(start) + obj:/usr/lib64/libpthread-2.17.so + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm +} +{ + tcmalloc: msync heap allocation points to uninit bytes (rhel7 #2) + Memcheck:Param + 
msync(start) + obj:/usr/lib64/libpthread-2.17.so + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + obj:/usr/lib64/libunwind.so.8.0.1 + fun:_ULx86_64_step + obj:/usr/lib64/libtcmalloc.so.4.2.6 + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm +} +{ + tcmalloc: msync heap allocation points to uninit bytes (wheezy) + Memcheck:Param + msync(start) + obj:/lib/x86_64-linux-gnu/libpthread-2.13.so + obj:/usr/lib/libunwind.so.7.0.0 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm +} +{ + tcmalloc: msync heap allocation points to uninit bytes (precise) + Memcheck:Param + msync(start) + obj:/lib/x86_64-linux-gnu/libpthread-2.15.so + obj:/usr/lib/libunwind.so.7.0.0 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm + obj:/usr/lib/libtcmalloc.so.0.1.0 +} +{ + tcmalloc: msync heap allocation points to uninit bytes (trusty) + Memcheck:Param + msync(start) + obj:/lib/x86_64-linux-gnu/libpthread-2.19.so + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm +} +{ + tcmalloc: msync heap allocation points to uninit bytes 2 (trusty) + Memcheck:Param + msync(start) + fun:__msync_nocancel + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + fun:_ULx86_64_step + fun:_Z13GetStackTracePPvii + fun:_ZN8tcmalloc8PageHeap8GrowHeapEm + fun:_ZN8tcmalloc8PageHeap3NewEm + fun:_ZN8tcmalloc15CentralFreeList8PopulateEv + 
fun:_ZN8tcmalloc15CentralFreeList18FetchFromSpansSafeEv + fun:_ZN8tcmalloc15CentralFreeList11RemoveRangeEPPvS2_i +} +{ + tcmalloc: msync (xenial) + Memcheck:Param + msync(start) + fun:__msync_nocancel + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:/usr/lib/x86_64-linux-gnu/libunwind.so.8.0.1 + obj:*tcmalloc* + fun:*GetStackTrace* +} +{ + tcmalloc: string + Memcheck:Leak + ... + obj:*tcmalloc* + fun:call_init* + ... +} +{ + ceph global: deliberate onexit leak + Memcheck:Leak + ... + fun:*set_flush_on_exit* + ... +} +{ + libleveldb: ignore all static leveldb leaks + Memcheck:Leak + ... + fun:*leveldb* + ... +} +{ + libleveldb: ignore all dynamic libleveldb leaks + Memcheck:Leak + ... + obj:*libleveldb.so* + ... +} +{ + libcurl: ignore libcurl leaks + Memcheck:Leak + ... + fun:*curl_global_init +} +{ + ignore gnutls leaks + Memcheck:Leak + ... + fun:gnutls_global_init +} +{ + ignore libfcgi leak; OS_LibShutdown has no callers! + Memcheck:Leak + ... + fun:OS_LibInit + fun:FCGX_Init +} +{ + ignore libnss3 leaks + Memcheck:Leak + ... + obj:*libnss3* + ... +} +{ + strptime suckage + Memcheck:Cond + fun:__GI___strncasecmp_l + fun:__strptime_internal + ... +} +{ + strptime suckage 2 + Memcheck:Value8 + fun:__GI___strncasecmp_l + fun:__strptime_internal + ... +} +{ + strptime suckage 3 + Memcheck:Addr8 + fun:__GI___strncasecmp_l + fun:__strptime_internal + ... +} +{ + inet_ntop does something lame on local stack + Memcheck:Value8 + ... + fun:inet_ntop + ... +} +{ + inet_ntop does something lame on local stack + Memcheck:Addr8 + ... + fun:inet_ntop + ... +} +{ + dl-lookup.c thing .. Invalid write of size 8 + Memcheck:Value8 + fun:do_lookup_x + ... + fun:_dl_lookup_symbol_x + ... +} +{ + dl-lookup.c thing .. Invalid write of size 8 + Memcheck:Addr8 + fun:do_lookup_x + ... + fun:_dl_lookup_symbol_x + ... +} +{ + weird thing from libc + Memcheck:Leak + ... 
+ fun:*sub_I_comparator* + fun:__libc_csu_init + ... +} +{ + libfuse leak + Memcheck:Leak + ... + fun:fuse_parse_cmdline + ... +} +{ + boost thread leaks on exit + Memcheck:Leak + ... + fun:*boost*detail* + ... + fun:exit +} +{ + lttng appears to not clean up state + Memcheck:Leak + ... + fun:lttng_ust_baddr_statedump_init + fun:lttng_ust_init + fun:call_init.part.0 + ... +} +{ + fun:PK11_CreateContextBySymKey race + Helgrind:Race + obj:/usr/*lib*/libfreebl*3.so + ... + obj:/usr/*lib*/libsoftokn3.so + ... + obj:/usr/*lib*/libnss3.so + fun:PK11_CreateContextBySymKey + ... +} +{ + thread init race + Helgrind:Race + fun:mempcpy + fun:_dl_allocate_tls_init + ... + fun:pthread_create@* + ... +} +{ + thread_local memory is falsely detected (https://svn.boost.org/trac/boost/ticket/3296) + Memcheck:Leak + ... + fun:*boost*detail*get_once_per_thread_epoch* + fun:*boost*call_once* + fun:*boost*detail*get_current_thread_data* + ... +} +{ + rocksdb thread local singletons + Memcheck:Leak + ... + fun:rocksdb::Env::Default() + ... +} +{ + rocksdb column thread local leaks + Memcheck:Leak + ... + fun:rocksdb::ThreadLocalPtr::StaticMeta::SetHandler* + fun:rocksdb::ColumnFamilyData::ColumnFamilyData* + ... +} +{ + rocksdb thread crap + Memcheck:Leak + ... + fun:*ThreadLocalPtr* + ... +} +{ + rocksdb singleton Env leak, blech + Memcheck:Leak + ... + fun:CreateThreadStatusUpdater + fun:PosixEnv + ... +} +{ + rocksdb::Env::Default() + Memcheck:Leak + ... + fun:*rocksdb*Env*Default* + ... +} +{ + rocksdb BGThreadWrapper + Memcheck:Leak + ... + fun:*BGThreadWrapper* + ... +} +{ + libstdc++ leak on xenial + Memcheck:Leak + fun:malloc + ... + fun:call_init.part.0 + fun:call_init + fun:_dl_init + ... +} +{ + strange leak of std::string memory from md_config_t seen in radosgw + Memcheck:Leak + ... + fun:_ZNSs4_Rep9_S_createEmmRKSaIcE + fun:_ZNSs12_S_constructIPKcEEPcT_S3_RKSaIcESt20forward_iterator_tag + ... + fun:_ZN11md_config_tC1Ev + fun:_ZN11CephContextC1Eji + ... 
+} +{ + python does not reset the member field when dealloc an object + Memcheck:Leak + match-leak-kinds: all + ... + fun:Py_InitializeEx + ... +} +{ + statically allocated python types don't get members freed + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyType_Ready + ... +} +{ + manually constructed python module members don't get freed + Memcheck:Leak + match-leak-kinds: all + ... + fun:Py_InitModule4_64 + ... +} +{ + manually constructed python module members don't get freed + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyModule_AddObject + ... +} +{ + python subinterpreters may not clean up properly + Memcheck:Leak + match-leak-kinds: all + ... + fun:Py_NewInterpreter + ... +} +{ + python should be able to take care of itself + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyEval_EvalCode +} +{ + python should be able to take care of itself + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyImport_ImportModuleLevel +} +{ + python-owned threads may not full clean up after themselves + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyEval_CallObjectWithKeywords +} +{ + python should be able to take care of itself + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyEval_EvalFrameEx + ... + obj:/usr/lib64/libpython2.7.so.1.0 +} +{ + python should be able to take care of itself + Memcheck:Leak + match-leak-kinds: all + ... + fun:PyObject_Call +} + +{ + rados cython constants + Memcheck:Leak + match-leak-kinds: definite + fun:malloc + fun:PyObject_Malloc + fun:PyCode_New + fun:__Pyx_InitCachedConstants + fun:initrados + fun:_PyImport_LoadDynamicModule + ... + fun:PyImport_ImportModuleLevel + ... + fun:PyObject_Call + fun:PyEval_CallObjectWithKeywords + fun:PyEval_EvalFrameEx +} + +{ + rbd cython constants + Memcheck:Leak + match-leak-kinds: definite + fun:malloc + fun:PyObject_Malloc + fun:PyCode_New + fun:__Pyx_InitCachedConstants + fun:initrbd + fun:_PyImport_LoadDynamicModule + ... + fun:PyImport_ImportModuleLevel + ... 
+ fun:PyObject_Call + fun:PyEval_CallObjectWithKeywords + fun:PyEval_EvalFrameEx +} + +{ + dlopen() with -lceph-common https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=700899 + Memcheck:Leak + match-leak-kinds: reachable + fun:*alloc + ... + fun:_dlerror_run + fun:dlopen@@GLIBC_2.2.5 +} + +{ + ethdev_init_log thing + Memcheck:Leak + match-leak-kinds: reachable + ... + fun:ethdev_init_log + ... +} + +{ + rte_log_init() in DPDK fails to reset strdup()'ed string at exit + Memcheck:Leak + match-leak-kinds: reachable + fun:*alloc + ... + fun:rte_log_init + fun:__libc_csu_init +} + +{ + libc_csu_init (strdup, rte_log_register, etc.) + Memcheck:Leak + match-leak-kinds: reachable + ... + fun:__libc_csu_init + ... +} + +{ + Boost.Thread fails to call tls_destructor() when the thread exists + Memcheck:Leak + match-leak-kinds: reachable + ... + fun:*boost*detail*make_external_thread_data* + fun:*boost*detail*add_new_tss_node* + fun:*boost*detail*set_tss_data* + ... +} + +{ + ignore *all* ceph-mgr python crap. this is overkill, but better than nothing + Memcheck:Leak + match-leak-kinds: all + ... + fun:Py* + ... +} + +{ + something in glibc + Memcheck:Leak + match-leak-kinds: all + ... + fun:strdup + fun:__trans_list_add + ... + fun:_dl_init + ... +} + +# "Conditional jump or move depends on uninitialised value(s)" in OpenSSL +# while using aes-128-gcm with AES-NI enabled. Not observed while running +# with `OPENSSL_ia32cap="~0x200000200000000"`. +{ + + Memcheck:Cond + ... + fun:EVP_DecryptFinal_ex + fun:_ZN4ceph6crypto6onwire25AES128GCM_OnWireRxHandler34authenticated_decrypt_update_finalEONS_6buffer7v14_2_04listEj + fun:_ZN10ProtocolV231handle_read_frame_epilogue_mainEOSt10unique_ptrIN4ceph6buffer7v14_2_08ptr_nodeENS4_8disposerEEi + fun:_ZN10ProtocolV216run_continuationER2CtIS_E + ... 
+ fun:_ZN15AsyncConnection7processEv + fun:_ZN11EventCenter14process_eventsEjPNSt6chrono8durationImSt5ratioILl1ELl1000000000EEEE + fun:operator() + fun:_ZNSt17_Function_handlerIFvvEZN12NetworkStack10add_threadEjEUlvE_E9_M_invokeERKSt9_Any_data + fun:execute_native_thread_routine + fun:start_thread + fun:clone +} + +{ + + Memcheck:Cond + fun:_ZN4ceph6crypto6onwire25AES128GCM_OnWireRxHandler34authenticated_decrypt_update_finalEONS_6buffer7v14_2_04listEj + fun:_ZN10ProtocolV231handle_read_frame_epilogue_mainEOSt10unique_ptrIN4ceph6buffer7v14_2_08ptr_nodeENS4_8disposerEEi + fun:_ZN10ProtocolV216run_continuationER2CtIS_E + ... + fun:_ZN15AsyncConnection7processEv + fun:_ZN11EventCenter14process_eventsEjPNSt6chrono8durationImSt5ratioILl1ELl1000000000EEEE + fun:operator() + fun:_ZNSt17_Function_handlerIFvvEZN12NetworkStack10add_threadEjEUlvE_E9_M_invokeERKSt9_Any_data + fun:execute_native_thread_routine + fun:start_thread + fun:clone +} diff -Nru ceph-12.2.11/qa/workunits/cephtool/test.sh ceph-12.2.12/qa/workunits/cephtool/test.sh --- ceph-12.2.11/qa/workunits/cephtool/test.sh 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/workunits/cephtool/test.sh 2019-04-11 12:33:50.000000000 +0000 @@ -49,7 +49,7 @@ } -TEMP_DIR=$(mktemp -d ${TMPDIR-/tmp}/cephtool.XXX) +TEMP_DIR=$(mktemp -d ${TMPDIR:-/tmp}/cephtool.XXX) trap "rm -fr $TEMP_DIR" 0 TMPFILE=$(mktemp $TEMP_DIR/test_invalid.XXX) @@ -578,7 +578,9 @@ function test_auth() { - ceph auth add client.xx mon allow osd "allow *" + expect_false ceph auth add client.xx mon 'invalid' osd "allow *" + expect_false ceph auth add client.xx mon 'allow *' osd "allow *" invalid "allow *" + ceph auth add client.xx mon 'allow *' osd "allow *" ceph auth export client.xx >client.xx.keyring ceph auth add client.xx -i client.xx.keyring rm -f client.xx.keyring @@ -602,7 +604,7 @@ expect_false ceph auth get client.xx # (almost) interactive mode - echo -e 'auth add client.xx mon allow osd "allow *"\n' | ceph + echo -e 'auth add client.xx mon 
"allow *" osd "allow *"\n' | ceph ceph auth get client.xx # script mode echo 'auth del client.xx' | ceph diff -Nru ceph-12.2.11/qa/workunits/libcephfs-java/test.sh ceph-12.2.12/qa/workunits/libcephfs-java/test.sh --- ceph-12.2.11/qa/workunits/libcephfs-java/test.sh 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/workunits/libcephfs-java/test.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,39 +0,0 @@ -#!/bin/sh -e - -echo "starting libcephfs-java tests" -# configure CEPH_CONF and LD_LIBRARY_PATH if they're not already set -conf="$CEPH_CONF" -if [ -z "$conf" ] ; then - echo "Setting conf to /etc/ceph/ceph.conf" - conf="/etc/ceph/ceph.conf" -else - echo "conf is set to $conf" -fi - -ld_lib_path="$LD_LIBRARY_PATH" -if [ -z "$ld_lib_path" ] ; then - echo "Setting ld_lib_path to /usr/lib/jni:/usr/lib64" - ld_lib_path="/usr/lib/jni:/usr/lib64" -else - echo "ld_lib_path was set to $ld_lib_path" -fi - -ceph_java="$CEPH_JAVA_PATH" -if [ -z "$ceph_java" ] ; then - echo "Setting ceph_java to /usr/share/java" - ceph_java="/usr/share/java" -else - echo "ceph_java was set to $ceph_java" -fi - -command="java -DCEPH_CONF_FILE=$conf -Djava.library.path=$ld_lib_path -cp /usr/share/java/junit4.jar:$ceph_java/libcephfs.jar:$ceph_java/libcephfs-test.jar org.junit.runner.JUnitCore com.ceph.fs.CephAllTests" - -echo "----------------------" -echo $command -echo "----------------------" - -$command - -echo "completed libcephfs-java tests" - -exit 0 diff -Nru ceph-12.2.11/qa/workunits/rados/test_health_warnings.sh ceph-12.2.12/qa/workunits/rados/test_health_warnings.sh --- ceph-12.2.11/qa/workunits/rados/test_health_warnings.sh 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/qa/workunits/rados/test_health_warnings.sh 2019-04-11 12:33:50.000000000 +0000 @@ -7,6 +7,7 @@ ceph osd setcrushmap -i crushmap ceph osd tree ceph tell osd.* injectargs --osd_max_markdown_count 1024 --osd_max_markdown_period 1 +ceph osd set noout wait_for_healthy() { while ceph health | grep down diff -Nru 
ceph-12.2.11/src/auth/Crypto.cc ceph-12.2.12/src/auth/Crypto.cc --- ceph-12.2.11/src/auth/Crypto.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/auth/Crypto.cc 2019-04-11 12:33:50.000000000 +0000 @@ -291,8 +291,9 @@ keyItem.type = siBuffer; keyItem.data = (unsigned char*)secret.c_str(); keyItem.len = secret.length(); - key = PK11_ImportSymKey(slot, mechanism, PK11_OriginUnwrap, CKA_ENCRYPT, - &keyItem, NULL); + using ceph::crypto::PK11_ImportSymKey_FIPS; + key = PK11_ImportSymKey_FIPS(slot, mechanism, PK11_OriginUnwrap, CKA_ENCRYPT, + &keyItem, NULL); if (!key) { err << "cannot convert AES key for NSS: " << PR_GetError(); return -1; diff -Nru ceph-12.2.11/src/ceph-disk/run-tox.sh ceph-12.2.12/src/ceph-disk/run-tox.sh --- ceph-12.2.11/src/ceph-disk/run-tox.sh 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-disk/run-tox.sh 2019-04-11 12:33:50.000000000 +0000 @@ -16,7 +16,7 @@ # # run from the ceph-disk directory or from its parent -: ${CEPH_DISK_VIRTUALENV:=/tmp/ceph-disk-virtualenv} +: ${CEPH_DISK_VIRTUALENV:=$CEPH_BUILD_DIR/ceph-disk-virtualenv} test -d ceph-disk && cd ceph-disk if [ -e tox.ini ]; then diff -Nru ceph-12.2.11/src/ceph.in ceph-12.2.12/src/ceph.in --- ceph-12.2.11/src/ceph.in 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph.in 2019-04-11 12:33:50.000000000 +0000 @@ -21,7 +21,9 @@ from __future__ import print_function import codecs +import grp import os +import pwd import sys import platform @@ -270,7 +272,10 @@ help='input file, or "-" for stdin') parser.add_argument('-o', '--out-file', dest='output_file', help='output file, or "-" for stdout') - + parser.add_argument('--setuser', dest='setuser', + help='set user file permission') + parser.add_argument('--setgroup', dest='setgroup', + help='set group file permission') parser.add_argument('--id', '--user', dest='client_id', help='client id for authentication') parser.add_argument('--name', '-n', dest='client_name', @@ -990,6 +995,20 @@ except Exception as e: 
print('Can\'t open output file {0}: {1}'.format(parsed_args.output_file, e), file=sys.stderr) return 1 + if parsed_args.setuser: + try: + ownerid = pwd.getpwnam(parsed_args.setuser).pw_uid + os.fchown(outf.fileno(), ownerid, -1) + except OSError as e: + print('Failed to change user ownership of {0} to {1}: {2}'.format(outf, parsed_args.setuser, e)) + return 1 + if parsed_args.setgroup: + try: + groupid = grp.getgrnam(parsed_args.setgroup).gr_gid + os.fchown(outf.fileno(), -1, groupid) + except OSError as e: + print('Failed to change group ownership of {0} to {1}: {2}'.format(outf, parsed_args.setgroup, e)) + return 1 # -s behaves like a command (ceph status). if parsed_args.status: diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/devices/simple/activate.py ceph-12.2.12/src/ceph-volume/ceph_volume/devices/simple/activate.py --- ceph-12.2.11/src/ceph-volume/ceph_volume/devices/simple/activate.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/devices/simple/activate.py 2019-04-11 12:33:50.000000000 +0000 @@ -1,6 +1,7 @@ from __future__ import print_function import argparse import base64 +import glob import json import logging import os @@ -231,6 +232,12 @@ help='The FSID of the OSD, similar to a SHA1' ) parser.add_argument( + '--all', + help='Activate all OSDs with a OSD JSON config', + action='store_true', + default=False, + ) + parser.add_argument( '--file', help='The path to a JSON file, from a scanned OSD' ) @@ -244,7 +251,7 @@ print(sub_command_help) return args = parser.parse_args(self.argv) - if not args.file: + if not args.file and not args.all: if not args.osd_id and not args.osd_fsid: terminal.error('ID and FSID are required to find the right OSD to activate') terminal.error('from a scanned OSD location in /etc/ceph/osd/') @@ -253,13 +260,22 @@ # implicitly indicate that it would be possible to activate a json file # at a non-default location which would not work at boot time if the # custom location is not exposed through 
an ENV var + self.skip_systemd = args.skip_systemd json_dir = os.environ.get('CEPH_VOLUME_SIMPLE_JSON_DIR', '/etc/ceph/osd/') - if args.file: - json_config = args.file + if args.all: + if args.file or args.osd_id: + mlogger.warn('--all was passed, ignoring --file and ID/FSID arguments') + json_configs = glob.glob('{}/*.json'.format(json_dir)) + for json_config in json_configs: + mlogger.info('activating OSD specified in {}'.format(json_config)) + args.json_config = json_config + self.activate(args) else: - json_config = os.path.join(json_dir, '%s-%s.json' % (args.osd_id, args.osd_fsid)) - if not os.path.exists(json_config): - raise RuntimeError('Expected JSON config path not found: %s' % json_config) - args.json_config = json_config - self.skip_systemd = args.skip_systemd - self.activate(args) + if args.file: + json_config = args.file + else: + json_config = os.path.join(json_dir, '%s-%s.json' % (args.osd_id, args.osd_fsid)) + if not os.path.exists(json_config): + raise RuntimeError('Expected JSON config path not found: %s' % json_config) + args.json_config = json_config + self.activate(args) diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/devices/simple/scan.py ceph-12.2.12/src/ceph-volume/ceph_volume/devices/simple/scan.py --- ceph-12.2.11/src/ceph-volume/ceph_volume/devices/simple/scan.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/devices/simple/scan.py 2019-04-11 12:33:50.000000000 +0000 @@ -7,6 +7,7 @@ from textwrap import dedent from ceph_volume import decorators, terminal, conf from ceph_volume.api import lvm +from ceph_volume.systemd import systemctl from ceph_volume.util import arg_validators, system, disk, encryption from ceph_volume.util.device import Device @@ -40,7 +41,7 @@ class Scan(object): - help = 'Capture metadata from an OSD data partition or directory' + help = 'Capture metadata from all running ceph-disk OSDs, OSD data partition or directory' def __init__(self, argv): self.argv = argv @@ -283,7 +284,7 
@@ def main(self): sub_command_help = dedent(""" - Scan an OSD directory (or data device) for files and configurations + Scan running OSDs, an OSD directory (or data device) for files and configurations that will allow to take over the management of the OSD. Scanned OSDs will get their configurations stored in @@ -298,13 +299,19 @@ /etc/ceph/osd/0-a9d50838-e823-43d6-b01f-2f8d0a77afc2.json - To a scan an existing, running, OSD: + To scan all running OSDs: + + ceph-volume simple scan + + To a scan a specific running OSD: ceph-volume simple scan /var/lib/ceph/osd/{cluster}-{osd id} And to scan a device (mounted or unmounted) that has OSD data in it, for example /dev/sda1 ceph-volume simple scan /dev/sda1 + + Scanning a device or directory that belongs to an OSD not created by ceph-disk will be ingored. """) parser = argparse.ArgumentParser( prog='ceph-volume simple scan', @@ -329,25 +336,40 @@ metavar='OSD_PATH', type=arg_validators.OSDPath(), nargs='?', + default=None, help='Path to an existing OSD directory or OSD data partition' ) - if len(self.argv) == 0: - print(sub_command_help) - return - args = parser.parse_args(self.argv) - device = Device(args.osd_path) - if device.is_partition: - if device.ceph_disk.type != 'data': - label = device.ceph_disk.partlabel - msg = 'Device must be the ceph data partition, but PARTLABEL reported: "%s"' % label - raise RuntimeError(msg) + paths = [] + if args.osd_path: + paths.append(args.osd_path) + else: + osd_ids = systemctl.get_running_osd_ids() + for osd_id in osd_ids: + paths.append("/var/lib/ceph/osd/{}-{}".format( + conf.cluster, + osd_id, + )) # Capture some environment status, so that it can be reused all over self.device_mounts = system.get_mounts(devices=True) self.path_mounts = system.get_mounts(paths=True) - self.encryption_metadata = encryption.legacy_encrypted(args.osd_path) - self.is_encrypted = self.encryption_metadata['encrypted'] - self.scan(args) + for path in paths: + args.osd_path = path + device = 
Device(args.osd_path) + if device.is_partition: + if device.ceph_disk.type != 'data': + label = device.ceph_disk.partlabel + msg = 'Device must be the ceph data partition, but PARTLABEL reported: "%s"' % label + raise RuntimeError(msg) + + self.encryption_metadata = encryption.legacy_encrypted(args.osd_path) + self.is_encrypted = self.encryption_metadata['encrypted'] + + device = Device(self.encryption_metadata['device']) + if not device.is_ceph_disk_member: + terminal.warning("Ignoring %s because it's not a ceph-disk created osd." % path) + else: + self.scan(args) diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/systemd/systemctl.py ceph-12.2.12/src/ceph-volume/ceph_volume/systemd/systemctl.py --- ceph-12.2.11/src/ceph-volume/ceph_volume/systemd/systemctl.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/systemd/systemctl.py 2019-04-11 12:33:50.000000000 +0000 @@ -1,8 +1,11 @@ """ Utilities to control systemd units """ +import logging + from ceph_volume import process +logger = logging.getLogger(__name__) def start(unit): process.run(['systemctl', 'start', unit]) @@ -34,6 +37,26 @@ ) return rc == 0 +def get_running_osd_ids(): + out, err, rc = process.call([ + 'systemctl', + 'show', + '--no-pager', + '--property=Id', + '--state=running', + 'ceph-osd@*', + ]) + osd_ids = [] + if rc == 0: + for line in out: + if line: + # example line looks like: Id=ceph-osd@1.service + try: + osd_id = line.split("@")[1].split(".service")[0] + osd_ids.append(osd_id) + except (IndexError, TypeError): + logger.warning("Failed to parse output from systemctl: %s", line) + return osd_ids def start_osd(id_): return start(osd_unit % id_) diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py ceph-12.2.12/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py 2019-01-30 15:51:26.000000000 +0000 +++ 
ceph-12.2.12/src/ceph-volume/ceph_volume/tests/devices/simple/test_activate.py 2019-04-11 12:33:50.000000000 +0000 @@ -22,6 +22,26 @@ stdout, stderr = capsys.readouterr() assert 'Activate OSDs by mounting devices previously configured' in stdout + def test_activate_all(self, is_root, monkeypatch): + ''' + make sure Activate calls activate for each file returned by glob + ''' + mocked_glob = [] + def mock_glob(glob): + path = os.path.dirname(glob) + mocked_glob.extend(['{}/{}.json'.format(path, file_) for file_ in + ['1', '2', '3']]) + return mocked_glob + activate_files = [] + def mock_activate(self, args): + activate_files.append(args.json_config) + monkeypatch.setattr('glob.glob', mock_glob) + monkeypatch.setattr(activate.Activate, 'activate', mock_activate) + activate.Activate(['--all']).main() + assert activate_files == mocked_glob + + + class TestEnableSystemdUnits(object): diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py ceph-12.2.12/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/devices/simple/test_scan.py 2019-04-11 12:33:50.000000000 +0000 @@ -3,14 +3,6 @@ from ceph_volume.devices.simple import scan -class TestScan(object): - - def test_main_spits_help_with_no_arguments(self, capsys): - scan.Scan([]).main() - stdout, stderr = capsys.readouterr() - assert 'Scan an OSD directory (or data device) for files' in stdout - - class TestGetContents(object): def test_multiple_lines_are_left_as_is(self, tmpfile): diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini 
2019-04-11 12:33:50.000000000 +0000 @@ -48,20 +48,20 @@ # prepare nodes for testing with testinfra ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml - # test cluster state using ceph-ansible tests - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + # test cluster state using testinfra + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests # reboot all vms - attempt bash {toxinidir}/../scripts/vagrant_reload.sh {env:VAGRANT_UP_FLAGS:"--no-provision"} {posargs:--provider=virtualbox} # retest to ensure cluster came back up correctly after rebooting - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests # destroy an OSD, zap it's device and recreate it using it's ID ansible-playbook -vv -i {changedir}/hosts {changedir}/test.yml # retest to ensure cluster came back up correctly - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests # test zap OSDs by ID ansible-playbook -vv -i {changedir}/hosts {changedir}/test_zap.yml diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/test.yml ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/test.yml --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/test.yml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/create/test.yml 2019-04-11 12:33:50.000000000 +0000 @@ -98,6 +98,11 @@ environment: 
CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml 2019-04-11 12:33:50.000000000 +0000 @@ -93,6 +93,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/test.yml ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/test.yml --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/test.yml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/create/test.yml 2019-04-11 12:33:50.000000000 +0000 @@ -119,6 +119,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml 2019-01-30 15:51:26.000000000 +0000 +++ 
ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml 2019-04-11 12:33:50.000000000 +0000 @@ -97,6 +97,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml 2019-04-11 12:33:50.000000000 +0000 @@ -98,6 +98,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml 2019-04-11 12:33:50.000000000 +0000 @@ -119,6 +119,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini 2019-01-30 15:51:26.000000000 +0000 +++ 
ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini 2019-04-11 12:33:50.000000000 +0000 @@ -56,19 +56,19 @@ # prepare nodes for testing with testinfra ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml - # test cluster state using ceph-ansible tests - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + # test cluster state using testinfra + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests # reboot all vms - attempt bash {toxinidir}/../scripts/vagrant_reload.sh {env:VAGRANT_UP_FLAGS:"--no-provision"} {posargs:--provider=virtualbox} # retest to ensure cluster came back up correctly after rebooting - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests # destroy an OSD, zap it's device and recreate it using it's ID ansible-playbook -vv -i {changedir}/hosts {changedir}/test.yml # retest to ensure cluster came back up correctly - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests vagrant destroy --force diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/test.yml ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/test.yml --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/test.yml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/create/test.yml 2019-04-11 12:33:50.000000000 +0000 @@ -98,6 +98,11 @@ environment: 
CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml 2019-04-11 12:33:50.000000000 +0000 @@ -93,6 +93,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/test.yml ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/test.yml --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/test.yml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/create/test.yml 2019-04-11 12:33:50.000000000 +0000 @@ -119,6 +119,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml 2019-01-30 15:51:26.000000000 +0000 +++ 
ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml 2019-04-11 12:33:50.000000000 +0000 @@ -97,6 +97,11 @@ environment: CEPH_VOLUME_DEBUG: 1 + - name: node inventory + command: "ceph-volume inventory" + environment: + CEPH_VOLUME_DEBUG: 1 + - name: list all OSDs command: "ceph-volume lvm list" environment: diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/dmcrypt-luks/test.yml ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/dmcrypt-luks/test.yml --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/dmcrypt-luks/test.yml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/bluestore/dmcrypt-luks/test.yml 2019-04-11 12:33:50.000000000 +0000 @@ -4,28 +4,12 @@ become: yes tasks: - - name: list all OSD directories - find: - paths: /var/lib/ceph/osd - file_type: directory - register: osd_paths - - - name: scan all OSD directories - command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}" + - name: scan all running OSDs + command: "ceph-volume --cluster={{ cluster }} simple scan" environment: CEPH_VOLUME_DEBUG: 1 - with_items: - - "{{ osd_paths.files }}" - - - name: list all OSD JSON files - find: - paths: /etc/ceph/osd - file_type: file - register: osd_configs - name: activate all scanned OSDs - command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}" + command: "ceph-volume --cluster={{ cluster }} simple activate --all" environment: CEPH_VOLUME_DEBUG: 1 - with_items: - - "{{ osd_configs.files }}" diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml 
2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/simple/centos7/filestore/activate/test.yml 2019-04-11 12:33:50.000000000 +0000 @@ -24,8 +24,6 @@ register: osd_configs - name: activate all scanned OSDs - command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}" + command: "ceph-volume --cluster={{ cluster }} simple activate --all" environment: CEPH_VOLUME_DEBUG: 1 - with_items: - - "{{ osd_configs.files }}" diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini 2019-04-11 12:33:50.000000000 +0000 @@ -46,8 +46,8 @@ # prepare nodes for testing with testinfra ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/setup.yml - # test cluster state using ceph-ansible tests - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + # test cluster state testinfra + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests # make ceph-volume simple take over all the OSDs that got deployed, disabling ceph-disk ansible-playbook -vv -i {changedir}/hosts {changedir}/test.yml @@ -59,6 +59,6 @@ sleep 120 # retest to ensure cluster came back up correctly after rebooting - testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests + py.test -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {toxinidir}/../tests vagrant destroy --force diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml 
ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/simple/xenial/filestore/activate/test.yml 2019-04-11 12:33:50.000000000 +0000 @@ -4,28 +4,12 @@ become: yes tasks: - - name: list all OSD directories - find: - paths: /var/lib/ceph/osd - file_type: directory - register: osd_paths - - - name: scan all OSD directories - command: "ceph-volume --cluster={{ cluster }} simple scan {{ item.path }}" + - name: scan all running OSDs + command: "ceph-volume --cluster={{ cluster }} simple scan" environment: CEPH_VOLUME_DEBUG: 1 - with_items: - - "{{ osd_paths.files }}" - - - name: list all OSD JSON files - find: - paths: /etc/ceph/osd - file_type: file - register: osd_configs - name: activate all scanned OSDs - command: "ceph-volume --cluster={{ cluster }} simple activate --file {{ item.path }}" + command: "ceph-volume --cluster={{ cluster }} simple activate --all" environment: CEPH_VOLUME_DEBUG: 1 - with_items: - - "{{ osd_configs.files }}" diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/tests/conftest.py ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/tests/conftest.py --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/tests/conftest.py 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/tests/conftest.py 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,103 @@ +import pytest +import os + + +@pytest.fixture() +def node(host, request): + """ This fixture represents a single node in the ceph cluster. Using the + host.ansible fixture provided by testinfra it can access all the ansible + variables provided to it by the specific test scenario being ran. 
+ + You must include this fixture on any tests that operate on specific type + of node because it contains the logic to manage which tests a node + should run. + """ + ansible_vars = host.ansible.get_variables() + # tox/jenkins/user will pass in this environment variable. we need to do it this way + # because testinfra does not collect and provide ansible config passed in + # from using --extra-vars + ceph_dev_branch = os.environ.get("CEPH_DEV_BRANCH", "master") + group_names = ansible_vars["group_names"] + num_osd_ports = 4 + if ceph_dev_branch in ['luminous', 'mimic']: + num_osd_ports = 2 + + # capture the initial/default state + test_is_applicable = False + for marker in request.node.iter_markers(): + if marker.name in group_names or marker.name == 'all': + test_is_applicable = True + break + # Check if any markers on the test method exist in the nodes group_names. + # If they do not, this test is not valid for the node being tested. + if not test_is_applicable: + reason = "%s: Not a valid test for node type: %s" % ( + request.function, group_names) + pytest.skip(reason) + + osd_ids = [] + osds = [] + cluster_address = "" + # I can assume eth1 because I know all the vagrant + # boxes we test with use that interface + address = host.interface("eth1").addresses[0] + subnet = ".".join(ansible_vars["public_network"].split(".")[0:-1]) + num_mons = len(ansible_vars["groups"]["mons"]) + num_osds = len(ansible_vars.get("devices", [])) + if not num_osds: + num_osds = len(ansible_vars.get("lvm_volumes", [])) + osds_per_device = ansible_vars.get("osds_per_device", 1) + num_osds = num_osds * osds_per_device + + # If number of devices doesn't map to number of OSDs, allow tests to define + # that custom number, defaulting it to ``num_devices`` + num_osds = ansible_vars.get('num_osds', num_osds) + cluster_name = ansible_vars.get("cluster", "ceph") + conf_path = "/etc/ceph/{}.conf".format(cluster_name) + if "osds" in group_names: + # I can assume eth2 because I know all the 
vagrant + # boxes we test with use that interface. OSDs are the only + # nodes that have this interface. + cluster_address = host.interface("eth2").addresses[0] + cmd = host.run('sudo ls /var/lib/ceph/osd/ | sed "s/.*-//"') + if cmd.rc == 0: + osd_ids = cmd.stdout.rstrip("\n").split("\n") + osds = osd_ids + + data = dict( + address=address, + subnet=subnet, + vars=ansible_vars, + osd_ids=osd_ids, + num_mons=num_mons, + num_osds=num_osds, + num_osd_ports=num_osd_ports, + cluster_name=cluster_name, + conf_path=conf_path, + cluster_address=cluster_address, + osds=osds, + ) + return data + + +def pytest_collection_modifyitems(session, config, items): + for item in items: + test_path = item.location[0] + if "mon" in test_path: + item.add_marker(pytest.mark.mons) + elif "osd" in test_path: + item.add_marker(pytest.mark.osds) + elif "mds" in test_path: + item.add_marker(pytest.mark.mdss) + elif "mgr" in test_path: + item.add_marker(pytest.mark.mgrs) + elif "rbd-mirror" in test_path: + item.add_marker(pytest.mark.rbdmirrors) + elif "rgw" in test_path: + item.add_marker(pytest.mark.rgws) + elif "nfs" in test_path: + item.add_marker(pytest.mark.nfss) + elif "iscsi" in test_path: + item.add_marker(pytest.mark.iscsigws) + else: + item.add_marker(pytest.mark.all) diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/tests/osd/test_osds.py ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/tests/osd/test_osds.py --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/functional/tests/osd/test_osds.py 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/functional/tests/osd/test_osds.py 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,60 @@ +import json + + +class TestOSDs(object): + + def test_ceph_osd_package_is_installed(self, node, host): + assert host.package("ceph-osd").is_installed + + def test_osds_listen_on_public_network(self, node, host): + # TODO: figure out way to paramaterize this test + nb_port = (node["num_osds"] * 
node["num_osd_ports"]) + assert host.check_output( + "netstat -lntp | grep ceph-osd | grep %s | wc -l" % (node["address"])) == str(nb_port) # noqa E501 + + def test_osds_listen_on_cluster_network(self, node, host): + # TODO: figure out way to paramaterize this test + nb_port = (node["num_osds"] * node["num_osd_ports"]) + assert host.check_output("netstat -lntp | grep ceph-osd | grep %s | wc -l" % # noqa E501 + (node["cluster_address"])) == str(nb_port) + + def test_osd_services_are_running(self, node, host): + # TODO: figure out way to paramaterize node['osds'] for this test + for osd in node["osds"]: + assert host.service("ceph-osd@%s" % osd).is_running + + def test_osd_are_mounted(self, node, host): + # TODO: figure out way to paramaterize node['osd_ids'] for this test + for osd_id in node["osd_ids"]: + osd_path = "/var/lib/ceph/osd/{cluster}-{osd_id}".format( + cluster=node["cluster_name"], + osd_id=osd_id, + ) + assert host.mount_point(osd_path).exists + + def test_ceph_volume_is_installed(self, node, host): + host.exists('ceph-volume') + + def test_ceph_volume_systemd_is_installed(self, node, host): + host.exists('ceph-volume-systemd') + + def _get_osd_id_from_host(self, node, osd_tree): + children = [] + for n in osd_tree['nodes']: + if n['name'] == node['vars']['inventory_hostname'] and n['type'] == 'host': # noqa E501 + children = n['children'] + return children + + def _get_nb_up_osds_from_ids(self, node, osd_tree): + nb_up = 0 + ids = self._get_osd_id_from_host(node, osd_tree) + for n in osd_tree['nodes']: + if n['id'] in ids and n['status'] == 'up': + nb_up += 1 + return nb_up + + def test_all_osds_are_up_and_in(self, node, host): + cmd = "sudo ceph --cluster={cluster} --connect-timeout 5 --keyring /var/lib/ceph/bootstrap-osd/{cluster}.keyring -n client.bootstrap-osd osd tree -f json".format( # noqa E501 + cluster=node["cluster_name"]) + output = json.loads(host.check_output(cmd)) + assert node["num_osds"] == self._get_nb_up_osds_from_ids(node, output) 
diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/systemd/test_systemctl.py ceph-12.2.12/src/ceph-volume/ceph_volume/tests/systemd/test_systemctl.py --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/systemd/test_systemctl.py 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/systemd/test_systemctl.py 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,21 @@ +import pytest +from ceph_volume.systemd import systemctl + +class TestSystemctl(object): + + @pytest.mark.parametrize("stdout,expected", [ + (['Id=ceph-osd@1.service', '', 'Id=ceph-osd@2.service'], ['1','2']), + (['Id=ceph-osd1.service',], []), + (['Id=ceph-osd@1'], ['1']), + ([], []), + ]) + def test_get_running_osd_ids(self, stub_call, stdout, expected): + stub_call((stdout, [], 0)) + osd_ids = systemctl.get_running_osd_ids() + assert osd_ids == expected + + def test_returns_empty_list_on_nonzero_return_code(self, stub_call): + stdout = ['Id=ceph-osd@1.service', '', 'Id=ceph-osd@2.service'] + stub_call((stdout, [], 1)) + osd_ids = systemctl.get_running_osd_ids() + assert osd_ids == [] diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/util/test_device.py ceph-12.2.12/src/ceph-volume/ceph_volume/tests/util/test_device.py --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/util/test_device.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/util/test_device.py 2019-04-11 12:33:50.000000000 +0000 @@ -43,6 +43,42 @@ disk = device.Device("/dev/sda") assert disk.is_device is True + def test_device_is_rotational(self, device_info, pvolumes): + data = {"/dev/sda": {"rotational": "1"}} + lsblk = {"TYPE": "device"} + device_info(devices=data, lsblk=lsblk) + disk = device.Device("/dev/sda") + assert disk.rotational + + def test_device_is_not_rotational(self, device_info, pvolumes): + data = {"/dev/sda": {"rotational": "0"}} + lsblk = {"TYPE": "device"} + device_info(devices=data, lsblk=lsblk) + disk = device.Device("/dev/sda") + assert not 
disk.rotational + + def test_device_is_rotational_lsblk(self, device_info, pvolumes): + data = {"/dev/sda": {"foo": "bar"}} + lsblk = {"TYPE": "device", "ROTA": "1"} + device_info(devices=data, lsblk=lsblk) + disk = device.Device("/dev/sda") + assert disk.rotational + + def test_device_is_not_rotational_lsblk(self, device_info, pvolumes): + data = {"/dev/sda": {"rotational": "0"}} + lsblk = {"TYPE": "device", "ROTA": "0"} + device_info(devices=data, lsblk=lsblk) + disk = device.Device("/dev/sda") + assert not disk.rotational + + def test_device_is_rotational_defaults_true(self, device_info, pvolumes): + # rotational will default true if no info from sys_api or lsblk is found + data = {"/dev/sda": {"foo": "bar"}} + lsblk = {"TYPE": "device", "foo": "bar"} + device_info(devices=data, lsblk=lsblk) + disk = device.Device("/dev/sda") + assert disk.rotational + def test_disk_is_device(self, device_info, pvolumes): data = {"/dev/sda": {"foo": "bar"}} lsblk = {"TYPE": "disk"} diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/tests/util/test_disk.py ceph-12.2.12/src/ceph-volume/ceph_volume/tests/util/test_disk.py --- ceph-12.2.11/src/ceph-volume/ceph_volume/tests/util/test_disk.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/tests/util/test_disk.py 2019-04-11 12:33:50.000000000 +0000 @@ -267,28 +267,6 @@ assert len(result) == 1 assert result == [ceph_data_path] - def test_sda1_partition(self, tmpfile, tmpdir): - block_path, dev_path, mapper_path = self.setup_paths(tmpdir) - block_sda_path = os.path.join(block_path, 'sda') - block_sda1_path = os.path.join(block_sda_path, 'sda1') - block_sda1_holders = os.path.join(block_sda1_path, 'holders') - dev_sda_path = os.path.join(dev_path, 'sda') - dev_sda1_path = os.path.join(dev_path, 'sda1') - os.makedirs(block_sda_path) - os.makedirs(block_sda1_path) - os.makedirs(dev_sda1_path) - os.makedirs(block_sda1_holders) - os.makedirs(dev_sda_path) - tmpfile('size', '1024', directory=block_sda_path) - 
tmpfile('partition', '1', directory=block_sda1_path) - result = disk.get_devices( - _sys_block_path=block_path, - _dev_path=dev_path, - _mapper_path=mapper_path) - assert dev_sda_path in list(result.keys()) - assert '/dev/sda1' in list(result.keys()) - assert result['/dev/sda1']['holders'] == [] - def test_sda_size(self, tmpfile, tmpdir): block_path, dev_path, mapper_path = self.setup_paths(tmpdir) block_sda_path = os.path.join(block_path, 'sda') diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/util/device.py ceph-12.2.12/src/ceph-volume/ceph_volume/util/device.py --- ceph-12.2.11/src/ceph-volume/ceph_volume/util/device.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/util/device.py 2019-04-11 12:33:50.000000000 +0000 @@ -110,6 +110,14 @@ if not sys_info.devices: sys_info.devices = disk.get_devices() self.sys_api = sys_info.devices.get(self.abspath, {}) + if not self.sys_api: + # if no device was found check if we are a partition + partname = self.abspath.split('/')[-1] + for device, info in sys_info.devices.items(): + part = info['partitions'].get(partname, {}) + if part: + self.sys_api = part + break # start with lvm since it can use an absolute or relative path lv = lvm.get_lv_from_argument(self.path) @@ -257,7 +265,12 @@ @property def rotational(self): - return self.sys_api['rotational'] == '1' + rotational = self.sys_api.get('rotational') + if rotational is None: + # fall back to lsblk if not found in sys_api + # default to '1' if no value is found with lsblk either + rotational = self.disk_api.get('ROTA', '1') + return rotational == '1' @property def model(self): diff -Nru ceph-12.2.11/src/ceph-volume/ceph_volume/util/disk.py ceph-12.2.12/src/ceph-volume/ceph_volume/util/disk.py --- ceph-12.2.11/src/ceph-volume/ceph_volume/util/disk.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/ceph_volume/util/disk.py 2019-04-11 12:33:50.000000000 +0000 @@ -815,9 +815,5 @@ metadata['path'] = diskname 
metadata['locked'] = is_locked_raw_device(metadata['path']) - for part_name, part_metadata in metadata['partitions'].items(): - part_abspath = '/dev/%s' % part_name - device_facts[part_abspath] = part_metadata - device_facts[diskname] = metadata return device_facts diff -Nru ceph-12.2.11/src/ceph-volume/tox.ini ceph-12.2.12/src/ceph-volume/tox.ini --- ceph-12.2.11/src/ceph-volume/tox.ini 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/ceph-volume/tox.ini 2019-04-11 12:33:50.000000000 +0000 @@ -4,7 +4,7 @@ [testenv] deps= pytest -commands=py.test -v {posargs:ceph_volume/tests} +commands=py.test -v {posargs:ceph_volume/tests} --ignore=ceph_volume/tests/functional [testenv:flake8] deps=flake8 diff -Nru ceph-12.2.11/src/civetweb/examples/websocket_client/ssl/server.key.orig ceph-12.2.12/src/civetweb/examples/websocket_client/ssl/server.key.orig --- ceph-12.2.11/src/civetweb/examples/websocket_client/ssl/server.key.orig 2017-11-02 17:58:06.000000000 +0000 +++ ceph-12.2.12/src/civetweb/examples/websocket_client/ssl/server.key.orig 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ ------BEGIN RSA PRIVATE KEY----- -Proc-Type: 4,ENCRYPTED -DEK-Info: DES-EDE3-CBC,89778A6427F05D4A - -4aXqO/8oCHVfMLB+a1DfjbXyEddjbd7nB+YVFLPKy68Tam9PRTvC1zRHBet59ll0 -1w7R8tXR6/xH7HRhBeqDHCcuvBhtw3xGxtXWv54WBFhzuq7TvKOAaCFl++cw/JHq -PCS0rAaYnqF2MAgMi7QBjZKmHFHL43Gy60VfOrB0mmOdxqqXA0NBFC2uEd7Z/MAx -S2A85bNJJKQaWEeDThP1u0OOlNCq99lkLJ31jiOH7ntdL0/vqcbZ+PUtdPLwAG4L -1GUHuiC2v5FvDlPiejMk2dvrxCNpcu2e3tQKHpg2KcsTVrpB7EVzRSazln4HywUZ -EJfBvxqqrS7plImZgj4LXSnln0JPuBb+aHnhKIFvisjYSwqDGJnnp/OaD7YdRhYh -UCcL011Ge+yUbRipeAmHdtJlSUSdB14KWq+WdIX/KgCRGx06QZm9s1PBLH+fww+I -EL3A/LFX0a5KUHkCp29akYYv9bUYaQ79Nt7BlaEON+/SW3pJMbGr+nx8aqogr0Yo -SJ/Zz5TSDBhecRjbCDGkT6DizVZ8cbm2xl8QLBd0H+ZA6uYMgfpAOJGrJx3Nm4Lv -prEApgFtjSrsQDGYHAcmDMW1UWOVHuNp7BSvwUze9Ftnzr/jlpdzES2rhgMyGhg1 -0Szbsfs3vgw4iM83LFJXza07GQJzF8gRF79dY5JiQX/sOKUprA6Lofk631jE0G8r -3z59cxblaq9y7EgFsE944Gk7/HIEimBRiqIZzGVJVukD0itynQ+XmYTdbyH1lpvi 
-c0ZheZPUoGwUW9RYy+nle5gEDFyZWXcCAuJasQvDBXt//r/bso3ZpA== ------END RSA PRIVATE KEY----- diff -Nru ceph-12.2.11/src/civetweb/resources/cert/client.key.orig ceph-12.2.12/src/civetweb/resources/cert/client.key.orig --- ceph-12.2.11/src/civetweb/resources/cert/client.key.orig 2017-11-02 17:58:06.000000000 +0000 +++ ceph-12.2.12/src/civetweb/resources/cert/client.key.orig 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ ------BEGIN RSA PRIVATE KEY----- -Proc-Type: 4,ENCRYPTED -DEK-Info: DES-EDE3-CBC,74B669EA97409DB5 - -U8mySuY/I28r8dygZbyHF9K5VFPekhar4zgN2p6wUyrIT9UvA0Y75VE6Pu55+FZS -JEiR+5btbONt2Lf7z52zi8bv8cb+IJryjSoGkk9Klesmwc9qkUxtuZosdIoZKGFl -SgSNecK7QYOu66PK4GOi47CFsKuKT4pR8A8Kt11PYTpXDNGomcsdS3DNEbpxvj7F -/D/1V24IMu+cknL3r6wwL0stB3idwS+4Oq/JLosKHC5mB6+Pu285K6/NWUo5FqLy -WgzCWzFzzQy3vBce30HOf2gCUJ++2JKoBa8wdj06ei0OTz6oFWAvftv1fTen6cyW -LG5uAmNPpv4PmtTpOtNJtd8VFpShxiCbYm772MXiDRNiLL9iMsS9OtgTkxHyqRqr -i8RRKzZCFzf4+xTGxO6GkkFV0/W/PM+TnvFWoWOviCjJJOFGwrQAUzRFce3UAU8V -sSmvnE0mGvREQAUiw15onGaHKT/ivzFFutgghrcrjpGH55j/zp5gxD+WDeDqAgNA -RPk0l63D9CrjyTuyTX1H35V1+EZ9YYP5tZ3wGn6i3WCC3WjHqDg5EZHRprjvPw7p -rfurs33qHUon42aM1G/dJ+jtn4993RdCvCztxW6aBp+nLEEROMA/0HCZJeM9lE7L -nWAy+jkn/6wRoATa01fEPHozju0HQhCrPcxjrJ8tIVgI1iEL2xw7STlvo6BZcnP2 -oGLMiEk5gmHCOonh+taLkFhKP+F0cSZJlJcmEr6YYzhh7FoR+sKEb2Cx2n2ySuhv -LYh1Wn4T0Xmau7OFX2Pc+d5zBaW6lYn/ZUw8GbaDqNd3sT/UICC0Ww== ------END RSA PRIVATE KEY----- diff -Nru ceph-12.2.11/src/civetweb/resources/cert/server.key.orig ceph-12.2.12/src/civetweb/resources/cert/server.key.orig --- ceph-12.2.11/src/civetweb/resources/cert/server.key.orig 2017-11-02 17:58:06.000000000 +0000 +++ ceph-12.2.12/src/civetweb/resources/cert/server.key.orig 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ ------BEGIN RSA PRIVATE KEY----- -Proc-Type: 4,ENCRYPTED -DEK-Info: DES-EDE3-CBC,6BB8D380C900AA8B - -+p0gAY1fa9vtz8lmaTgJClsSVhD9Aw/SL0raL8w90e3yYFnT948s5xLPxoz+c3V1 -CkdWcatZO72G1VVOlg1NyjYmujBntkvMF4DiAGAg9l/u19wrvYNINurb86uPsZ2P 
-9S6SlEzIkuRDdnJXXwT/bpgEYoTVIpOQYMjzIcdYbseaYhy7n9uZCXQLgXChjiuf -LNbArHcB5tJC8QK9DCv4iEV6U8udSd85gs2xs4dBs4dz2jpgD3GLpSMd6+LNSNOV -AqWZM6Xa0PtM9Mlz2JkX+misfY5wR2lqs2z6f6JFIZsLjr3buqUJVNXRTcSLZ7A2 -e/RgE8wC3VVX9ij+7yh3dBKNorJF1nLcSkfTt22OXyppbwIwHKI1RYPc6a6GNZEW -ecZlnuHueUc62e8L8lm6dPtJ5Z4SR6hBBqPOBxNxgEGvt7Gc3jPO7SkqmXatVJ2k -S3HI2umA0f3grolkeJGXlaabRb0z+C13nvBSEDog2Sg2uFu3gwEOXsCfI+EaOghp -earIkirAlasVtFGKwUn0eMVLBrsxvr1yz6y7PnY63kTVkh2JPoOQ/hCO+9bfdIvS -7Sa+pbL29OXSNnt/WDsErKcMTPPAstuz0an1Q7dA7G+3FW7UsLWbYh5sMklHiG8L -u5NC4M4/+oqq0Bv/rYROOSmIc7XRbZ38hep2ML9WHC/zdMssSph4lY/TnMNGqSQ/ -wyRmuT0VmLBQFlFvO41YD97yJD5uEsu+dMH0fsfIJ36U2T1YmJEJU5YOfpE/iGdf -2LuFKRU0TMfwiosxi1Geef6RC/9ADaIuda6aIvfMheAZ7b8Xy3vuvQ== ------END RSA PRIVATE KEY----- diff -Nru ceph-12.2.11/src/client/Client.cc ceph-12.2.12/src/client/Client.cc --- ceph-12.2.11/src/client/Client.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/client/Client.cc 2019-04-11 12:33:50.000000000 +0000 @@ -909,9 +909,9 @@ return in; // as with readdir returning indoes in different snaprealms (no caps!) 
if (in->snapid == CEPH_NOSNAP) { - add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq, - st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags, - request_perms); + add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted, + st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm), + st->cap.flags, request_perms); if (in->auth_cap && in->auth_cap->session == session) { in->max_size = st->max_size; in->rstat = st->rstat; @@ -2087,9 +2087,11 @@ case CEPH_SESSION_RENEWCAPS: if (session->cap_renew_seq == m->get_seq()) { + bool was_stale = ceph_clock_now() >= session->cap_ttl; session->cap_ttl = session->last_cap_renew_request + mdsmap->get_session_timeout(); - wake_inode_waiters(session); + if (was_stale) + wake_up_session_caps(session, false); } break; @@ -2106,6 +2108,14 @@ break; case CEPH_SESSION_FLUSHMSG: + /* flush cap release */ + { + auto& m = session->release; + if (m) { + session->con->send_message(std::move(m)); + m = nullptr; + } + } session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq())); break; @@ -2703,8 +2713,7 @@ kick_requests(session); kick_flushing_caps(session); signal_context_list(session->waiting_for_open); - kick_maxsize_requests(session); - wake_inode_waiters(session); + wake_up_session_caps(session, true); } connect_mds_targets(mds); } else if (newstate == MDSMap::STATE_NULL && @@ -3255,10 +3264,8 @@ return ret; continue; } - if ((mds_wanted & file_wanted) == - (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) { + if (!(file_wanted & ~mds_wanted)) in->flags &= ~I_CAP_DROPPED; - } } if (waitfor_caps) @@ -3424,23 +3431,30 @@ unsigned used = get_caps_used(in); unsigned cap_used; - if (in->is_dir() && (in->flags & I_COMPLETE)) { - // we do this here because we don't want to drop to Fs (and then - // drop the Fs if we do a create!) if that alone makes us send lookups - // to the MDS. 
Doing it in in->caps_wanted() has knock-on effects elsewhere - wanted |= CEPH_CAP_FILE_EXCL; - } - int implemented; int issued = in->caps_issued(&implemented); int revoking = implemented & ~issued; int retain = wanted | used | CEPH_CAP_PIN; - if (!unmounting) { - if (wanted) + if (!unmounting && in->nlink > 0) { + if (wanted) { retain |= CEPH_CAP_ANY; - else + } else if (in->is_dir() && + (issued & CEPH_CAP_FILE_SHARED) && + (in->flags & I_COMPLETE)) { + // we do this here because we don't want to drop to Fs (and then + // drop the Fs if we do a create!) if that alone makes us send lookups + // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere + wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + retain |= wanted; + } else { retain |= CEPH_CAP_ANY_SHARED; + // keep RD only if we didn't have the file open RW, + // because then the mds would revoke it anyway to + // journal max_size=0. + if (in->max_size == 0) + retain |= CEPH_CAP_ANY_RD; + } } ldout(cct, 10) << "check_caps on " << *in @@ -3520,9 +3534,8 @@ if (!revoking && unmounting && (cap_used == 0)) goto ack; - if (wanted == cap->wanted && // mds knows what we want. - ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like - !in->dirty_caps) // and we have no dirty caps + if ((cap->issued & ~retain) == 0 && // and we don't have anything we wouldn't like + !in->dirty_caps) // and we have no dirty caps continue; if (now < in->hold_caps_until) { @@ -3743,12 +3756,26 @@ } } -void Client::wake_inode_waiters(MetaSession *s) +void Client::wake_up_session_caps(MetaSession *s, bool reconnect) { xlist::iterator iter = s->caps.begin(); while (!iter.end()){ - signal_cond_list((*iter)->inode->waitfor_caps); + auto cap = *iter; + auto in = cap->inode; ++iter; + if (reconnect) { + in->requested_max_size = 0; + in->wanted_max_size = 0; + } else { + if (cap->gen < s->cap_gen) { + // mds did not re-issue stale cap. 
+ cap->issued = cap->implemented = CEPH_CAP_PIN; + // make sure mds knows what we want. + if (in->caps_file_wanted() & ~cap->wanted) + in->flags |= I_CAP_DROPPED; + } + } + signal_cond_list(in->waitfor_caps); } } @@ -3912,13 +3939,16 @@ } void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id, - unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm, - int flags, const UserPerm& cap_perms) + unsigned issued, unsigned wanted, unsigned seq, unsigned mseq, + inodeno_t realm, int flags, const UserPerm& cap_perms) { Cap *cap = 0; mds_rank_t mds = mds_session->mds_num; - if (in->caps.count(mds)) { - cap = in->caps[mds]; + auto it = in->caps.find(mds); + if (it != in->caps.end()) { + cap = it->second; + if (cap->gen < mds_session->cap_gen) + cap->issued = cap->implemented = CEPH_CAP_PIN; /* * auth mds of the inode changed. we received the cap export @@ -3971,15 +4001,17 @@ cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; + if (ceph_seq_cmp(mseq, cap->mseq) > 0) + cap->wanted = wanted; + else + cap->wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; cap->gen = mds_session->cap_gen; cap->latest_perms = cap_perms; ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued) - << " from mds." << mds - << " on " << *in - << dendl; + << " from mds." << mds << " on " << *in << dendl; if ((issued & ~old_caps) && in->auth_cap == cap) { // non-auth MDS is revoking the newly grant caps ? 
@@ -4055,10 +4087,10 @@ dirty_caps = in->dirty_caps | in->flushing_caps; in->wanted_max_size = 0; in->requested_max_size = 0; - in->flags |= I_CAP_DROPPED; } + if (cap->wanted | cap->issued) + in->flags |= I_CAP_DROPPED; remove_cap(cap, false); - signal_cond_list(in->waitfor_caps); if (cap_snaps) { InodeRef tmp_ref(in); in->cap_snaps.clear(); @@ -4073,6 +4105,7 @@ in->mark_caps_clean(); put_inode(in); } + signal_cond_list(in->waitfor_caps); } s->flushing_caps_tids.clear(); sync_cond.Signal(); @@ -4425,17 +4458,6 @@ } } -void Client::kick_maxsize_requests(MetaSession *session) -{ - xlist::iterator iter = session->caps.begin(); - while (!iter.end()){ - (*iter)->inode->requested_max_size = 0; - (*iter)->inode->wanted_max_size = 0; - signal_cond_list((*iter)->inode->waitfor_caps); - ++iter; - } -} - void SnapRealm::build_snap_context() { set snaps; @@ -4840,8 +4862,8 @@ update_snap_trace(m->snapbl, &realm); add_update_cap(in, session, m->get_cap_id(), - m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(), - CEPH_CAP_FLAG_AUTH, cap_perms); + m->get_caps(), m->get_wanted(), m->get_seq(), m->get_mseq(), + m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms); if (cap && cap->cap_id == m->peer.cap_id) { remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE)); @@ -4870,10 +4892,9 @@ if (in->caps.count(mds)) cap = in->caps[mds]; - const mds_rank_t peer_mds = mds_rank_t(m->peer.mds); - if (cap && cap->cap_id == m->get_cap_id()) { if (m->peer.cap_id) { + const mds_rank_t peer_mds = mds_rank_t(m->peer.mds); MetaSession *tsession = _get_or_open_mds_session(peer_mds); if (in->caps.count(peer_mds)) { Cap *tcap = in->caps[peer_mds]; @@ -4890,13 +4911,13 @@ adjust_session_flushing_caps(in, session, tsession); } } else { - add_update_cap(in, tsession, m->peer.cap_id, cap->issued, + add_update_cap(in, tsession, m->peer.cap_id, cap->issued, 0, m->peer.seq - 1, m->peer.mseq, (uint64_t)-1, cap == in->auth_cap ? 
CEPH_CAP_FLAG_AUTH : 0, cap->latest_perms); } } else { - if (cap == in->auth_cap) + if (cap->wanted | cap->issued) in->flags |= I_CAP_DROPPED; } @@ -5106,15 +5127,21 @@ int used = get_caps_used(in); int wanted = in->caps_wanted(); - const int old_caps = cap->issued; - const int new_caps = m->get_caps(); + const unsigned new_caps = m->get_caps(); + const bool was_stale = session->cap_gen > cap->gen; ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino() << " mds." << mds << " seq " << m->get_seq() << " caps now " << ccap_string(new_caps) - << " was " << ccap_string(old_caps) << dendl; + << " was " << ccap_string(cap->issued) + << (was_stale ? "" : " (stale)") << dendl; + + if (was_stale) + cap->issued = cap->implemented = CEPH_CAP_PIN; cap->seq = m->get_seq(); cap->gen = session->cap_gen; + check_cap_issue(in, cap, new_caps); + // update inode int issued; in->caps_issued(&issued); @@ -5181,13 +5208,21 @@ } bool check = false; - if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted) + if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) && + (wanted & ~(cap->wanted | new_caps))) { + // If mds is importing cap, prior cap messages that update 'wanted' + // may get dropped by mds (migrate seq mismatch). + // + // We don't send cap message to update 'wanted' if what we want are + // already issued. If mds revokes caps, cap message that releases caps + // also tells mds what we want. But if caps got revoked by mds forcedly + // (session stale). We may haven't told mds what we want. 
check = true; + } - check_cap_issue(in, cap, new_caps); // update caps - int revoked = old_caps & ~new_caps; + auto revoked = cap->issued & ~new_caps; if (revoked) { ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl; cap->issued = new_caps; @@ -5209,10 +5244,10 @@ cap->wanted = 0; // don't let check_caps skip sending a response to MDS check = true; } - } else if (old_caps == new_caps) { - ldout(cct, 10) << " caps unchanged at " << ccap_string(old_caps) << dendl; + } else if (cap->issued == new_caps) { + ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl; } else { - ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl; + ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl; cap->issued = new_caps; cap->implemented |= new_caps; diff -Nru ceph-12.2.11/src/client/Client.h ceph-12.2.12/src/client/Client.h --- ceph-12.2.11/src/client/Client.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/client/Client.h 2019-04-11 12:33:50.000000000 +0000 @@ -483,7 +483,7 @@ Mutex client_lock; // helpers - void wake_inode_waiters(MetaSession *s); + void wake_up_session_caps(MetaSession *s, bool reconnect); void wait_on_context_list(list& ls); void signal_context_list(list& ls); @@ -630,8 +630,8 @@ // file caps void check_cap_issue(Inode *in, Cap *cap, unsigned issued); void add_update_cap(Inode *in, MetaSession *session, uint64_t cap_id, - unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm, - int flags, const UserPerm& perms); + unsigned issued, unsigned wanted, unsigned seq, unsigned mseq, + inodeno_t realm, int flags, const UserPerm& perms); void remove_cap(Cap *cap, bool queue_release); void remove_all_caps(Inode *in); void remove_session_caps(MetaSession *session); @@ -641,7 +641,6 @@ void flush_caps(Inode *in, MetaSession *session, bool sync=false); void kick_flushing_caps(MetaSession *session); void early_kick_flushing_caps(MetaSession 
*session); - void kick_maxsize_requests(MetaSession *session); int get_caps(Inode *in, int need, int want, int *have, loff_t endoff); int get_caps_used(Inode *in); diff -Nru ceph-12.2.11/src/CMakeLists.txt ceph-12.2.12/src/CMakeLists.txt --- ceph-12.2.11/src/CMakeLists.txt 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/CMakeLists.txt 2019-04-11 12:33:50.000000000 +0000 @@ -742,7 +742,7 @@ # virtualenv base directory for ceph-disk and ceph-detect-init set(CEPH_BUILD_VIRTUALENV $ENV{TMPDIR}) if(NOT CEPH_BUILD_VIRTUALENV) - set(CEPH_BUILD_VIRTUALENV /tmp) + set(CEPH_BUILD_VIRTUALENV ${CMAKE_BINARY_DIR}) endif() add_subdirectory(pybind) diff -Nru ceph-12.2.11/src/common/AsyncReserver.h ceph-12.2.12/src/common/AsyncReserver.h --- ceph-12.2.11/src/common/AsyncReserver.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/common/AsyncReserver.h 2019-04-11 12:33:50.000000000 +0000 @@ -143,6 +143,78 @@ do_queues(); } + /** + * Update the priority of a reservation + * + * Note, on_reserved may be called following update_priority. Thus, + * the callback must be safe in that case. Callback will be called + * with no locks held. cancel_reservation must be called to release the + * reservation slot. + * + * Cases + * 1. Item is queued, re-queue with new priority + * 2. Item is queued, re-queue and preempt if new priority higher than an in progress item + * 3. Item is in progress, just adjust priority if no higher priority waiting + * 4. 
Item is in progress, adjust priority if higher priority items waiting preempt item + * + */ + void update_priority(T item, unsigned newprio) { + Mutex::Locker l(lock); + auto i = queue_pointers.find(item); + if (i != queue_pointers.end()) { + unsigned prio = i->second.first; + if (newprio == prio) + return; + Reservation r = *i->second.second; + rdout(10) << __func__ << " update " << r << " (was queued)" << dendl; + // Like cancel_reservation() without preempting + queues[prio].erase(i->second.second); + if (queues[prio].empty()) { + queues.erase(prio); + } + queue_pointers.erase(i); + + // Like request_reservation() to re-queue it but with new priority + assert(!queue_pointers.count(item) && + !in_progress.count(item)); + r.prio = newprio; + queues[newprio].push_back(r); + queue_pointers.insert(make_pair(item, + make_pair(newprio,--(queues[newprio]).end()))); + } else { + auto p = in_progress.find(item); + if (p != in_progress.end()) { + if (p->second.prio == newprio) + return; + rdout(10) << __func__ << " update " << p->second + << " (in progress)" << dendl; + // We want to preempt if priority goes down + // and smaller then highest priority waiting + if (p->second.preempt) { + if (newprio < p->second.prio && !queues.empty()) { + // choose highest priority queue + auto it = queues.end(); + --it; + assert(!it->second.empty()); + if (it->first > newprio) { + rdout(10) << __func__ << " update " << p->second + << " lowered priority let do_queues() preempt it" << dendl; + } + } + preempt_by_prio.erase(make_pair(p->second.prio, p->second.item)); + p->second.prio = newprio; + preempt_by_prio.insert(make_pair(p->second.prio, p->second.item)); + } else { + p->second.prio = newprio; + } + } else { + rdout(10) << __func__ << " update " << item << " (not found)" << dendl; + } + } + do_queues(); + return; + } + void dump(Formatter *f) { Mutex::Locker l(lock); _dump(f); diff -Nru ceph-12.2.11/src/common/ceph_crypto.cc ceph-12.2.12/src/common/ceph_crypto.cc --- 
ceph-12.2.11/src/common/ceph_crypto.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/common/ceph_crypto.cc 2019-04-11 12:33:50.000000000 +0000 @@ -14,6 +14,7 @@ #include "common/config.h" #include "ceph_crypto.h" +#include "include/scope_guard.h" #ifdef USE_CRYPTOPP void ceph::crypto::init(CephContext *cct) @@ -44,6 +45,124 @@ static NSSInitContext *crypto_context = NULL; static pid_t crypto_init_pid = 0; +PK11SymKey *ceph::crypto::PK11_ImportSymKey_FIPS( + PK11SlotInfo * const slot, + const CK_MECHANISM_TYPE type, + const PK11Origin origin, + const CK_ATTRIBUTE_TYPE operation, + SECItem * const raw_key, + void * const wincx) +{ + if (PK11_IsFIPS() == PR_FALSE) { + // This isn't the FIPS mode, and thus PK11_ImportSymKey is available. Let's + // make use of it to avoid overhead related to e.g. creating extra PK11Ctx. + PK11SymKey *ret_key = nullptr; + ret_key = PK11_ImportSymKey(slot, type, origin, operation, raw_key, wincx); + + return ret_key; + } + + ceph_assert_always(wincx == nullptr); + + std::vector wrapped_key; + + // getting 306 on my system which is CKM_DES3_ECB. + const CK_MECHANISM_TYPE wrap_mechanism = PK11_GetBestWrapMechanism(slot); + + // Generate a wrapping key. It will be used exactly twice over the scope: + // * to encrypt raw_key giving wrapped_key, + // * to decrypt wrapped_key in the internals of PK11_UnwrapSymKey(). + PK11SymKey * const wrapping_key = PK11_KeyGen( + slot, + wrap_mechanism, + nullptr, + PK11_GetBestKeyLength(slot, wrap_mechanism), + nullptr); + if (wrapping_key == nullptr) { + return nullptr; + } + auto wk_guard = make_scope_guard([wrapping_key] { + PK11_FreeSymKey(wrapping_key); + }); + + // Prepare a PK11 context for the raw_key -> wrapped_key encryption. 
+ SECItem tmp_sec_item; + ::memset(&tmp_sec_item, 0, sizeof(tmp_sec_item)); + PK11Context * const wrap_key_crypt_context = PK11_CreateContextBySymKey( + wrap_mechanism, + CKA_ENCRYPT, + wrapping_key, + &tmp_sec_item); + if (wrap_key_crypt_context == nullptr) { + return nullptr; + } + auto wkcc_guard = make_scope_guard([wrap_key_crypt_context] { + PK11_DestroyContext(wrap_key_crypt_context, PR_TRUE); + }); + + + // Finally wrap the key. Important note is that the wrapping mechanism + // selection (read: just grabbing a cipher) offers, at least in my NSS + // copy, mostly CKM_*_ECB ciphers (with 3DES as the leading one, see + // wrapMechanismList[] in pk11mech.c). There is no CKM_*_*_PAD variant + // which means that plaintext we are providing to PK11_CipherOp() must + // be aligned to cipher's block size. For 3DES it's 64 bits. + { + const auto block_size = PK11_GetBlockSize(wrap_mechanism, nullptr); + SECItem * const raw_key_aligned = PK11_BlockData(raw_key, block_size); + if (raw_key_aligned == nullptr) { + return nullptr; + } + auto rka_guard = make_scope_guard([raw_key_aligned] { + SECITEM_FreeItem(raw_key_aligned, PR_TRUE); + }); + + // PARANOIA: always add space for one extra cipher's block. This seems + // unnecessary at the moment as padding is never used (see the comment + // above) but let's assume it can change in the future. Just in case. + wrapped_key.resize(raw_key_aligned->len + block_size, 0x0); + int out_len = 0; + + int ret = PK11_CipherOp( + wrap_key_crypt_context, + wrapped_key.data(), + &out_len, + wrapped_key.size(), // max space + raw_key_aligned->data, + raw_key_aligned->len); + if (ret != SECSuccess) { + return nullptr; + } + + ret = PK11_Finalize(wrap_key_crypt_context); + if (ret != SECSuccess) { + return nullptr; + } + + ceph_assert(out_len <= static_cast(wrapped_key.size())); + wrapped_key.resize(out_len); + } + + // Key is wrapped now so we can acquire the ultimate PK11SymKey through + // unwrapping it. 
Of course these two opposite operations form NOP with + // a side effect: FIPS level 1 compatibility. + ::memset(&tmp_sec_item, 0, sizeof(tmp_sec_item)); + + SECItem wrapped_key_item; + ::memset(&wrapped_key_item, 0, sizeof(wrapped_key_item)); + wrapped_key_item.data = wrapped_key.data(); + wrapped_key_item.len = wrapped_key.size(); + + return PK11_UnwrapSymKey( + wrapping_key, + wrap_mechanism, + &tmp_sec_item, + &wrapped_key_item, + type, + operation, + raw_key->len); +} + void ceph::crypto::init(CephContext *cct) { pid_t pid = getpid(); diff -Nru ceph-12.2.11/src/common/ceph_crypto.h ceph-12.2.12/src/common/ceph_crypto.h --- ceph-12.2.11/src/common/ceph_crypto.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/common/ceph_crypto.h 2019-04-11 12:33:50.000000000 +0000 @@ -69,6 +69,20 @@ namespace ceph { namespace crypto { + // workaround for no PK11_ImportSymKey in FIPS mode + PK11SymKey *PK11_ImportSymKey_FIPS( + PK11SlotInfo *slot, + CK_MECHANISM_TYPE type, + PK11Origin origin, + CK_ATTRIBUTE_TYPE operation, + SECItem *key, + void *wincx); + } // namespace crypto +} // namespace + + +namespace ceph { + namespace crypto { void assert_init(); void init(CephContext *cct); void shutdown(bool shared=true); @@ -136,8 +150,8 @@ keyItem.type = siBuffer; keyItem.data = (unsigned char*)key; keyItem.len = length; - symkey = PK11_ImportSymKey(slot, cktype, PK11_OriginUnwrap, - CKA_SIGN, &keyItem, NULL); + symkey = PK11_ImportSymKey_FIPS(slot, cktype, PK11_OriginUnwrap, + CKA_SIGN, &keyItem, NULL); assert(symkey); SECItem param; param.type = siBuffer; diff -Nru ceph-12.2.11/src/common/ceph_timer.h ceph-12.2.12/src/common/ceph_timer.h --- ceph-12.2.11/src/common/ceph_timer.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/common/ceph_timer.h 2019-04-11 12:33:50.000000000 +0000 @@ -138,6 +138,8 @@ } // Otherwise the event requeued itself } + if (suspended) + break; if (schedule.empty()) cond.wait(l); else diff -Nru ceph-12.2.11/src/common/legacy_config_opts.h 
ceph-12.2.12/src/common/legacy_config_opts.h --- ceph-12.2.11/src/common/legacy_config_opts.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/common/legacy_config_opts.h 2019-04-11 12:33:50.000000000 +0000 @@ -443,7 +443,6 @@ OPTION(mds_session_blacklist_on_evict, OPT_BOOL) // whether to blacklist clients whose sessions are dropped via admin commands OPTION(mds_sessionmap_keys_per_op, OPT_U32) // how many sessions should I try to load/store in a single OMAP operation? -OPTION(mds_recall_state_timeout, OPT_FLOAT) // detect clients which aren't trimming caps OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many' OPTION(mds_reconnect_timeout, OPT_FLOAT) // seconds to wait for clients during mds restart @@ -1099,6 +1098,7 @@ OPTION(bluestore_fsck_on_mkfs, OPT_BOOL) OPTION(bluestore_fsck_on_mkfs_deep, OPT_BOOL) OPTION(bluestore_sync_submit_transaction, OPT_BOOL) // submit kv txn in queueing thread (not kv_sync_thread) +OPTION(bluestore_fsck_read_bytes_cap, OPT_U64) OPTION(bluestore_throttle_bytes, OPT_U64) OPTION(bluestore_throttle_deferred_bytes, OPT_U64) OPTION(bluestore_throttle_cost_per_io_hdd, OPT_U64) diff -Nru ceph-12.2.11/src/common/options.cc ceph-12.2.12/src/common/options.cc --- ceph-12.2.11/src/common/options.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/common/options.cc 2019-04-11 12:33:50.000000000 +0000 @@ -444,7 +444,7 @@ .set_description(""), Option("mon_cluster_log_file_level", Option::TYPE_STR, Option::LEVEL_ADVANCED) - .set_default("info") + .set_default("debug") .set_description(""), Option("mon_cluster_log_to_graylog", Option::TYPE_STR, Option::LEVEL_ADVANCED) @@ -1626,6 +1626,22 @@ .set_default(10) .set_description(""), + Option("osd_calc_pg_upmaps_aggressively", Option::TYPE_BOOL, Option::LEVEL_ADVANCED) + .set_default(true) + .set_description("try to calculate PG upmaps more aggressively, e.g., " + 
"by doing a fairly exhaustive search of existing PGs " + "that can be unmapped or upmapped"), + + Option("osd_calc_pg_upmaps_max_stddev", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED) + .set_default(1.0) + .set_description("standard deviation below which there is no attempt made " + "while trying to calculate PG upmaps"), + + Option("osd_calc_pg_upmaps_local_fallback_retries", Option::TYPE_UINT, Option::LEVEL_ADVANCED) + .set_default(100) + .set_description("Maximum number of PGs we can attempt to unmap or upmap " + "for a specific overfull or underfull osd per iteration "), + Option("journaler_prezero_periods", Option::TYPE_INT, Option::LEVEL_ADVANCED) .set_default(5) .set_description(""), @@ -2564,7 +2580,8 @@ Option("osd_scrub_backoff_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV) .set_default(.66) - .set_description("Backoff ratio after a failed scrub scheduling attempt"), + .set_long_description("This is the precentage of ticks that do NOT schedule scrubs, 66% means that 1 out of 3 ticks will schedule scrubs") + .set_description("Backoff ratio for scheduling scrubs"), Option("osd_scrub_chunk_min", Option::TYPE_INT, Option::LEVEL_ADVANCED) .set_default(5) @@ -2599,9 +2616,8 @@ Option("osd_deep_scrub_randomize_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED) .set_default(0.15) - .set_description("Ratio of deep scrub interval to randomly vary") - .set_long_description("This prevents a deep scrub 'stampede' by randomly varying the scrub intervals so that they are soon uniformly distributed over the week") - .add_see_also("osd_deep_scrub_interval"), + .set_description("Scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs are deep)") + .set_long_description("This prevents a deep scrub 'stampede' by spreading deep scrubs so they are uniformly distributed over the week"), Option("osd_deep_scrub_stride", Option::TYPE_INT, Option::LEVEL_ADVANCED) .set_default(524288) @@ -3691,6 +3707,10 @@ .set_default(false) .set_description("Try to submit 
metadata transaction to rocksdb in queuing thread context"), + Option("bluestore_fsck_read_bytes_cap", Option::TYPE_UINT, Option::LEVEL_ADVANCED) + .set_default(64_M) + .set_description("Maximum bytes read at once by deep fsck"), + Option("bluestore_throttle_bytes", Option::TYPE_UINT, Option::LEVEL_ADVANCED) .set_default(64_M) .set_safe() @@ -5978,7 +5998,7 @@ .set_description("default krbd map options"), Option("rbd_journal_order", Option::TYPE_UINT, Option::LEVEL_ADVANCED) - .set_min(12) + .set_min_max(12, 26) .set_default(24) .set_description("default order (object size) for journal data objects"), @@ -6098,6 +6118,14 @@ .set_default(.7) .set_description(""), + Option("mds_cache_trim_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED) + .set_default(1) + .set_description("decay rate for trimming MDS cache throttle"), + + Option("mds_cache_trim_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED) + .set_default(64_K) + .set_description("threshold for number of dentries that can be trimmed"), + Option("mds_max_file_recover", Option::TYPE_UINT, Option::LEVEL_ADVANCED) .set_default(32) .set_description(""), @@ -6142,9 +6170,29 @@ .set_default(1024) .set_description(""), - Option("mds_recall_state_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED) - .set_default(60) - .set_description(""), + Option("mds_recall_max_caps", Option::TYPE_UINT, Option::LEVEL_ADVANCED) + .set_default(5000) + .set_description("maximum number of caps to recall from client session in single recall"), + + Option("mds_recall_max_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED) + .set_default(2.5) + .set_description("decay rate for throttle on recalled caps on a session"), + + Option("mds_recall_max_decay_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED) + .set_default(16_K) + .set_description("decay threshold for throttle on recalled caps on a session"), + + Option("mds_recall_global_max_decay_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED) + .set_default(64_K) + 
.set_description("decay threshold for throttle on recalled caps globally"), + + Option("mds_recall_warning_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED) + .set_default(32_K) + .set_description("decay threshold for warning on slow session cap recall"), + + Option("mds_recall_warning_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED) + .set_default(60.0) + .set_description("decay rate for warning on slow session cap recall"), Option("mds_freeze_tree_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED) .set_default(30) @@ -6518,9 +6566,10 @@ .set_default(100) .set_description("minimum number of capabilities a client may hold"), - Option("mds_max_ratio_caps_per_client", Option::TYPE_FLOAT, Option::LEVEL_DEV) - .set_default(.8) - .set_description("maximum ratio of current caps that may be recalled during MDS cache pressure"), + Option("mds_max_caps_per_client", Option::TYPE_UINT, Option::LEVEL_ADVANCED) + .set_default(1_M) + .set_description("maximum number of capabilities a client may hold"), + Option("mds_hack_allow_loading_invalid_metadata", Option::TYPE_BOOL, Option::LEVEL_ADVANCED) .set_default(0) .set_description("INTENTIONALLY CAUSE DATA LOSS by bypasing checks for invalid metadata on disk. 
Allows testing repair tools."), diff -Nru ceph-12.2.11/src/common/str_map.cc ceph-12.2.12/src/common/str_map.cc --- ceph-12.2.11/src/common/str_map.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/common/str_map.cc 2019-04-11 12:33:50.000000000 +0000 @@ -17,6 +17,8 @@ #include "include/str_map.h" #include "include/str_list.h" +#include + #include "json_spirit/json_spirit.h" using namespace std; @@ -56,19 +58,13 @@ } return 0; } + string trim(const string& str) { - size_t start = 0; - size_t end = str.size() - 1; - while (isspace(str[start]) != 0 && start <= end) { - ++start; - } - while (isspace(str[end]) != 0 && start <= end) { - --end; - } - if (start <= end) { - return str.substr(start, end - start + 1); - } - return string(); + return boost::algorithm::trim_copy_if( + str, + [](unsigned char c) { + return std::isspace(c); + }); } int get_str_map( diff -Nru ceph-12.2.11/src/crush/CrushWrapper.cc ceph-12.2.12/src/crush/CrushWrapper.cc --- ceph-12.2.11/src/crush/CrushWrapper.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/crush/CrushWrapper.cc 2019-04-11 12:33:50.000000000 +0000 @@ -878,23 +878,84 @@ } } -int CrushWrapper::get_rule_failure_domain(int rule_id) -{ - crush_rule *rule = get_rule(rule_id); - if (IS_ERR(rule)) { +int CrushWrapper::verify_upmap(CephContext *cct, + int rule_id, + int pool_size, + const vector& up) +{ + auto rule = get_rule(rule_id); + if (IS_ERR(rule) || !rule) { + lderr(cct) << __func__ << " rule " << rule_id << " does not exist" + << dendl; return -ENOENT; } - int type = 0; // default to osd-level - for (unsigned s = 0; s < rule->len; ++s) { - if ((rule->steps[s].op == CRUSH_RULE_CHOOSE_FIRSTN || - rule->steps[s].op == CRUSH_RULE_CHOOSE_INDEP || - rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_FIRSTN || - rule->steps[s].op == CRUSH_RULE_CHOOSELEAF_INDEP) && - rule->steps[s].arg2 > type) { - type = rule->steps[s].arg2; + for (unsigned step = 0; step < rule->len; ++step) { + auto curstep = &rule->steps[step]; + 
ldout(cct, 10) << __func__ << " step " << step << dendl; + switch (curstep->op) { + case CRUSH_RULE_CHOOSELEAF_FIRSTN: + case CRUSH_RULE_CHOOSELEAF_INDEP: + { + int type = curstep->arg2; + if (type == 0) // osd + break; + map> osds_by_parent; // parent_of_desired_type -> osds + for (auto osd : up) { + auto parent = get_parent_of_type(osd, type, rule_id); + if (parent < 0) { + osds_by_parent[parent].insert(osd); + } else { + ldout(cct, 1) << __func__ << " unable to get parent of osd." << osd + << ", skipping for now" + << dendl; + } + } + for (auto i : osds_by_parent) { + if (i.second.size() > 1) { + lderr(cct) << __func__ << " multiple osds " << i.second + << " come from same failure domain " << i.first + << dendl; + return -EINVAL; + } + } + } + break; + + case CRUSH_RULE_CHOOSE_FIRSTN: + case CRUSH_RULE_CHOOSE_INDEP: + { + int numrep = curstep->arg1; + int type = curstep->arg2; + if (type == 0) // osd + break; + if (numrep <= 0) + numrep += pool_size; + set parents_of_type; + for (auto osd : up) { + auto parent = get_parent_of_type(osd, type, rule_id); + if (parent < 0) { + parents_of_type.insert(parent); + } else { + ldout(cct, 1) << __func__ << " unable to get parent of osd." 
<< osd + << ", skipping for now" + << dendl; + } + } + if ((int)parents_of_type.size() > numrep) { + lderr(cct) << __func__ << " number of buckets " + << parents_of_type.size() << " exceeds desired " << numrep + << dendl; + return -EINVAL; + } + } + break; + + default: + // ignore + break; } } - return type; + return 0; } int CrushWrapper::_get_leaves(int id, list *leaves) @@ -3614,7 +3675,8 @@ const vector& orig, vector::const_iterator& i, set& used, - vector *pw) const + vector *pw, + int root_bucket) const { vector w = *pw; vector o; @@ -3624,7 +3686,7 @@ << " at " << *i << " pw " << *pw << dendl; - + ceph_assert(root_bucket < 0); vector cumulative_fanout(stack.size()); int f = 1; for (int j = (int)stack.size() - 1; j >= 0; --j) { @@ -3652,6 +3714,10 @@ item = get_parent_of_type(item, type); ldout(cct, 10) << __func__ << " underfull " << osd << " type " << type << " is " << item << dendl; + if (!subtree_contains(root_bucket, item)) { + ldout(cct, 20) << __func__ << " not in root subtree " << root_bucket << dendl; + continue; + } underfull_buckets[j].insert(item); } } @@ -3811,7 +3877,7 @@ set used; vector> type_stack; // (type, fan-out) - + int root_bucket = 0; for (unsigned step = 0; step < rule->len; ++step) { const crush_rule_step *curstep = &rule->steps[step]; ldout(cct, 10) << __func__ << " step " << step << " w " << w << dendl; @@ -3822,6 +3888,7 @@ map->buckets[-1-curstep->arg1])) { w.clear(); w.push_back(curstep->arg1); + root_bucket = curstep->arg1; ldout(cct, 10) << __func__ << " take " << w << dendl; } else { ldout(cct, 1) << " bad take value " << curstep->arg1 << dendl; @@ -3839,7 +3906,7 @@ if (type > 0) type_stack.push_back(make_pair(0, 1)); int r = _choose_type_stack(cct, type_stack, overfull, underfull, orig, - i, used, &w); + i, used, &w, root_bucket); if (r < 0) return r; type_stack.clear(); @@ -3861,7 +3928,7 @@ ldout(cct, 10) << " emit " << w << dendl; if (!type_stack.empty()) { int r = _choose_type_stack(cct, type_stack, overfull, underfull, 
orig, - i, used, &w); + i, used, &w, root_bucket); if (r < 0) return r; type_stack.clear(); diff -Nru ceph-12.2.11/src/crush/CrushWrapper.h ceph-12.2.12/src/crush/CrushWrapper.h --- ceph-12.2.11/src/crush/CrushWrapper.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/crush/CrushWrapper.h 2019-04-11 12:33:50.000000000 +0000 @@ -733,12 +733,15 @@ set *children, bool exclude_shadow = true) const; + /** - * get failure-domain type of a specific crush rule - * @param rule_id crush rule id - * @return type of failure-domain or a negative errno on error. - */ - int get_rule_failure_domain(int rule_id); + * verify upmapping results. + * return 0 on success or a negative errno on error. + */ + int verify_upmap(CephContext *cct, + int rule_id, + int pool_size, + const vector& up); /** * enumerate leaves(devices) of given node @@ -1529,7 +1532,8 @@ const vector& orig, vector::const_iterator& i, set& used, - vector *pw) const; + vector *pw, + int root_bucket) const; int try_remap_rule( CephContext *cct, diff -Nru ceph-12.2.11/src/erasure-code/jerasure/jerasure/Examples/makefile.orig ceph-12.2.12/src/erasure-code/jerasure/jerasure/Examples/makefile.orig --- ceph-12.2.11/src/erasure-code/jerasure/jerasure/Examples/makefile.orig 2017-04-10 09:15:19.000000000 +0000 +++ ceph-12.2.12/src/erasure-code/jerasure/jerasure/Examples/makefile.orig 1970-01-01 00:00:00.000000000 +0000 @@ -1,203 +0,0 @@ -# Examples/makefile -# Jerasure - A C/C++ Library for a Variety of Reed-Solomon and RAID-6 Erasure Coding Techniques -# -# Revision 1.2A -# May 24, 2011 -# -# James S. Plank -# Department of Electrical Engineering and Computer Science -# University of Tennessee -# Knoxville, TN 37996 -# plank@cs.utk.edu -# -# Copyright (c) 2011, James S. Plank -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the University of Tennessee nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. 
- -PREFIX=/usr/local -BINDIR=${PREFIX}/bin -LIBDIR=${PREFIX}/lib -INCDIR=${PREFIX}/include -CC = gcc -CFLAGS = -O2 -I$(HOME)/include - -ALL = jerasure_01 \ - jerasure_02 \ - jerasure_03 \ - jerasure_04 \ - jerasure_05 \ - jerasure_06 \ - jerasure_07 \ - jerasure_08 \ - reed_sol_01 \ - reed_sol_02 \ - reed_sol_03 \ - reed_sol_04 \ - reed_sol_test_gf \ - reed_sol_time_gf \ - reed_sol_hard_time_gf \ - cauchy_01 \ - cauchy_02 \ - cauchy_03 \ - cauchy_04 \ - liberation_01 \ - encoder \ - decoder \ - -all: $(ALL) - -clean: - rm -f core *.o $(ALL) a.out cauchy.h cauchy.c liberation.h liberation.c reed_sol.c reed_sol.h\ - jerasure.c jerasure.h galois.c galois.h - -.SUFFIXES: .c .o -.c.o: - $(CC) $(CFLAGS) -c $*.c - -liberation.h: ../liberation.h - rm -f liberation.h ; cp ../liberation.h . ; chmod 0444 liberation.h - -liberation.c: ../liberation.c - rm -f liberation.c ; cp ../liberation.c . ; chmod 0444 liberation.c - -cauchy.h: ../cauchy.h - rm -f cauchy.h ; cp ../cauchy.h . ; chmod 0444 cauchy.h - -cauchy.c: ../cauchy.c - rm -f cauchy.c ; cp ../cauchy.c . ; chmod 0444 cauchy.c - -reed_sol.h: ../reed_sol.h - rm -f reed_sol.h ; cp ../reed_sol.h . ; chmod 0444 reed_sol.h - -reed_sol.c: ../reed_sol.c - rm -f reed_sol.c ; cp ../reed_sol.c . ; chmod 0444 reed_sol.c - -jerasure.h: ../jerasure.h - rm -f jerasure.h ; cp ../jerasure.h . ; chmod 0444 jerasure.h - -jerasure.c: ../jerasure.c - rm -f jerasure.c ; cp ../jerasure.c . ; chmod 0444 jerasure.c - -galois.h: ../galois.h - rm -f galois.h ; cp ../galois.h . ; chmod 0444 galois.h - -galois.c: ../galois.c - rm -f galois.c ; cp ../galois.c . 
; chmod 0444 galois.c - -jerasure.o: jerasure.h galois.h - -jerasure_01.o: galois.h jerasure.h -jerasure_01: jerasure_01.o galois.o jerasure.o - $(CC) $(CFLAGS) -o jerasure_01 jerasure_01.o jerasure.o galois.o -lgf_complete - -jerasure_02.o: galois.h jerasure.h -jerasure_02: jerasure_02.o galois.o jerasure.o - $(CC) $(CFLAGS) -o jerasure_02 jerasure_02.o jerasure.o galois.o -lgf_complete - -jerasure_03.o: galois.h jerasure.h -jerasure_03: jerasure_03.o galois.o jerasure.o - $(CC) $(CFLAGS) -o jerasure_03 jerasure_03.o jerasure.o galois.o -lgf_complete - -jerasure_04.o: galois.h jerasure.h -jerasure_04: jerasure_04.o galois.o jerasure.o - $(CC) $(CFLAGS) -o jerasure_04 jerasure_04.o jerasure.o galois.o -lgf_complete - -jerasure_05.o: galois.h jerasure.h -jerasure_05: jerasure_05.o galois.o jerasure.o - $(CC) $(CFLAGS) -o jerasure_05 jerasure_05.o jerasure.o galois.o -lgf_complete - -jerasure_06.o: galois.h jerasure.h -jerasure_06: jerasure_06.o galois.o jerasure.o - $(CC) $(CFLAGS) -o jerasure_06 jerasure_06.o jerasure.o galois.o -lgf_complete - -jerasure_07.o: galois.h jerasure.h -jerasure_07: jerasure_07.o galois.o jerasure.o - $(CC) $(CFLAGS) -o jerasure_07 jerasure_07.o jerasure.o galois.o -lgf_complete - -jerasure_08.o: galois.h jerasure.h -jerasure_08: jerasure_08.o galois.o jerasure.o - $(CC) $(CFLAGS) -o jerasure_08 jerasure_08.o jerasure.o galois.o -lgf_complete - -reed_sol_01.o: galois.h reed_sol.h jerasure.h -reed_sol_01: reed_sol_01.o galois.o jerasure.o reed_sol.o - $(CC) $(CFLAGS) -o reed_sol_01 reed_sol_01.o reed_sol.o jerasure.o galois.o -lgf_complete - -reed_sol_02.o: galois.h reed_sol.h jerasure.h -reed_sol_02: reed_sol_02.o galois.o jerasure.o reed_sol.o - $(CC) $(CFLAGS) -o reed_sol_02 reed_sol_02.o reed_sol.o jerasure.o galois.o -lgf_complete - -reed_sol_03.o: galois.h reed_sol.h jerasure.h -reed_sol_03: reed_sol_03.o galois.o jerasure.o reed_sol.o - $(CC) $(CFLAGS) -o reed_sol_03 reed_sol_03.o reed_sol.o jerasure.o galois.o -lgf_complete - 
-reed_sol_04.o: galois.h reed_sol.h jerasure.h -reed_sol_04: reed_sol_04.o galois.o jerasure.o reed_sol.o - $(CC) $(CFLAGS) -o reed_sol_04 reed_sol_04.o reed_sol.o jerasure.o galois.o -lgf_complete - -cauchy_01.o: galois.h cauchy.h jerasure.h -cauchy_01: cauchy_01.o galois.o jerasure.o cauchy.o - $(CC) $(CFLAGS) -o cauchy_01 cauchy_01.o cauchy.o jerasure.o galois.o -lgf_complete - -cauchy_02.o: galois.h cauchy.h jerasure.h -cauchy_02: cauchy_02.o galois.o jerasure.o cauchy.o - $(CC) $(CFLAGS) -o cauchy_02 cauchy_02.o cauchy.o jerasure.o galois.o -lgf_complete - -cauchy_03.o: galois.h cauchy.h jerasure.h -cauchy_03: cauchy_03.o galois.o jerasure.o cauchy.o - $(CC) $(CFLAGS) -o cauchy_03 cauchy_03.o cauchy.o jerasure.o galois.o -lgf_complete - -cauchy_04.o: galois.h cauchy.h jerasure.h -cauchy_04: cauchy_04.o galois.o jerasure.o cauchy.o - $(CC) $(CFLAGS) -o cauchy_04 cauchy_04.o cauchy.o jerasure.o galois.o -lgf_complete - -liberation_01.o: galois.h liberation.h jerasure.h -liberation_01: liberation_01.o galois.o jerasure.o liberation.o - $(CC) $(CFLAGS) -o liberation_01 liberation_01.o liberation.o jerasure.o galois.o -lgf_complete - -encoder.o: galois.h liberation.h jerasure.h reed_sol.h cauchy.h -encoder: encoder.o galois.o jerasure.o liberation.o reed_sol.o cauchy.o - $(CC) $(CFLAGS) -o encoder encoder.o liberation.o jerasure.o galois.o reed_sol.o cauchy.o -lgf_complete - -decoder.o: galois.h liberation.h jerasure.h reed_sol.h cauchy.h -decoder: decoder.o galois.o jerasure.o liberation.o reed_sol.o cauchy.o - $(CC) $(CFLAGS) -o decoder decoder.o liberation.o jerasure.o galois.o reed_sol.o cauchy.o -lgf_complete - -reed_sol_test_gf.o: galois.h reed_sol.h jerasure.h -reed_sol_test_gf: reed_sol_test_gf.o galois.o jerasure.o reed_sol.o - $(CC) $(CFLAGS) -o reed_sol_test_gf reed_sol_test_gf.o reed_sol.o jerasure.o galois.o -lgf_complete - -reed_sol_time_gf.o: galois.h reed_sol.h jerasure.h -reed_sol_time_gf: reed_sol_time_gf.o galois.o jerasure.o reed_sol.o - $(CC) 
$(CFLAGS) -o reed_sol_time_gf reed_sol_time_gf.o reed_sol.o jerasure.o galois.o -lgf_complete - -reed_sol_hard_time_gf.o: galois.h reed_sol.h jerasure.h -reed_sol_hard_time_gf: reed_sol_hard_time_gf.o galois.o jerasure.o reed_sol.o - $(CC) $(CFLAGS) -o reed_sol_hard_time_gf reed_sol_hard_time_gf.o reed_sol.o jerasure.o galois.o -lgf_complete diff -Nru ceph-12.2.11/src/erasure-code/jerasure/jerasure/include/config.h.in~ ceph-12.2.12/src/erasure-code/jerasure/jerasure/include/config.h.in~ --- ceph-12.2.11/src/erasure-code/jerasure/jerasure/include/config.h.in~ 2017-04-10 09:15:19.000000000 +0000 +++ ceph-12.2.12/src/erasure-code/jerasure/jerasure/include/config.h.in~ 1970-01-01 00:00:00.000000000 +0000 @@ -1,107 +0,0 @@ -/* include/config.h.in. Generated from configure.ac by autoheader. */ - -/* Support Altivec instructions */ -#undef HAVE_ALTIVEC - -/* Support AVX (Advanced Vector Extensions) instructions */ -#undef HAVE_AVX - -/* Define to 1 if you have the header file. */ -#undef HAVE_DLFCN_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_GF_COMPLETE_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_GF_GENERAL_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_GF_INT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_GF_METHOD_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_GF_RAND_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_INTTYPES_H - -/* Define to 1 if you have the `gf_complete' library (-lgf_complete). */ -#undef HAVE_LIBGF_COMPLETE - -/* Define to 1 if you have the header file. 
*/ -#undef HAVE_MEMORY_H - -/* Support mmx instructions */ -#undef HAVE_MMX - -/* Support SSE (Streaming SIMD Extensions) instructions */ -#undef HAVE_SSE - -/* Support SSE2 (Streaming SIMD Extensions 2) instructions */ -#undef HAVE_SSE2 - -/* Support SSE3 (Streaming SIMD Extensions 3) instructions */ -#undef HAVE_SSE3 - -/* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */ -#undef HAVE_SSE4_1 - -/* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */ -#undef HAVE_SSE4_2 - -/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */ -#undef HAVE_SSSE3 - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDINT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDLIB_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRINGS_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRING_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_STAT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_TYPES_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_UNISTD_H - -/* Define to the sub-directory in which libtool stores uninstalled libraries. - */ -#undef LT_OBJDIR - -/* Name of package */ -#undef PACKAGE - -/* Define to the address where bug reports for this package should be sent. */ -#undef PACKAGE_BUGREPORT - -/* Define to the full name of this package. */ -#undef PACKAGE_NAME - -/* Define to the full name and version of this package. */ -#undef PACKAGE_STRING - -/* Define to the one symbol short name of this package. */ -#undef PACKAGE_TARNAME - -/* Define to the home page for this package. */ -#undef PACKAGE_URL - -/* Define to the version of this package. */ -#undef PACKAGE_VERSION - -/* Define to 1 if you have the ANSI C header files. 
*/ -#undef STDC_HEADERS - -/* Version number of package */ -#undef VERSION diff -Nru ceph-12.2.11/src/erasure-code/jerasure/jerasure/makefile.orig ceph-12.2.12/src/erasure-code/jerasure/jerasure/makefile.orig --- ceph-12.2.11/src/erasure-code/jerasure/jerasure/makefile.orig 2017-04-10 09:15:19.000000000 +0000 +++ ceph-12.2.12/src/erasure-code/jerasure/jerasure/makefile.orig 1970-01-01 00:00:00.000000000 +0000 @@ -1,79 +0,0 @@ -# Makefile -# James S. Plank -# -# JERASURE - Library for Erasure Coding -# Copright (C) 2007 James S. Plank -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -# -# James S. 
Plank -# Department of Electrical Engineering and Computer Science -# University of Tennessee -# Knoxville, TN 37996 -# plank@cs.utk.edu - -# $Revision: 1.0 $ -# $Date: 2007/09/25 15:12:20 $ - -UNAME := $(shell uname) - -ifeq ($(UNAME), Linux) -LIBARGS=-shared -Wl,-soname,libJerasure.so.0 -endif -ifeq ($(UNAME), Darwin) -LIBARGS=-shared -Wl,-install_name,libJerasure.so.0 -endif - -PREFIX=/usr/local -BINDIR=${PREFIX}/bin -LIBDIR=${PREFIX}/lib -INCDIR=${PREFIX}/include - -CC = gcc -CFLAGS = -O3 -I${INCDIR} -L${LIBDIR} -fPIC - -ALL = galois.o jerasure.o reed_sol.o cauchy.o liberation.o lib/libJerasure.so -OBJS = galois.o jerasure.o reed_sol.o cauchy.o liberation.o - -all: $(ALL) - -clean: - rm -f core *.o $(ALL) a.out lib/libJerasure.so.0 - -lib: - mkdir -p lib - -.SUFFIXES: .c .o -.c.o: - $(CC) $(CFLAGS) -c $*.c - -galois.o: galois.h -jerasure.o: jerasure.h galois.h -reed_sol.o: jerasure.h galois.h reed_sol.h -cauchy.o: jerasure.h galois.h cauchy.h -liberation.o: jerasure.h galois.h liberation.h - -lib/libJerasure.so: lib/libJerasure.so.0 - ln -sf libJerasure.so.0 lib/libJerasure.so - -lib/libJerasure.so.0: lib $(OBJS) - $(CC) $(LIBARGS) \ - -o lib/libJerasure.so.0 $(OBJS) -lgf_complete - -install: lib/libJerasure.so - cp -P lib/libJerasure.so* ${LIBDIR} - mkdir -p ${INCDIR} - cp *.h ${INCDIR} - diff -Nru ceph-12.2.11/src/.git_version ceph-12.2.12/src/.git_version --- ceph-12.2.11/src/.git_version 2019-01-30 15:55:46.000000000 +0000 +++ ceph-12.2.12/src/.git_version 2019-04-11 12:36:34.000000000 +0000 @@ -1,2 +1,2 @@ -26dc3775efc7bb286a1d6d66faee0ba30ea23eee -v12.2.11 +1436006594665279fe734b4c15d7e08c13ebd777 +v12.2.12 diff -Nru ceph-12.2.11/src/journal/Journaler.cc ceph-12.2.12/src/journal/Journaler.cc --- ceph-12.2.11/src/journal/Journaler.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/journal/Journaler.cc 2019-04-11 12:33:50.000000000 +0000 @@ -211,8 +211,8 @@ void Journaler::create(uint8_t order, uint8_t splay_width, int64_t pool_id, Context 
*on_finish) { - if (order > 64 || order < 12) { - lderr(m_cct) << "order must be in the range [12, 64]" << dendl; + if (order > 26 || order < 12) { + lderr(m_cct) << "order must be in the range [12, 26]" << dendl; on_finish->complete(-EDOM); return; } diff -Nru ceph-12.2.11/src/log/test.cc ceph-12.2.12/src/log/test.cc --- ceph-12.2.11/src/log/test.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/log/test.cc 2019-04-11 12:33:50.000000000 +0000 @@ -19,7 +19,7 @@ Log log(&subs); log.start(); - log.set_log_file("/tmp/foo"); + log.set_log_file("foo"); log.reopen_log_file(); log.set_stderr_level(5, -1); @@ -53,7 +53,7 @@ subs.add(1, "foo", 1, 1); Log log(&subs); log.start(); - log.set_log_file("/tmp/big"); + log.set_log_file("big"); log.reopen_log_file(); for (int i=0; i #include #define dout_context g_ceph_context @@ -78,7 +79,9 @@ auto since = std::chrono::duration(now-last_send).count(); auto interval = beacon_interval; if (since >= interval*.90) { - _send(); + if (!_send()) { + interval = 0.5; /* 500ms */ + } } else { interval -= since; } @@ -183,7 +186,7 @@ /** * Call periodically, or when you have updated the desired state */ -void Beacon::_send() +bool Beacon::_send() { auto now = clock::now(); auto since = std::chrono::duration(now-last_acked_stamp).count(); @@ -192,7 +195,7 @@ /* If anything isn't progressing, let avoid sending a beacon so that * the MDS will consider us laggy */ dout(0) << "Skipping beacon heartbeat to monitors (last acked " << since << "s ago); MDS internal heartbeat is not healthy!" 
<< dendl; - return; + return false; } ++last_seq; @@ -225,6 +228,7 @@ } monc->send_mon_message(beacon); last_send = now; + return true; } /** @@ -385,40 +389,27 @@ set sessions; mds->sessionmap.get_client_session_set(sessions); - auto mds_recall_state_timeout = g_conf->mds_recall_state_timeout; - auto last_recall = mds->mdcache->last_recall_state; - auto last_recall_span = std::chrono::duration(clock::now()-last_recall).count(); - bool recall_state_timedout = last_recall_span > mds_recall_state_timeout; - + const auto recall_warning_threshold = g_conf->get_val("mds_recall_warning_threshold"); + const auto max_completed_requests = g_conf->mds_max_completed_requests; + const auto max_completed_flushes = g_conf->mds_max_completed_flushes; std::list late_recall_metrics; std::list large_completed_requests_metrics; for (auto& session : sessions) { - if (session->recalled_at != Session::time::min()) { - auto last_recall_sent = session->last_recall_sent; - auto recalled_at = session->recalled_at; - auto recalled_at_span = std::chrono::duration(clock::now()-recalled_at).count(); - - dout(20) << "Session servicing RECALL " << session->info.inst - << ": " << recalled_at_span << "s ago " << session->recall_release_count - << "/" << session->recall_count << dendl; - if (recall_state_timedout || last_recall_sent < last_recall) { - dout(20) << " no longer recall" << dendl; - session->clear_recalled_at(); - } else if (recalled_at_span > mds_recall_state_timeout) { - dout(20) << " exceeded timeout " << recalled_at_span << " vs. " << mds_recall_state_timeout << dendl; - std::ostringstream oss; - oss << "Client " << session->get_human_name() << " failing to respond to cache pressure"; - MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str()); - m.metadata["client_id"] = stringify(session->info.inst.name.num()); - late_recall_metrics.push_back(m); - } else { - dout(20) << " within timeout " << recalled_at_span << " vs. 
" << mds_recall_state_timeout << dendl; - } + const uint64_t recall_caps = fmax(0.0, session->get_recall_caps()); /* In Luminous: decay counter may go negative due to hit */ + if (recall_caps > recall_warning_threshold) { + dout(2) << "Session " << *session << + " is not releasing caps fast enough. Recalled caps at " << recall_caps + << " > " << recall_warning_threshold << " (mds_recall_warning_threshold)." << dendl; + std::ostringstream oss; + oss << "Client " << session->get_human_name() << " failing to respond to cache pressure"; + MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str()); + m.metadata["client_id"] = stringify(session->get_client()); + late_recall_metrics.push_back(m); } if ((session->get_num_trim_requests_warnings() > 0 && - session->get_num_completed_requests() >= g_conf->mds_max_completed_requests) || + session->get_num_completed_requests() >= max_completed_requests) || (session->get_num_trim_flushes_warnings() > 0 && - session->get_num_completed_flushes() >= g_conf->mds_max_completed_flushes)) { + session->get_num_completed_flushes() >= max_completed_flushes)) { std::ostringstream oss; oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid"; MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str()); diff -Nru ceph-12.2.11/src/mds/Beacon.h ceph-12.2.12/src/mds/Beacon.h --- ceph-12.2.11/src/mds/Beacon.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/Beacon.h 2019-04-11 12:33:50.000000000 +0000 @@ -85,7 +85,7 @@ private: void _notify_mdsmap(MDSMap const *mdsmap); - void _send(); + bool _send(); mutable std::mutex mutex; std::thread sender; diff -Nru ceph-12.2.11/src/mds/Capability.cc ceph-12.2.12/src/mds/Capability.cc --- ceph-12.2.11/src/mds/Capability.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/Capability.cc 2019-04-11 12:33:50.000000000 +0000 @@ -13,6 +13,7 @@ */ #include "Capability.h" +#include "SessionMap.h" #include "common/Formatter.h" 
@@ -140,6 +141,85 @@ /* * Capability */ +Capability::Capability(CInode *i, Session *s, uint64_t id) : + client_follows(0), + client_xattr_version(0), client_inline_version(0), + last_rbytes(0), last_rsize(0), + item_session_caps(this), item_snaprealm_caps(this), + item_revoking_caps(this), item_client_revoking_caps(this), + inode(i), session(s), + cap_id(id), _wanted(0), num_revoke_warnings(0), + _pending(0), _issued(0), last_sent(0), last_issue(0), mseq(0), + suppress(0), state(0) +{ + if (session) { + session->touch_cap_bottom(this); + cap_gen = session->get_cap_gen(); + } +} + +client_t Capability::get_client() const +{ + return session ? session->get_client() : client_t(-1); +} + +bool Capability::is_stale() const +{ + return session ? session->is_stale() : false; +} + +bool Capability::is_valid() const +{ + return !session || session->get_cap_gen() == cap_gen; +} + +void Capability::revalidate() +{ + if (is_valid()) + return; + + if (_pending & ~CEPH_CAP_PIN) + inc_last_seq(); + + bool was_revoking = _issued & ~_pending; + _pending = _issued = CEPH_CAP_PIN; + _revokes.clear(); + + cap_gen = session->get_cap_gen(); + + if (was_revoking) + maybe_clear_notable(); +} + +void Capability::mark_notable() +{ + state |= STATE_NOTABLE; + session->touch_cap(this); +} + +void Capability::maybe_clear_notable() +{ + if ((_issued == _pending) && + !is_clientwriteable() && + !is_wanted_notable(_wanted)) { + ceph_assert(is_notable()); + state &= ~STATE_NOTABLE; + session->touch_cap_bottom(this); + } +} + +void Capability::set_wanted(int w) { + CInode *in = get_inode(); + if (in) { + if (!is_wanted_notable(_wanted) && is_wanted_notable(w)) { + if (!is_notable()) + mark_notable(); + } else if (is_wanted_notable(_wanted) && !is_wanted_notable(w)) { + maybe_clear_notable(); + } + } + _wanted = w; +} void Capability::encode(bufferlist& bl) const { @@ -164,7 +244,7 @@ ::decode(_revokes, bl); DECODE_FINISH(bl); - _calc_issued(); + calc_issued(); } void Capability::dump(Formatter *f) 
const diff -Nru ceph-12.2.11/src/mds/Capability.h ceph-12.2.12/src/mds/Capability.h --- ceph-12.2.11/src/mds/Capability.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/Capability.h 2019-04-11 12:33:50.000000000 +0000 @@ -61,6 +61,7 @@ */ class CInode; +class Session; namespace ceph { class Formatter; @@ -109,41 +110,33 @@ static void generate_test_instances(list& ls); }; - - const static unsigned STATE_STALE = (1<<0); + const static unsigned STATE_NOTABLE = (1<<0); const static unsigned STATE_NEW = (1<<1); const static unsigned STATE_IMPORTING = (1<<2); + const static unsigned STATE_CLIENTWRITEABLE = (1<<4); - - Capability(CInode *i = NULL, uint64_t id = 0, client_t c = 0) : - client_follows(0), client_xattr_version(0), - client_inline_version(0), - last_rbytes(0), last_rsize(0), - item_session_caps(this), item_snaprealm_caps(this), - item_revoking_caps(this), item_client_revoking_caps(this), - inode(i), client(c), - cap_id(id), - _wanted(0), num_revoke_warnings(0), - _pending(0), _issued(0), - last_sent(0), - last_issue(0), - mseq(0), - suppress(0), state(0) { - } + Capability(CInode *i=nullptr, Session *s=nullptr, uint64_t id=0); Capability(const Capability& other); // no copying const Capability& operator=(const Capability& other); // no copying - int pending() { return _pending; } - int issued() { return _issued; } - bool is_null() { return !_pending && _revokes.empty(); } + int pending() const { + return is_valid() ? _pending : (_pending & CEPH_CAP_PIN); + } + int issued() const { + return is_valid() ? _issued : (_issued & CEPH_CAP_PIN); + } ceph_seq_t issue(unsigned c) { + revalidate(); + if (_pending & ~c) { // revoking (and maybe adding) bits. note caps prior to this revocation _revokes.emplace_back(_pending, last_sent, last_issue); _pending = c; _issued |= c; + if (!is_notable()) + mark_notable(); } else if (~_pending & c) { // adding bits only. remove obsolete revocations? 
_pending |= c; @@ -157,23 +150,20 @@ assert(_pending == c); } //last_issue = - ++last_sent; + inc_last_seq(); return last_sent; } ceph_seq_t issue_norevoke(unsigned c) { + revalidate(); + _pending |= c; _issued |= c; //check_rdcaps_list(); - ++last_sent; + inc_last_seq(); return last_sent; } - void _calc_issued() { - _issued = _pending; - for (const auto &r : _revokes) { - _issued |= r.before; - } - } void confirm_receipt(ceph_seq_t seq, unsigned caps) { + bool was_revoking = (_issued & ~_pending); if (seq == last_sent) { _revokes.clear(); _issued = caps; @@ -186,16 +176,17 @@ if (!_revokes.empty()) { if (_revokes.front().seq == seq) _revokes.begin()->before = caps; - _calc_issued(); + calc_issued(); } else { // seq < last_sent _issued = caps | _pending; } } - if (_issued == _pending) { + if (was_revoking && _issued == _pending) { item_revoking_caps.remove_myself(); item_client_revoking_caps.remove_myself(); + maybe_clear_notable(); } //check_rdcaps_list(); } @@ -208,19 +199,20 @@ changed = true; } if (changed) { - _calc_issued(); - if (_issued == _pending) { + bool was_revoking = (_issued & ~_pending); + calc_issued(); + if (was_revoking && _issued == _pending) { item_revoking_caps.remove_myself(); item_client_revoking_caps.remove_myself(); + maybe_clear_notable(); } } } ceph_seq_t get_mseq() { return mseq; } void inc_mseq() { mseq++; } - ceph_seq_t get_last_sent() { return last_sent; } - utime_t get_last_issue_stamp() { return last_issue_stamp; } - utime_t get_last_revoke_stamp() { return last_revoke_stamp; } + utime_t get_last_issue_stamp() const { return last_issue_stamp; } + utime_t get_last_revoke_stamp() const { return last_revoke_stamp; } void set_last_issue() { last_issue = last_sent; } void set_last_issue_stamp(utime_t t) { last_issue_stamp = t; } @@ -238,29 +230,49 @@ void inc_suppress() { suppress++; } void dec_suppress() { suppress--; } - bool is_stale() { return state & STATE_STALE; } - void mark_stale() { state |= STATE_STALE; } - void clear_stale() 
{ state &= ~STATE_STALE; } - bool is_new() { return state & STATE_NEW; } + static bool is_wanted_notable(int wanted) { + return wanted & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_WR|CEPH_CAP_FILE_RD); + } + bool is_notable() const { return state & STATE_NOTABLE; } + + bool is_stale() const; + bool is_new() const { return state & STATE_NEW; } void mark_new() { state |= STATE_NEW; } void clear_new() { state &= ~STATE_NEW; } bool is_importing() { return state & STATE_IMPORTING; } void mark_importing() { state |= STATE_IMPORTING; } void clear_importing() { state &= ~STATE_IMPORTING; } - CInode *get_inode() { return inode; } - client_t get_client() const { return client; } + bool is_clientwriteable() const { return state & STATE_CLIENTWRITEABLE; } + void mark_clientwriteable() { + if (!is_clientwriteable()) { + state |= STATE_CLIENTWRITEABLE; + if (!is_notable()) + mark_notable(); + } + } + void clear_clientwriteable() { + if (is_clientwriteable()) { + state &= ~STATE_CLIENTWRITEABLE; + maybe_clear_notable(); + } + } + + CInode *get_inode() const { return inode; } + Session *get_session() const { return session; } + client_t get_client() const; // caps this client wants to hold - int wanted() { return _wanted; } - void set_wanted(int w) { - _wanted = w; - //check_rdcaps_list(); - } + int wanted() const { return _wanted; } + void set_wanted(int w); void inc_last_seq() { last_sent++; } - ceph_seq_t get_last_seq() { return last_sent; } - ceph_seq_t get_last_issue() { return last_issue; } + ceph_seq_t get_last_seq() const { + if (!is_valid() && (_pending & ~CEPH_CAP_PIN)) + return last_sent + 1; + return last_sent; + } + ceph_seq_t get_last_issue() const { return last_issue; } void reset_seq() { last_sent = 0; @@ -268,8 +280,8 @@ } // -- exports -- - Export make_export() { - return Export(cap_id, _wanted, issued(), pending(), client_follows, last_sent, mseq+1, last_issue_stamp); + Export make_export() const { + return Export(cap_id, wanted(), issued(), pending(), client_follows, 
get_last_seq(), mseq+1, last_issue_stamp); } void merge(const Export& other, bool auth_cap) { if (!is_stale()) { @@ -287,7 +299,7 @@ client_follows = other.client_follows; // wanted - _wanted = _wanted | other.wanted; + set_wanted(wanted() | other.wanted); if (auth_cap) mseq = other.mseq; } @@ -304,7 +316,7 @@ } // wanted - _wanted = _wanted | otherwanted; + set_wanted(wanted() | otherwanted); } void revoke() { @@ -332,9 +344,10 @@ private: CInode *inode; - client_t client; + Session *session; uint64_t cap_id; + uint32_t cap_gen; __u32 _wanted; // what the client wants (ideally) @@ -354,6 +367,19 @@ int suppress; unsigned state; + + void calc_issued() { + _issued = _pending; + for (const auto &r : _revokes) { + _issued |= r.before; + } + } + + bool is_valid() const; + void revalidate(); + + void mark_notable(); + void maybe_clear_notable(); }; WRITE_CLASS_ENCODER(Capability::Export) diff -Nru ceph-12.2.11/src/mds/CInode.cc ceph-12.2.12/src/mds/CInode.cc --- ceph-12.2.11/src/mds/CInode.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/CInode.cc 2019-04-11 12:33:50.000000000 +0000 @@ -218,7 +218,7 @@ if (it->second->issued() != it->second->pending()) out << "/" << ccap_string(it->second->issued()); out << "/" << ccap_string(it->second->wanted()) - << "@" << it->second->get_last_sent(); + << "@" << it->second->get_last_seq(); } out << "}"; if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) { @@ -2812,14 +2812,10 @@ if (client_caps.empty()) mdcache->num_inodes_with_caps++; - Capability *cap = new Capability(this, ++mdcache->last_cap_id, client); + Capability *cap = new Capability(this, session, ++mdcache->last_cap_id); assert(client_caps.count(client) == 0); client_caps[client] = cap; - session->add_cap(cap); - if (session->is_stale()) - cap->mark_stale(); - cap->client_follows = first-1; containing_realm->add_cap(client, cap); @@ -4264,7 +4260,7 @@ f->dump_string("pending", ccap_string(it->second->pending())); f->dump_string("issued", 
ccap_string(it->second->issued())); f->dump_string("wanted", ccap_string(it->second->wanted())); - f->dump_int("last_sent", it->second->get_last_sent()); + f->dump_int("last_sent", it->second->get_last_seq()); f->close_section(); } f->close_section(); diff -Nru ceph-12.2.11/src/mds/Locker.cc ceph-12.2.12/src/mds/Locker.cc --- ceph-12.2.11/src/mds/Locker.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/Locker.cc 2019-04-11 12:33:50.000000000 +0000 @@ -759,8 +759,7 @@ bool need_issue = false; if (lock->get_state() == LOCK_PREXLOCK) { _finish_xlock(lock, -1, &need_issue); - } else if (lock->get_state() == LOCK_LOCK_XLOCK && - lock->get_num_xlocks() == 0) { + } else if (lock->get_state() == LOCK_LOCK_XLOCK) { lock->set_state(LOCK_XLOCKDONE); eval_gather(lock, true, &need_issue); } @@ -819,6 +818,24 @@ issue_caps_set(need_issue); } +void Locker::drop_locks_for_fragment_unfreeze(MutationImpl *mut) +{ + set need_issue; + + for (auto it = mut->locks.begin(); it != mut->locks.end(); ) { + SimpleLock *lock = *it; + ++it; + if (lock->get_type() == CEPH_LOCK_IDFT) { + continue; + } + bool ni = false; + wrlock_finish(lock, mut, &ni); + if (ni) + need_issue.insert(static_cast(lock->get_parent())); + } + issue_caps_set(need_issue); +} + // generics void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, list *pfinishers) @@ -1626,11 +1643,17 @@ dout(7) << "xlock_start on " << *lock << " on " << *lock->get_parent() << dendl; client_t client = mut->get_client(); + CInode *in = nullptr; + if (lock->get_cap_shift()) + in = static_cast(lock->get_parent()); + // auth? 
if (lock->get_parent()->is_auth()) { // auth while (1) { - if (lock->can_xlock(client)) { + if (lock->can_xlock(client) && + !(lock->get_state() == LOCK_LOCK_XLOCK && // client is not xlocker or + in && in->issued_caps_need_gather(lock))) { // xlocker does not hold shared cap lock->set_state(LOCK_XLOCK); lock->get_xlock(mut, client); mut->xlocks.insert(lock); @@ -1639,11 +1662,9 @@ return true; } - if (lock->get_type() == CEPH_LOCK_IFILE) { - CInode *in = static_cast(lock->get_parent()); - if (in->state_test(CInode::STATE_RECOVERING)) { - mds->mdcache->recovery_queue.prioritize(in); - } + if (lock->get_type() == CEPH_LOCK_IFILE && + in->state_test(CInode::STATE_RECOVERING)) { + mds->mdcache->recovery_queue.prioritize(in); } if (!lock->is_stable() && (lock->get_state() != LOCK_XLOCKDONE || @@ -1764,9 +1785,8 @@ SimpleLock::WAIT_WR | SimpleLock::WAIT_RD, 0); } else { - if (lock->get_num_xlocks() == 0) { - if (lock->get_state() == LOCK_LOCK_XLOCK) - lock->set_state(LOCK_XLOCKDONE); + if (lock->get_num_xlocks() == 0 && + lock->get_state() != LOCK_LOCK_XLOCK) { // no one is taking xlock _finish_xlock(lock, xlocker, &do_issue); } } @@ -1914,10 +1934,9 @@ bool is_new; // if replay, try to reconnect cap, and otherwise do nothing. - if (is_replay) { - mds->mdcache->try_reconnect_cap(in, session); - return 0; - } + if (is_replay) + return mds->mdcache->try_reconnect_cap(in, session); + // my needs assert(session->info.inst.name.is_client()); @@ -2124,19 +2143,27 @@ check_inode_max_size(in); } - -void Locker::revoke_stale_caps(Capability *cap) +void Locker::revoke_stale_caps(Session *session) { - CInode *in = cap->get_inode(); - if (in->state_test(CInode::STATE_EXPORTINGCAPS)) { - // if export succeeds, the cap will be removed. if export fails, we need to - // revoke the cap if it's still stale. 
- in->state_set(CInode::STATE_EVALSTALECAPS); - return; - } + dout(10) << "revoke_stale_caps for " << session->info.inst.name << dendl; - int issued = cap->issued(); - if (issued & ~CEPH_CAP_PIN) { + std::vector to_eval; + + for (auto p = session->caps.begin(); !p.end(); ) { + Capability *cap = *p; + ++p; + if (!cap->is_notable()) { + // the rest ones are not being revoked and don't have writeable range + // and don't want exclusive caps or want file read/write. They don't + // need recover, they don't affect eval_gather()/try_eval() + break; + } + + int issued = cap->issued(); + if (!(issued & ~CEPH_CAP_PIN)) + continue; + + CInode *in = cap->get_inode(); dout(10) << " revoking " << ccap_string(issued) << " on " << *in << dendl; cap->revoke(); @@ -2144,27 +2171,31 @@ in->inode.client_ranges.count(cap->get_client())) in->state_set(CInode::STATE_NEEDSRECOVER); - if (!in->filelock.is_stable()) eval_gather(&in->filelock); - if (!in->linklock.is_stable()) eval_gather(&in->linklock); - if (!in->authlock.is_stable()) eval_gather(&in->authlock); - if (!in->xattrlock.is_stable()) eval_gather(&in->xattrlock); - - if (in->is_auth()) { - try_eval(in, CEPH_CAP_LOCKS); - } else { - request_inode_file_caps(in); - } + // eval lock/inode may finish contexts, which may modify other cap's position + // in the session->caps. 
+ to_eval.push_back(in); } -} -void Locker::revoke_stale_caps(Session *session) -{ - dout(10) << "revoke_stale_caps for " << session->info.inst.name << dendl; + // invalidate the rest + session->inc_cap_gen(); - for (xlist::iterator p = session->caps.begin(); !p.end(); ++p) { - Capability *cap = *p; - cap->mark_stale(); - revoke_stale_caps(cap); + for (auto in : to_eval) { + if (in->state_test(CInode::STATE_EXPORTINGCAPS)) + continue; + + if (!in->filelock.is_stable()) + eval_gather(&in->filelock); + if (!in->linklock.is_stable()) + eval_gather(&in->linklock); + if (!in->authlock.is_stable()) + eval_gather(&in->authlock); + if (!in->xattrlock.is_stable()) + eval_gather(&in->xattrlock); + + if (in->is_auth()) + try_eval(in, CEPH_CAP_LOCKS); + else + request_inode_file_caps(in); } } @@ -2172,24 +2203,25 @@ { dout(10) << "resume_stale_caps for " << session->info.inst.name << dendl; - for (xlist::iterator p = session->caps.begin(); !p.end(); ++p) { + for (xlist::iterator p = session->caps.begin(); !p.end(); ) { Capability *cap = *p; + ++p; + if (!cap->is_notable()) + break; // see revoke_stale_caps() + CInode *in = cap->get_inode(); - assert(in->is_head()); - if (cap->is_stale()) { - dout(10) << " clearing stale flag on " << *in << dendl; - cap->clear_stale(); - - if (in->state_test(CInode::STATE_EXPORTINGCAPS)) { - // if export succeeds, the cap will be removed. if export fails, - // we need to re-issue the cap if it's not stale. - in->state_set(CInode::STATE_EVALSTALECAPS); - continue; - } + ceph_assert(in->is_head()); + dout(10) << " clearing stale flag on " << *in << dendl; - if (!in->is_auth() || !eval(in, CEPH_CAP_LOCKS)) - issue_caps(in, cap); + if (in->state_test(CInode::STATE_EXPORTINGCAPS)) { + // if export succeeds, the cap will be removed. if export fails, + // we need to re-issue the cap if it's not stale. 
+ in->state_set(CInode::STATE_EVALSTALECAPS); + continue; } + + if (!in->is_auth() || !eval(in, CEPH_CAP_LOCKS)) + issue_caps(in, cap); } } @@ -2257,7 +2289,13 @@ void Locker::handle_inode_file_caps(MInodeFileCaps *m) { // nobody should be talking to us during recovery. - assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping()); + if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) { + if (mds->get_want_state() >= MDSMap::STATE_CLIENTREPLAY) { + mds->wait_for_replay(new C_MDS_RetryMessage(mds, m)); + return; + } + assert(!"got unexpected message during recovery"); + } // ok CInode *in = mdcache->get_inode(m->get_ino()); @@ -2310,13 +2348,13 @@ return ROUND_UP_TO(new_max, pi->get_layout_size_increment()); } -void Locker::calc_new_client_ranges(CInode *in, uint64_t size, +void Locker::calc_new_client_ranges(CInode *in, uint64_t size, bool update, CInode::mempool_inode::client_range_map *new_ranges, bool *max_increased) { auto latest = in->get_projected_inode(); uint64_t ms; - if(latest->has_layout()) { + if (latest->has_layout()) { ms = calc_new_max_size(latest, size); } else { // Layout-less directories like ~mds0/, have zero size @@ -2328,7 +2366,7 @@ for (map::iterator p = in->client_caps.begin(); p != in->client_caps.end(); ++p) { - if ((p->second->issued() | p->second->wanted()) & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER)) { + if ((p->second->issued() | p->second->wanted()) & (CEPH_CAP_ANY_FILE_WR)) { client_writeable_range_t& nr = (*new_ranges)[p->first]; nr.range.first = 0; if (latest->client_ranges.count(p->first)) { @@ -2342,6 +2380,11 @@ nr.range.last = ms; nr.follows = in->first - 1; } + if (update) + p->second->mark_clientwriteable(); + } else { + if (update) + p->second->clear_clientwriteable(); } } } @@ -2367,7 +2410,23 @@ update_size = false; } - calc_new_client_ranges(in, max(new_max_size, size), &new_ranges, &max_increased); + int can_update = 1; + if (in->is_frozen()) { + can_update = -1; + } else if (!force_wrlock && 
!in->filelock.can_wrlock(in->get_loner())) { + // lock? + if (in->filelock.is_stable()) { + if (in->get_target_loner() >= 0) + file_excl(&in->filelock); + else + simple_lock(&in->filelock); + } + if (!in->filelock.can_wrlock(in->get_loner())) + can_update = -2; + } + + calc_new_client_ranges(in, std::max(new_max_size, size), can_update > 0, + &new_ranges, &max_increased); if (max_increased || latest->client_ranges != new_ranges) update_max = true; @@ -2381,34 +2440,16 @@ << " update_size " << update_size << " on " << *in << dendl; - if (in->is_frozen()) { - dout(10) << "check_inode_max_size frozen, waiting on " << *in << dendl; - C_MDL_CheckMaxSize *cms = new C_MDL_CheckMaxSize(this, in, - new_max_size, - new_size, - new_mtime); - in->add_waiter(CInode::WAIT_UNFREEZE, cms); - return false; - } - if (!force_wrlock && !in->filelock.can_wrlock(in->get_loner())) { - // lock? - if (in->filelock.is_stable()) { - if (in->get_target_loner() >= 0) - file_excl(&in->filelock); - else - simple_lock(&in->filelock); - } - if (!in->filelock.can_wrlock(in->get_loner())) { - // try again later - C_MDL_CheckMaxSize *cms = new C_MDL_CheckMaxSize(this, in, - new_max_size, - new_size, - new_mtime); - + if (can_update < 0) { + auto cms = new C_MDL_CheckMaxSize(this, in, new_max_size, new_size, new_mtime); + if (can_update == -1) { + dout(10) << "check_inode_max_size frozen, waiting on " << *in << dendl; + in->add_waiter(CInode::WAIT_UNFREEZE, cms); + } else { in->filelock.add_waiter(SimpleLock::WAIT_STABLE, cms); dout(10) << "check_inode_max_size can't wrlock, waiting on " << *in << dendl; - return false; } + return false; } MutationRef mut(new MutationImpl()); @@ -3030,7 +3071,7 @@ void Locker::kick_issue_caps(CInode *in, client_t client, ceph_seq_t seq) { Capability *cap = in->get_client_cap(client); - if (!cap || cap->get_last_sent() != seq) + if (!cap || cap->get_last_seq() != seq) return; if (in->is_frozen()) { dout(10) << "kick_issue_caps waiting for unfreeze on " << *in << dendl; 
@@ -3381,8 +3422,13 @@ cr.range.first = 0; cr.range.last = new_max; cr.follows = in->first - 1; - } else + if (cap) + cap->mark_clientwriteable(); + } else { pi.inode.client_ranges.erase(client); + if (cap) + cap->clear_clientwriteable(); + } } if (change_max || (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) @@ -3506,18 +3552,20 @@ eval_cap_gather(in); return; } - remove_client_cap(in, client); + remove_client_cap(in, cap); } -/* This function DOES put the passed message before returning */ - -void Locker::remove_client_cap(CInode *in, client_t client) +void Locker::remove_client_cap(CInode *in, Capability *cap) { + client_t client = cap->get_client(); // clean out any pending snapflush state if (!in->client_need_snapflush.empty()) _do_null_snapflush(in, client); + bool notable = cap->is_notable(); in->remove_client_cap(client); + if (!notable) + return; if (in->is_auth()) { // make sure we clear out the client byte range diff -Nru ceph-12.2.11/src/mds/Locker.h ceph-12.2.12/src/mds/Locker.h --- ceph-12.2.11/src/mds/Locker.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/Locker.h 2019-04-11 12:33:50.000000000 +0000 @@ -88,6 +88,7 @@ void set_xlocks_done(MutationImpl *mut, bool skip_dentry=false); void drop_non_rdlocks(MutationImpl *mut, set *pneed_issue=0); void drop_rdlocks_for_early_reply(MutationImpl *mut); + void drop_locks_for_fragment_unfreeze(MutationImpl *mut); void eval_gather(SimpleLock *lock, bool first=false, bool *need_issue=0, list *pfinishers=0); void eval(SimpleLock *lock, bool *need_issue); @@ -184,7 +185,7 @@ void kick_cap_releases(MDRequestRef& mdr); void kick_issue_caps(CInode *in, client_t client, ceph_seq_t seq); - void remove_client_cap(CInode *in, client_t client); + void remove_client_cap(CInode *in, Capability *cap); void get_late_revoking_clients(std::list *result, double timeout) const; @@ -244,7 +245,6 @@ void issue_caps_set(set& inset); void issue_truncate(CInode *in); void revoke_stale_caps(Session *session); - void 
revoke_stale_caps(Capability *cap); void resume_stale_caps(Session *session); void remove_stale_leases(Session *session); @@ -258,7 +258,7 @@ private: uint64_t calc_new_max_size(CInode::mempool_inode *pi, uint64_t size); public: - void calc_new_client_ranges(CInode *in, uint64_t size, + void calc_new_client_ranges(CInode *in, uint64_t size, bool update, CInode::mempool_inode::client_range_map* new_ranges, bool *max_increased); bool check_inode_max_size(CInode *in, bool force_wrlock=false, diff -Nru ceph-12.2.11/src/mds/MDBalancer.cc ceph-12.2.12/src/mds/MDBalancer.cc --- ceph-12.2.11/src/mds/MDBalancer.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/MDBalancer.cc 2019-04-11 12:33:50.000000000 +0000 @@ -813,7 +813,7 @@ /* execute the balancer */ Mantle mantle; int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, state.targets); - dout(2) << " mantle decided that new targets=" << state.targets << dendl; + dout(5) << " mantle decided that new targets=" << state.targets << dendl; /* mantle doesn't know about cluster size, so check target len here */ if ((int) state.targets.size() != cluster_size) diff -Nru ceph-12.2.11/src/mds/MDCache.cc ceph-12.2.12/src/mds/MDCache.cc --- ceph-12.2.11/src/mds/MDCache.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/MDCache.cc 2019-04-11 12:33:50.000000000 +0000 @@ -94,6 +94,7 @@ #include "messages/MMDSSlaveRequest.h" #include "messages/MMDSFragmentNotify.h" +#include "messages/MMDSFragmentNotifyAck.h" #include "messages/MGatherCaps.h" @@ -173,7 +174,8 @@ filer(m->objecter, m->finisher), exceeded_size_limit(false), recovery_queue(m), - stray_manager(m, purge_queue_) + stray_manager(m, purge_queue_), + trim_counter(ceph_clock_now(), g_conf->get_val("mds_cache_trim_decay_rate")) { migrator.reset(new Migrator(mds, this)); root = NULL; @@ -243,6 +245,9 @@ cache_health_threshold = g_conf->get_val("mds_health_cache_threshold"); if (changed.count("mds_cache_mid")) 
lru.lru_set_midpoint(g_conf->get_val("mds_cache_mid")); + if (changed.count("mds_cache_trim_decay_rate")) { + trim_counter = DecayCounter(ceph_clock_now(), g_conf->get_val("mds_cache_trim_decay_rate")); + } migrator->handle_conf_change(conf, changed, mdsmap); mds->balancer->handle_conf_change(conf, changed, mdsmap); @@ -2015,14 +2020,15 @@ } } -void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct) +void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change) { if (!in->is_auth() || in->is_frozen()) return; auto i = in->get_projected_inode(); - - if (!i->quota.is_enable()) + + if (!i->quota.is_enable() && + !quota_change) return; for (map::iterator it = in->client_caps.begin(); @@ -3056,9 +3062,19 @@ p != fragments.end(); ) { dirfrag_t df = p->first; fragment_info_t& info = p->second; - ++p; - if (info.is_fragmenting()) + + if (info.is_fragmenting()) { + if (info.notify_ack_waiting.erase(who) && + info.notify_ack_waiting.empty()) { + fragment_drop_locks(info); + fragment_maybe_finish(p++); + } else { + ++p; + } continue; + } + + ++p; dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl; list dirs; info.dirs.swap(dirs); @@ -4643,7 +4659,13 @@ mds_rank_t from = mds_rank_t(strong->get_source().num()); // only a recovering node will get a strong rejoin. 
- assert(mds->is_rejoin()); + if (!mds->is_rejoin()) { + if (mds->get_want_state() == MDSMap::STATE_REJOIN) { + mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong)); + return; + } + assert(!"got unexpected rejoin message during recovery"); + } // assimilate any potentially dirty scatterlock state for (map::iterator p = strong->inode_scatterlocks.begin(); @@ -5688,12 +5710,13 @@ } } -void MDCache::try_reconnect_cap(CInode *in, Session *session) +Capability* MDCache::try_reconnect_cap(CInode *in, Session *session) { client_t client = session->info.get_client(); + Capability *cap = nullptr; const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client); if (rc) { - in->reconnect_cap(client, *rc, session); + cap = in->reconnect_cap(client, *rc, session); dout(10) << "try_reconnect_cap client." << client << " reconnect wanted " << ccap_string(rc->capinfo.wanted) << " issue " << ccap_string(rc->capinfo.issued) @@ -5721,6 +5744,7 @@ cap_reconnect_waiters.erase(it); } } + return cap; } @@ -6228,7 +6252,9 @@ p != in->inode.client_ranges.end(); ++p) { Capability *cap = in->get_client_cap(p->first); - if (!cap) { + if (cap) { + cap->mark_clientwriteable(); + } else { dout(10) << " client." 
<< p->first << " has range " << p->second << " but no cap on " << *in << dendl; recover = true; break; @@ -6472,12 +6498,14 @@ // ================================================================================ // cache trimming -void MDCache::trim_lru(uint64_t count, map &expiremap) +std::pair MDCache::trim_lru(uint64_t count, map &expiremap) { bool is_standby_replay = mds->is_standby_replay(); std::vector unexpirables; uint64_t trimmed = 0; + auto trim_threshold = g_conf->get_val("mds_cache_trim_threshold"); + dout(7) << "trim_lru trimming " << count << " items from LRU" << " size=" << lru.lru_get_size() @@ -6486,7 +6514,11 @@ << " pinned=" << lru.lru_get_num_pinned() << dendl; - for (;;) { + const uint64_t trim_counter_start = trim_counter.get(ceph_clock_now()); + bool throttled = false; + while (1) { + throttled |= trim_counter_start+trimmed >= trim_threshold; + if (throttled) break; CDentry *dn = static_cast(bottom_lru.lru_expire()); if (!dn) break; @@ -6503,7 +6535,9 @@ unexpirables.clear(); // trim dentries from the LRU until count is reached - while (cache_toofull() || count > 0) { + while (!throttled && (cache_toofull() || count > 0)) { + throttled |= trim_counter_start+trimmed >= trim_threshold; + if (throttled) break; CDentry *dn = static_cast(lru.lru_expire()); if (!dn) { break; @@ -6518,6 +6552,7 @@ if (count > 0) count--; } } + trim_counter.hit(ceph_clock_now(), trimmed); for (auto &dn : unexpirables) { lru.lru_insert_mid(dn); @@ -6525,6 +6560,7 @@ unexpirables.clear(); dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl; + return std::pair(throttled, trimmed); } /* @@ -6533,7 +6569,7 @@ * * @param count is number of dentries to try to expire */ -bool MDCache::trim(uint64_t count) +std::pair MDCache::trim(uint64_t count) { uint64_t used = cache_size(); uint64_t limit = cache_memory_limit; @@ -6547,7 +6583,8 @@ // process delayed eval_stray() stray_manager.advance_delayed(); - trim_lru(count, expiremap); + auto result = trim_lru(count, 
expiremap); + auto& trimmed = result.second; // trim non-auth, non-bound subtrees for (auto p = subtrees.begin(); p != subtrees.end();) { @@ -6563,6 +6600,7 @@ continue; migrator->export_empty_import(dir); + ++trimmed; } } else { if (!diri->is_auth()) { @@ -6579,6 +6617,7 @@ rejoin_ack_gather.count(dir->get_dir_auth().first)) continue; trim_dirfrag(dir, 0, expiremap); + ++trimmed; } } } @@ -6589,11 +6628,15 @@ root->get_dirfrags(ls); for (list::iterator p = ls.begin(); p != ls.end(); ++p) { CDir *dir = *p; - if (dir->get_num_ref() == 1) // subtree pin + if (dir->get_num_ref() == 1) { // subtree pin trim_dirfrag(dir, 0, expiremap); + ++trimmed; + } } - if (root->get_num_ref() == 0) + if (root->get_num_ref() == 0) { trim_inode(0, root, 0, expiremap); + ++trimmed; + } } std::set stopping; @@ -6616,11 +6659,15 @@ list ls; mdsdir_in->get_dirfrags(ls); for (auto dir : ls) { - if (dir->get_num_ref() == 1) // subtree pin + if (dir->get_num_ref() == 1) { // subtree pin trim_dirfrag(dir, dir, expiremap); + ++trimmed; + } } - if (mdsdir_in->get_num_ref() == 0) + if (mdsdir_in->get_num_ref() == 0) { trim_inode(NULL, mdsdir_in, NULL, expiremap); + ++trimmed; + } } else { dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl; } @@ -6634,6 +6681,7 @@ dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl; if ((*p)->get_num_ref() == 0) { trim_inode(NULL, *p, NULL, expiremap); + ++trimmed; } } } @@ -6642,7 +6690,7 @@ // send any expire messages send_expire_messages(expiremap); - return true; + return result; } void MDCache::send_expire_messages(map& expiremap) @@ -7482,7 +7530,7 @@ if (CInode::count()) caps_per_inode = (double)Capability::count() / (double)CInode::count(); - dout(2) << "check_memory_usage" + dout(2) << "Memory usage: " << " total " << last.get_total() << ", rss " << last.get_rss() << ", heap " << last.get_heap() @@ -7497,8 +7545,7 @@ mds->mlogger->set(l_mdm_heap, last.get_heap()); if (cache_toofull()) { - last_recall_state = 
clock::now(); - mds->server->recall_client_state(-1.0, false, nullptr); + mds->server->recall_client_state(nullptr); } // If the cache size had exceeded its limit, but we're back in bounds @@ -7508,7 +7555,7 @@ // Only do this once we are back in bounds: otherwise the releases would // slow down whatever process caused us to exceed bounds to begin with if (ceph_using_tcmalloc()) { - dout(2) << "check_memory_usage: releasing unused space from tcmalloc" + dout(5) << "check_memory_usage: releasing unused space from tcmalloc" << dendl; ceph_heap_release_free_memory(); } @@ -7558,7 +7605,7 @@ void MDCache::shutdown_start() { - dout(2) << "shutdown_start" << dendl; + dout(5) << "shutdown_start" << dendl; if (g_conf->mds_shutdown_check) mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this)); @@ -7755,7 +7802,7 @@ } // done! - dout(2) << "shutdown done." << dendl; + dout(5) << "shutdown done." << dendl; return true; } @@ -7921,6 +7968,9 @@ case MSG_MDS_FRAGMENTNOTIFY: handle_fragment_notify(static_cast(m)); break; + case MSG_MDS_FRAGMENTNOTIFYACK: + handle_fragment_notify_ack(static_cast(m)); + break; case MSG_MDS_FINDINO: handle_find_ino(static_cast(m)); @@ -11309,29 +11359,29 @@ class C_MDC_FragmentCommit : public MDCacheLogContext { dirfrag_t basedirfrag; - list resultfrags; + MDRequestRef mdr; public: - C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list& l) : - MDCacheLogContext(m), basedirfrag(df), resultfrags(l) {} + C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, const MDRequestRef& r) : + MDCacheLogContext(m), basedirfrag(df), mdr(r) {} void finish(int r) override { - mdcache->_fragment_committed(basedirfrag, resultfrags); + mdcache->_fragment_committed(basedirfrag, mdr); } }; -class C_IO_MDC_FragmentFinish : public MDCacheIOContext { +class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext { dirfrag_t basedirfrag; - list resultfrags; + int bits; + MDRequestRef mdr; public: - C_IO_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list& l) 
: - MDCacheIOContext(m), basedirfrag(f) { - resultfrags.swap(l); - } + C_IO_MDC_FragmentPurgeOld(MDCache *m, dirfrag_t f, int b, + const MDRequestRef& r) : + MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {} void finish(int r) override { assert(r == 0 || r == -ENOENT); - mdcache->_fragment_finish(basedirfrag, resultfrags); + mdcache->_fragment_old_purged(basedirfrag, bits, mdr); } void print(ostream& out) const override { - out << "dirfrags_commit(" << basedirfrag << ")"; + out << "fragment_purge_old(" << basedirfrag << ")"; } }; @@ -11460,13 +11510,12 @@ void MDCache::_fragment_logged(MDRequestRef& mdr) { dirfrag_t basedirfrag = mdr->more()->fragment_base; - map::iterator it = fragments.find(basedirfrag); - assert(it != fragments.end()); - fragment_info_t &info = it->second; + auto& info = fragments.at(basedirfrag); CInode *diri = info.resultfrags.front()->get_inode(); dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits << " on " << *diri << dendl; + mdr->mark_event("prepare logged"); if (diri->is_auth()) diri->pop_and_dirty_projected_inode(mdr->ls); @@ -11494,23 +11543,46 @@ void MDCache::_fragment_stored(MDRequestRef& mdr) { dirfrag_t basedirfrag = mdr->more()->fragment_base; - map::iterator it = fragments.find(basedirfrag); - assert(it != fragments.end()); - fragment_info_t &info = it->second; - CInode *diri = info.resultfrags.front()->get_inode(); + fragment_info_t &info = fragments.at(basedirfrag); + CDir *first = info.resultfrags.front(); + CInode *diri = first->get_inode(); dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits << " on " << *diri << dendl; + mdr->mark_event("new frags stored"); // tell peers - CDir *first = *info.resultfrags.begin(); + mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ? 
+ diri->authority().first : CDIR_AUTH_UNKNOWN; for (const auto &p : first->get_replicas()) { if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN || (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN && rejoin_gather.count(p.first))) continue; - MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits); + auto notify = new MMDSFragmentNotify(basedirfrag, info.bits, mdr->reqid.tid); + if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root + diri_auth != p.first) { // not auth mds of diri + /* + * In the nornal case, mds does not trim dir inode whose child dirfrags + * are likely being fragmented (see trim_inode()). But when fragmenting + * subtree roots, following race can happen: + * + * - mds.a (auth mds of dirfrag) sends fragment_notify message to + * mds.c and drops wrlock on dirfragtreelock. + * - mds.b (auth mds of dir inode) changes dirfragtreelock state to + * SYNC and send lock message mds.c + * - mds.c receives the lock message and changes dirfragtreelock state + * to SYNC + * - mds.c trim dirfrag and dir inode from its cache + * - mds.c receives the fragment_notify message + * + * So we need to ensure replicas have received the notify, then unlock + * the dirfragtreelock. 
+ */ + notify->mark_ack_wanted(); + info.notify_ack_waiting.insert(p.first); + } // freshly replicate new dirs to peers for (list::iterator q = info.resultfrags.begin(); @@ -11523,10 +11595,8 @@ // journal commit EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits); - mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, - info.resultfrags)); + mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr)); - mds->locker->drop_locks(mdr.get()); // unfreeze resulting frags for (list::iterator p = info.resultfrags.begin(); @@ -11546,22 +11616,26 @@ dir->unfreeze_dir(); } - fragments.erase(it); - request_finish(mdr); + if (info.notify_ack_waiting.empty()) { + fragment_drop_locks(info); + } else { + mds->locker->drop_locks_for_fragment_unfreeze(mdr.get()); + } } -void MDCache::_fragment_committed(dirfrag_t basedirfrag, list& resultfrags) +void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr) { dout(10) << "fragment_committed " << basedirfrag << dendl; - map::iterator it = uncommitted_fragments.find(basedirfrag); - assert(it != uncommitted_fragments.end()); - ufragment &uf = it->second; + if (mdr) + mdr->mark_event("commit logged"); + + ufragment &uf = uncommitted_fragments.at(basedirfrag); // remove old frags C_GatherBuilder gather( g_ceph_context, new C_OnFinisher( - new C_IO_MDC_FragmentFinish(this, basedirfrag, resultfrags), + new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr), mds->finisher)); SnapContext nullsnapc; @@ -11589,16 +11663,50 @@ gather.activate(); } -void MDCache::_fragment_finish(dirfrag_t basedirfrag, list& resultfrags) +void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr) { - dout(10) << "fragment_finish " << basedirfrag << "resultfrags.size=" - << resultfrags.size() << dendl; - map::iterator it = uncommitted_fragments.find(basedirfrag); - assert(it != uncommitted_fragments.end()); - 
ufragment &uf = it->second; + dout(10) << "fragment_old_purged " << basedirfrag << dendl; + if (mdr) + mdr->mark_event("old frags purged"); + + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits); + mds->mdlog->start_submit_entry(le); + + finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH); + + if (mds->logger) { + if (bits > 0) { + mds->logger->inc(l_mds_dir_split); + } else { + mds->logger->inc(l_mds_dir_merge); + } + } + + if (mdr) { + auto it = fragments.find(basedirfrag); + ceph_assert(it != fragments.end()); + it->second.finishing = true; + if (it->second.notify_ack_waiting.empty()) + fragment_maybe_finish(it); + else + mdr->mark_event("wating for notify acks"); + } +} + +void MDCache::fragment_drop_locks(fragment_info_t& info) +{ + mds->locker->drop_locks(info.mdr.get()); + request_finish(info.mdr); + //info.mdr.reset(); +} + +void MDCache::fragment_maybe_finish(const fragment_info_iterator& it) +{ + if (!it->second.finishing) + return; // unmark & auth_unpin - for (const auto &dir : resultfrags) { + for (const auto &dir : it->second.resultfrags) { dir->state_clear(CDir::STATE_FRAGMENTING); dir->auth_unpin(this); @@ -11609,24 +11717,41 @@ mds->balancer->maybe_fragment(dir, false); } - if (mds->logger) { - if (resultfrags.size() > 1) { - mds->logger->inc(l_mds_dir_split); - } else { - mds->logger->inc(l_mds_dir_merge); - } + fragments.erase(it); +} + + +void MDCache::handle_fragment_notify_ack(MMDSFragmentNotifyAck *ack) +{ + dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl; + mds_rank_t from = mds_rank_t(ack->get_source().num()); + + if (mds->get_state() < MDSMap::STATE_ACTIVE) { + ack->put(); + return; } - EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, uf.bits); - mds->mdlog->start_submit_entry(le); + auto it = fragments.find(ack->get_base_dirfrag()); + if (it == fragments.end() || + it->second.get_tid() != ack->get_tid()) { + dout(10) << 
"handle_fragment_notify_ack obsolete message, dropping" << dendl; + ack->put(); + return; + } - finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH); + if (it->second.notify_ack_waiting.erase(from) && + it->second.notify_ack_waiting.empty()) { + fragment_drop_locks(it->second); + fragment_maybe_finish(it); + } + ack->put(); } /* This function DOES put the passed message before returning */ void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify) { dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl; + mds_rank_t from = mds_rank_t(notify->get_source().num()); if (mds->get_state() < MDSMap::STATE_REJOIN) { notify->put(); @@ -11661,13 +11786,18 @@ // add new replica dirs values bufferlist::iterator p = notify->basebl.begin(); while (!p.end()) - add_replica_dir(p, diri, mds_rank_t(notify->get_source().num()), waiters); + add_replica_dir(p, diri, from, waiters); mds->queue_waiters(waiters); } else { ceph_abort(); } + if (notify->is_ack_wanted()) { + auto ack = new MMDSFragmentNotifyAck(notify->get_base_dirfrag(), + notify->get_bits(), notify->get_tid()); + mds->send_message_mds(ack, from); + } notify->put(); } @@ -11730,14 +11860,7 @@ assert(diri); if (uf.committed) { - list frags; - diri->get_dirfrags_under(p->first.frag, frags); - for (list::iterator q = frags.begin(); q != frags.end(); ++q) { - CDir *dir = *q; - dir->auth_pin(this); - dir->state_set(CDir::STATE_FRAGMENTING); - } - _fragment_committed(p->first, frags); + _fragment_committed(p->first, MDRequestRef()); continue; } @@ -11809,16 +11932,10 @@ for (list::iterator q = old_frags.begin(); q != old_frags.end(); ++q) assert(!diri->dirfragtree.is_leaf(*q)); - for (list::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) { - CDir *dir = *q; - dir->auth_pin(this); - dir->state_set(CDir::STATE_FRAGMENTING); - } - mds->mdlog->submit_entry(le); uf.old_frags.swap(old_frags); - _fragment_committed(p->first, resultfrags); + 
_fragment_committed(p->first, MDRequestRef()); } } diff -Nru ceph-12.2.11/src/mds/MDCache.h ceph-12.2.12/src/mds/MDCache.h --- ceph-12.2.11/src/mds/MDCache.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/MDCache.h 2019-04-11 12:33:50.000000000 +0000 @@ -19,6 +19,7 @@ #include +#include "common/DecayCounter.h" #include "include/types.h" #include "include/filepath.h" #include "include/elist.h" @@ -68,6 +69,7 @@ struct MClientSnap; class MMDSFragmentNotify; +class MMDSFragmentNotifyAck; class ESubtreeMap; @@ -191,6 +193,9 @@ */ void notify_stray(CDentry *dn) { assert(dn->get_dir()->get_inode()->is_stray()); + if (dn->state_test(CDentry::STATE_PURGING)) + return; + stray_manager.eval_stray(dn); } @@ -400,7 +405,7 @@ void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat, snapid_t ofirst, snapid_t last, CInode *pin, bool cow_head); - void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1); + void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1, bool quota_change = false); void predirty_journal_parents(MutationRef mut, EMetaBlob *blob, CInode *in, CDir *parent, int flags, int linkunlink=0, @@ -651,7 +656,7 @@ void send_snaps(map& splits); Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds); void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq); - void try_reconnect_cap(CInode *in, Session *session); + Capability* try_reconnect_cap(CInode *in, Session *session); void export_remaining_imported_caps(); // cap imports. delayed snap parent opens. 
@@ -715,9 +720,9 @@ size_t get_cache_size() { return lru.lru_get_size(); } // trimming - bool trim(uint64_t count=0); + std::pair trim(uint64_t count=0); private: - void trim_lru(uint64_t count, map& expiremap); + std::pair trim_lru(uint64_t count, map& expiremap); bool trim_dentry(CDentry *dn, map& expiremap); void trim_dirfrag(CDir *dir, CDir *con, map& expiremap); @@ -755,8 +760,6 @@ void trim_client_leases(); void check_memory_usage(); - time last_recall_state; - // shutdown private: set shutdown_exporting_strays; @@ -1099,15 +1102,20 @@ list dirs; list resultfrags; MDRequestRef mdr; + set notify_ack_waiting; + bool finishing = false; + // for deadlock detection - bool all_frozen; + bool all_frozen = false; utime_t last_cum_auth_pins_change; - int last_cum_auth_pins; - int num_remote_waiters; // number of remote authpin waiters - fragment_info_t() : bits(0), all_frozen(false), last_cum_auth_pins(0), num_remote_waiters(0) {} + int last_cum_auth_pins = 0; + int num_remote_waiters = 0; // number of remote authpin waiters + fragment_info_t() {} bool is_fragmenting() { return !resultfrags.empty(); } + uint64_t get_tid() { return mdr ? 
mdr->reqid.tid : 0; } }; map fragments; + typedef map::iterator fragment_info_iterator; void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, list& frags, list& waiters, bool replay); @@ -1125,11 +1133,13 @@ void fragment_mark_and_complete(MDRequestRef& mdr); void fragment_frozen(MDRequestRef& mdr, int r); void fragment_unmark_unfreeze_dirs(list& dirs); + void fragment_drop_locks(fragment_info_t &info); + void fragment_maybe_finish(const fragment_info_iterator& it); void dispatch_fragment_dir(MDRequestRef& mdr); void _fragment_logged(MDRequestRef& mdr); void _fragment_stored(MDRequestRef& mdr); - void _fragment_committed(dirfrag_t f, list& resultfrags); - void _fragment_finish(dirfrag_t f, list& resultfrags); + void _fragment_committed(dirfrag_t f, const MDRequestRef& mdr); + void _fragment_old_purged(dirfrag_t f, int bits, const MDRequestRef& mdr); friend class EFragment; friend class C_MDC_FragmentFrozen; @@ -1137,14 +1147,19 @@ friend class C_MDC_FragmentPrep; friend class C_MDC_FragmentStore; friend class C_MDC_FragmentCommit; - friend class C_IO_MDC_FragmentFinish; + friend class C_IO_MDC_FragmentPurgeOld; void handle_fragment_notify(MMDSFragmentNotify *m); + void handle_fragment_notify_ack(MMDSFragmentNotifyAck *m); void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list& old_frag, LogSegment *ls, bufferlist *rollback=NULL); void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op); void rollback_uncommitted_fragment(dirfrag_t basedirfrag, list& old_frags); + + + DecayCounter trim_counter; + public: void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSInternalContextBase *c) { assert(uncommitted_fragments.count(dirfrag)); diff -Nru ceph-12.2.11/src/mds/MDSDaemon.cc ceph-12.2.12/src/mds/MDSDaemon.cc --- ceph-12.2.11/src/mds/MDSDaemon.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/MDSDaemon.cc 2019-04-11 12:33:50.000000000 +0000 @@ -373,6 +373,7 @@ "mds_health_cache_threshold", "mds_cache_mid", 
"mds_dump_cache_threshold_formatter", + "mds_cache_trim_decay_rate", "mds_dump_cache_threshold_file", // MDBalancer "mds_bal_fragment_interval", @@ -386,8 +387,10 @@ "mds_inject_migrator_message_loss", "host", "fsid", - "mds_request_load_average_decay_rate", "mds_cap_revoke_eviction_timeout", + // SessionMap + "mds_request_load_average_decay_rate", + "mds_recall_max_decay_rate", NULL }; return KEYS; @@ -1356,7 +1359,7 @@ // It doesn't go into a SessionMap instance until it sends an explicit // request to open a session (initial state of Session is `closed`) if (!s) { - s = new Session; + s = new Session(nullptr); s->info.auth_name = name; s->info.inst.addr = con->get_peer_addr(); s->info.inst.name = n; diff -Nru ceph-12.2.11/src/mds/MDSDaemon.h ceph-12.2.12/src/mds/MDSDaemon.h --- ceph-12.2.11/src/mds/MDSDaemon.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/MDSDaemon.h 2019-04-11 12:33:50.000000000 +0000 @@ -29,7 +29,7 @@ #include "MDSMap.h" #include "MDSRank.h" -#define CEPH_MDS_PROTOCOL 31 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 34 /* cluster internal */ class AuthAuthorizeHandlerRegistry; class Message; diff -Nru ceph-12.2.11/src/mds/MDSRank.cc ceph-12.2.12/src/mds/MDSRank.cc --- ceph-12.2.11/src/mds/MDSRank.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/MDSRank.cc 2019-04-11 12:33:50.000000000 +0000 @@ -248,7 +248,8 @@ Formatter *f, Context *on_finish) : MDSInternalContext(mds), server(server), mdcache(mdcache), mdlog(mdlog), - recall_timeout(recall_timeout), f(f), on_finish(on_finish), + recall_timeout(recall_timeout), recall_start(mono_clock::now()), + f(f), on_finish(on_finish), whoami(mds->whoami), incarnation(mds->incarnation) { } @@ -258,6 +259,7 @@ assert(mds->mds_lock.is_locked()); dout(20) << __func__ << dendl; + f->open_object_section("result"); recall_client_state(); } @@ -316,25 +318,42 @@ void recall_client_state() { dout(20) << __func__ << dendl; - - f->open_object_section("result"); + auto now = 
mono_clock::now(); + auto duration = std::chrono::duration(now-recall_start).count(); MDSGatherBuilder *gather = new MDSGatherBuilder(g_ceph_context); - server->recall_client_state(1.0, true, gather); - if (!gather->has_subs()) { - handle_recall_client_state(0); - delete gather; - return; + auto result = server->recall_client_state(gather, Server::RecallFlags::STEADY); + auto& throttled = result.first; + auto& count = result.second; + dout(10) << __func__ + << (throttled ? " (throttled)" : "") + << " recalled " << count << " caps" << dendl; + + caps_recalled += count; + if ((throttled || count > 0) && (recall_timeout == 0 || duration < recall_timeout)) { + auto timer = new FunctionContext([this](int _) { + recall_client_state(); + }); + mds->timer.add_event_after(1.0, timer); + } else { + if (!gather->has_subs()) { + delete gather; + return handle_recall_client_state(0); + } else if (recall_timeout > 0 && duration > recall_timeout) { + delete gather; + return handle_recall_client_state(-ETIMEDOUT); + } else { + uint64_t remaining = (recall_timeout == 0 ? 
0 : recall_timeout-duration); + C_ContextTimeout *ctx = new C_ContextTimeout( + mds, remaining, new FunctionContext([this](int r) { + handle_recall_client_state(r); + })); + + ctx->start_timer(); + gather->set_finisher(new MDSInternalContextWrapper(mds, ctx)); + gather->activate(); + } } - - C_ContextTimeout *ctx = new C_ContextTimeout( - mds, recall_timeout, new FunctionContext([this](int r) { - handle_recall_client_state(r); - })); - - ctx->start_timer(); - gather->set_finisher(new MDSInternalContextWrapper(mds, ctx)); - gather->activate(); } void handle_recall_client_state(int r) { @@ -344,21 +363,10 @@ f->open_object_section("client_recall"); f->dump_int("return_code", r); f->dump_string("message", cpp_strerror(r)); + f->dump_int("recalled", caps_recalled); f->close_section(); // we can still continue after recall timeout - trim_cache(); - } - - void trim_cache() { - dout(20) << __func__ << dendl; - - if (!mdcache->trim(UINT64_MAX)) { - cmd_err(f, "failed to trim cache"); - complete(-EINVAL); - return; - } - flush_journal(); } @@ -388,15 +396,38 @@ f->dump_string("message", ss.str()); f->close_section(); - cache_status(); + trim_cache(); + } + + void trim_cache() { + dout(20) << __func__ << dendl; + + auto p = mdcache->trim(UINT64_MAX); + auto& throttled = p.first; + auto& count = p.second; + dout(10) << __func__ + << (throttled ? 
" (throttled)" : "") + << " trimmed " << count << " caps" << dendl; + dentries_trimmed += count; + if (throttled && count > 0) { + auto timer = new FunctionContext([this](int _) { + trim_cache(); + }); + mds->timer.add_event_after(1.0, timer); + } else { + cache_status(); + } } void cache_status() { dout(20) << __func__ << dendl; + f->open_object_section("trim_cache"); + f->dump_int("trimmed", dentries_trimmed); + f->close_section(); + // cache status section mdcache->cache_status(f); - f->close_section(); complete(0); } @@ -404,6 +435,10 @@ void finish(int r) override { dout(20) << __func__ << ": r=" << r << dendl; + auto d = std::chrono::duration(mono_clock::now()-recall_start); + f->dump_float("duration", d.count()); + + f->close_section(); on_finish->complete(r); } @@ -411,11 +446,14 @@ MDCache *mdcache; MDLog *mdlog; uint64_t recall_timeout; + mono_time recall_start; Formatter *f; Context *on_finish; int retval = 0; std::stringstream ss; + uint64_t caps_recalled = 0; + uint64_t dentries_trimmed = 0; // so as to use dout mds_rank_t whoami; @@ -648,6 +686,7 @@ sessionmap.update_average_session_age(); if (is_active() || is_stopping()) { + server->recall_client_state(nullptr, Server::RecallFlags::ENFORCE_MAX); mdcache->trim(); mdcache->trim_client_leases(); mdcache->check_memory_usage(); @@ -1474,27 +1513,27 @@ MDSGatherBuilder gather(g_ceph_context, new C_MDS_BootStart(this, MDS_BOOT_OPEN_ROOT)); - dout(2) << "boot_start " << step << ": opening inotable" << dendl; + dout(2) << "Booting: " << step << ": opening inotable" << dendl; inotable->set_rank(whoami); inotable->load(gather.new_sub()); - dout(2) << "boot_start " << step << ": opening sessionmap" << dendl; + dout(2) << "Booting: " << step << ": opening sessionmap" << dendl; sessionmap.set_rank(whoami); sessionmap.load(gather.new_sub()); - dout(2) << "boot_start " << step << ": opening mds log" << dendl; + dout(2) << "Booting: " << step << ": opening mds log" << dendl; mdlog->open(gather.new_sub()); if 
(is_starting()) { - dout(2) << "boot_start " << step << ": opening purge queue" << dendl; + dout(2) << "Booting: " << step << ": opening purge queue" << dendl; purge_queue.open(new C_IO_Wrapper(this, gather.new_sub())); } else if (!standby_replaying) { - dout(2) << "boot_start " << step << ": opening purge queue (async)" << dendl; + dout(2) << "Booting: " << step << ": opening purge queue (async)" << dendl; purge_queue.open(NULL); } if (mdsmap->get_tableserver() == whoami) { - dout(2) << "boot_start " << step << ": opening snap table" << dendl; + dout(2) << "Booting: " << step << ": opening snap table" << dendl; snapserver->set_rank(whoami); snapserver->load(gather.new_sub()); } @@ -1504,7 +1543,7 @@ break; case MDS_BOOT_OPEN_ROOT: { - dout(2) << "boot_start " << step << ": loading/discovering base inodes" << dendl; + dout(2) << "Booting: " << step << ": loading/discovering base inodes" << dendl; MDSGatherBuilder gather(g_ceph_context, new C_MDS_BootStart(this, MDS_BOOT_PREPARE_LOG)); @@ -1527,19 +1566,19 @@ break; case MDS_BOOT_PREPARE_LOG: if (is_any_replay()) { - dout(2) << "boot_start " << step << ": replaying mds log" << dendl; + dout(2) << "Booting: " << step << ": replaying mds log" << dendl; MDSGatherBuilder gather(g_ceph_context, new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE)); if (!standby_replaying) { - dout(2) << "boot_start " << step << ": waiting for purge queue recovered" << dendl; + dout(2) << "Booting: " << step << ": waiting for purge queue recovered" << dendl; purge_queue.wait_for_recovery(new C_IO_Wrapper(this, gather.new_sub())); } mdlog->replay(gather.new_sub()); gather.activate(); } else { - dout(2) << "boot_start " << step << ": positioning at end of old mds log" << dendl; + dout(2) << "Booting: " << step << ": positioning at end of old mds log" << dendl; mdlog->append(); starting_done(); } @@ -1823,6 +1862,7 @@ { dout(1) << "rejoin_start" << dendl; mdcache->rejoin_start(new C_MDS_VoidFn(this, &MDSRank::rejoin_done)); + 
finish_contexts(g_ceph_context, waiting_for_rejoin); } void MDSRank::rejoin_done() { @@ -1984,7 +2024,7 @@ void MDSRank::stopping_start() { - dout(2) << "stopping_start" << dendl; + dout(2) << "Stopping..." << dendl; if (mdsmap->get_num_in_mds() == 1 && !sessionmap.empty()) { // we're the only mds up! @@ -1997,7 +2037,7 @@ void MDSRank::stopping_done() { - dout(2) << "stopping_done" << dendl; + dout(2) << "Finished stopping..." << dendl; // tell monitor we shut down cleanly. request_state(MDSMap::STATE_STOPPED); @@ -3072,11 +3112,20 @@ return false; } + auto& addr = session->info.inst.addr; + { + std::stringstream ss; + ss << "Evicting " << (blacklist ? "(and blacklisting) " : "") + << "client session " << session_id << " (" << addr << ")"; + dout(1) << ss.str() << dendl; + clog->info() << ss.str(); + } + dout(4) << "Preparing blacklist command... (wait=" << wait << ")" << dendl; stringstream ss; ss << "{\"prefix\":\"osd blacklist\", \"blacklistop\":\"add\","; ss << "\"addr\":\""; - ss << session->info.inst.addr; + ss << addr; ss << "\"}"; std::string tmp = ss.str(); std::vector cmd = {tmp}; diff -Nru ceph-12.2.11/src/mds/MDSRank.h ceph-12.2.12/src/mds/MDSRank.h --- ceph-12.2.11/src/mds/MDSRank.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/MDSRank.h 2019-04-11 12:33:50.000000000 +0000 @@ -269,7 +269,8 @@ ceph_tid_t last_tid; // for mds-initiated requests (e.g. 
stray rename) - list waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve; + list waiting_for_active, waiting_for_replay, waiting_for_rejoin, + waiting_for_reconnect, waiting_for_resolve; list waiting_for_any_client_connection; list replay_queue; map > waiting_for_active_peer; @@ -409,6 +410,9 @@ void wait_for_replay(MDSInternalContextBase *c) { waiting_for_replay.push_back(c); } + void wait_for_rejoin(MDSInternalContextBase *c) { + waiting_for_rejoin.push_back(c); + } void wait_for_reconnect(MDSInternalContextBase *c) { waiting_for_reconnect.push_back(c); } diff -Nru ceph-12.2.11/src/mds/Migrator.cc ceph-12.2.12/src/mds/Migrator.cc --- ceph-12.2.11/src/mds/Migrator.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/Migrator.cc 2019-04-11 12:33:50.000000000 +0000 @@ -2128,10 +2128,9 @@ bool need_issue = false; for (auto& p : in->get_client_caps()) { Capability *cap = p.second; - if (cap->is_stale()) { - mds->locker->revoke_stale_caps(cap); - } else { + if (!cap->is_stale()) { need_issue = true; + break; } } if (need_issue && diff -Nru ceph-12.2.11/src/mds/Server.cc ceph-12.2.12/src/mds/Server.cc --- ceph-12.2.11/src/mds/Server.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/Server.cc 2019-04-11 12:33:50.000000000 +0000 @@ -17,6 +17,7 @@ #include #include +#include #include "MDSRank.h" #include "Server.h" @@ -58,6 +59,7 @@ #include "osd/OSDMap.h" #include +#include #include #include @@ -199,7 +201,8 @@ reconnect_done(NULL), failed_reconnects(0), reconnect_evicting(false), - terminating_sessions(false) + terminating_sessions(false), + recall_throttle(ceph_clock_now(), g_conf->get_val("mds_recall_max_decay_rate")) { } @@ -354,8 +357,7 @@ m->put(); return; } - assert(session->is_closed() || - session->is_closing()); + assert(session->is_closed() || session->is_closing()); if (mds->is_stopping()) { dout(10) << "mds is stopping, dropping open req" << dendl; @@ -368,50 +370,82 @@ return 
osd_map.is_blacklisted(session->info.inst.addr); }); - if (blacklisted) { - dout(10) << "rejecting blacklisted client " << session->info.inst.addr << dendl; - mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session); - m->put(); - return; - } - - session->set_client_metadata(m->client_meta); - dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN " - << session->info.client_metadata.size() << " metadata entries:" << dendl; - for (map::iterator i = session->info.client_metadata.begin(); - i != session->info.client_metadata.end(); ++i) { - dout(20) << " " << i->first << ": " << i->second << dendl; - } - - // Special case for the 'root' metadata path; validate that the claimed - // root is actually within the caps of the session - if (session->info.client_metadata.count("root")) { - const auto claimed_root = session->info.client_metadata.at("root"); - // claimed_root has a leading "/" which we strip before passing - // into caps check - if (claimed_root.empty() || claimed_root[0] != '/' || - !session->auth_caps.path_capable(claimed_root.substr(1))) { - derr << __func__ << " forbidden path claimed as mount root: " - << claimed_root << " by " << m->get_source() << dendl; - // Tell the client we're rejecting their open + { + auto& addr = session->info.inst.addr; + session->set_client_metadata(m->client_meta); + auto& client_metadata = session->info.client_metadata; + + auto log_session_status = [this, m = m->get(), session](boost::string_view status, boost::string_view err) { + auto now = ceph_clock_now(); + auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp(); + auto elapsed = now - m->get_recv_stamp(); + std::stringstream ss; + ss << "New client session:" + << " addr=\"" << session->info.inst.addr << "\"" + << ",elapsed=" << elapsed + << ",throttled=" << throttle_elapsed + << ",status=\"" << status << "\""; + if (!err.empty()) { + ss << ",error=\"" << err << "\""; + } + const auto& metadata = session->info.client_metadata; + 
auto it = metadata.find("root"); + if (it != metadata.end()) { + ss << ",root=\"" << it->second << "\""; + } + dout(2) << ss.str() << dendl; + m->put(); + }; + + if (blacklisted) { + dout(10) << "rejecting blacklisted client " << addr << dendl; + log_session_status("REJECTED", "blacklisted"); mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session); - mds->clog->warn() << "client session with invalid root '" << - claimed_root << "' denied (" << session->info.inst << ")"; - session->clear(); - // Drop out; don't record this session in SessionMap or journal it. - break; + m->put(); + return; + } + + dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN " + << session->info.client_metadata.size() << " metadata entries:" << dendl; + for (map::iterator i = session->info.client_metadata.begin(); + i != session->info.client_metadata.end(); ++i) { + dout(20) << " " << i->first << ": " << i->second << dendl; + } + + // Special case for the 'root' metadata path; validate that the claimed + // root is actually within the caps of the session + if (session->info.client_metadata.count("root")) { + const auto claimed_root = session->info.client_metadata.at("root"); + // claimed_root has a leading "/" which we strip before passing + // into caps check + if (claimed_root.empty() || claimed_root[0] != '/' || + !session->auth_caps.path_capable(claimed_root.substr(1))) { + derr << __func__ << " forbidden path claimed as mount root: " + << claimed_root << " by " << m->get_source() << dendl; + // Tell the client we're rejecting their open + log_session_status("REJECTED", "invalid root"); + mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session); + mds->clog->warn() << "client session with invalid root '" << + claimed_root << "' denied (" << session->info.inst << ")"; + session->clear(); + // Drop out; don't record this session in SessionMap or journal it. 
+ break; + } } - } - if (session->is_closed()) - mds->sessionmap.add_session(session); + if (session->is_closed()) + mds->sessionmap.add_session(session); - pv = mds->sessionmap.mark_projected(session); - sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING); - mds->sessionmap.touch_session(session); - mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, m->client_meta), - new C_MDS_session_finish(this, session, sseq, true, pv)); - mdlog->flush(); + pv = mds->sessionmap.mark_projected(session); + sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING); + mds->sessionmap.touch_session(session); + auto fin = new FunctionContext([log_session_status = std::move(log_session_status)](int r){ + assert(r == 0); + log_session_status("ACCEPTED", ""); + }); + mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata), + new C_MDS_session_finish(this, session, sseq, true, pv, fin)); + } break; case CEPH_SESSION_REQUEST_RENEWCAPS: @@ -546,7 +580,7 @@ Capability *cap = session->caps.front(); CInode *in = cap->get_inode(); dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl; - mds->locker->remove_client_cap(in, session->info.inst.name.num()); + mds->locker->remove_client_cap(in, cap); } while (!session->leases.empty()) { ClientLease *r = session->leases.front(); @@ -698,7 +732,7 @@ void Server::terminate_sessions() { - dout(2) << "terminate_sessions" << dendl; + dout(5) << "terminating all sessions..." 
<< dendl; terminating_sessions = true; @@ -853,6 +887,9 @@ dout(20) << __func__ << " cap revoke eviction timeout changed to " << cap_revoke_eviction_timeout << dendl; } + if (changed.count("mds_recall_max_decay_rate")) { + recall_throttle = DecayCounter(ceph_clock_now(), g_conf->get_val("mds_recall_max_decay_rate")); + } } /* @@ -1186,62 +1223,125 @@ } } - /** * Call this when the MDCache is oversized, to send requests to the clients * to trim some caps, and consequently unpin some inodes in the MDCache so * that it can trim too. */ -void Server::recall_client_state(double ratio, bool flush_client_session, - MDSGatherBuilder *gather) { - if (flush_client_session) { - assert(gather != nullptr); - } - - /* try to recall at least 80% of all caps */ - uint64_t max_caps_per_client = Capability::count() * g_conf->get_val("mds_max_ratio_caps_per_client"); - uint64_t min_caps_per_client = g_conf->get_val("mds_min_caps_per_client"); - if (max_caps_per_client < min_caps_per_client) { - dout(0) << "max_caps_per_client " << max_caps_per_client - << " < min_caps_per_client " << min_caps_per_client << dendl; - max_caps_per_client = min_caps_per_client + 1; - } - - /* unless this ratio is smaller: */ - /* ratio: determine the amount of caps to recall from each client. Use - * percentage full over the cache reservation. Cap the ratio at 80% of client - * caps. 
*/ - if (ratio < 0.0) - ratio = 1.0 - fmin(0.80, mdcache->cache_toofull_ratio()); - - dout(10) << __func__ << ": ratio=" << ratio << ", caps per client " - << min_caps_per_client << "-" << max_caps_per_client << dendl; - - set sessions; - mds->sessionmap.get_client_session_set(sessions); +std::pair Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags) +{ + const auto now = clock::now(); + const bool steady = flags&RecallFlags::STEADY; + const bool enforce_max = flags&RecallFlags::ENFORCE_MAX; + + const auto max_caps_per_client = g_conf->get_val("mds_max_caps_per_client"); + const auto min_caps_per_client = g_conf->get_val("mds_min_caps_per_client"); + const auto recall_global_max_decay_threshold = g_conf->get_val("mds_recall_global_max_decay_threshold"); + const auto recall_max_caps = g_conf->get_val("mds_recall_max_caps"); + const auto recall_max_decay_threshold = g_conf->get_val("mds_recall_max_decay_threshold"); + + dout(7) << __func__ << ":" + << " min=" << min_caps_per_client + << " max=" << max_caps_per_client + << " total=" << Capability::count() + << " flags=0x" << std::hex << flags + << dendl; - for (auto &session : sessions) { + /* trim caps of sessions with the most caps first */ + std::multimap caps_session; + auto f = [&caps_session, enforce_max, max_caps_per_client](Session* s) { + auto num_caps = s->caps.size(); + if (!enforce_max || num_caps > max_caps_per_client) { + caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s)); + } + }; + mds->sessionmap.get_client_sessions(std::move(f)); + + std::pair result = {false, 0}; + auto& throttled = result.first; + auto& caps_recalled = result.second; + last_recall_state = now; + for (const auto p : boost::adaptors::reverse(caps_session)) { + auto& num_caps = p.first; + auto& session = p.second; if (!session->is_open() || !session->connection.get() || !session->info.inst.name.is_client()) continue; - dout(10) << " session " << 
session->info.inst - << " caps " << session->caps.size() + dout(10) << __func__ << ":" + << " session " << session->info.inst + << " caps " << num_caps << ", leases " << session->leases.size() << dendl; - uint64_t newlim = MAX(MIN((session->caps.size() * ratio), max_caps_per_client), min_caps_per_client); - if (session->caps.size() > newlim) { - MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE); + uint64_t newlim; + if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) { + newlim = min_caps_per_client; + } else { + newlim = num_caps-recall_max_caps; + } + if (num_caps > newlim) { + /* now limit the number of caps we recall at a time to prevent overloading ourselves */ + uint64_t recall = std::min(recall_max_caps, num_caps-newlim); + newlim = num_caps-recall; + const uint64_t session_recall_throttle = session->get_recall_caps_throttle(); + const uint64_t global_recall_throttle = recall_throttle.get(ceph_clock_now()); + if (session_recall_throttle+recall > recall_max_decay_threshold) { + dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl; + throttled = true; + continue; + } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) { + dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" 
<< dendl; + throttled = true; + break; + } + + // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall + if (steady) { + const auto session_recall = session->get_recall_caps(); + const auto session_release = session->get_release_caps(); + if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) { + /* The session has been unable to keep up with the number of caps + * recalled (by half); additionally, to prevent marking sessions + * we've just begun to recall from, the session_recall counter + * (decayed count of caps recently recalled) is **greater** than the + * session threshold for the session's cap recall throttle. + */ + dout(15) << " 2*session_release < session_recall" + " (2*" << session_release << " < " << session_recall << ");" + " Skipping because we are unlikely to get more released." << dendl; + continue; + } else if (recall < recall_max_caps && 2*recall < session_recall) { + /* The number of caps recalled is less than the number we *could* + * recall (so there isn't much left to recall?) and the number of + * caps is less than the current recall_caps counter (decayed count + * of caps recently recalled). + */ + dout(15) << " 2*recall < session_recall " + " (2*" << recall << " < " << session_recall << ") &&" + " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");" + " Skipping because we are unlikely to get more released." 
<< dendl; + continue; + } + } + + dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl; + + auto m = new MClientSession(CEPH_SESSION_RECALL_STATE); m->head.max_caps = newlim; mds->send_message_client(m, session); - if (flush_client_session) { + if (gather) { flush_session(session, gather); } - session->notify_recall_sent(newlim); + caps_recalled += session->notify_recall_sent(newlim); + recall_throttle.hit(ceph_clock_now(), recall); } } + + dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl; + + return result; } void Server::force_clients_readonly() @@ -3676,9 +3776,6 @@ } // create inode. - SnapRealm *realm = diri->find_snaprealm(); // use directory's realm; inode isn't attached yet. - snapid_t follows = realm->get_newest_seq(); - CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), req->head.args.open.mode | S_IFREG, &layout); assert(in); @@ -3690,15 +3787,25 @@ if (layout.pool_id != mdcache->default_file_layout.pool_id) in->inode.add_old_pool(mdcache->default_file_layout.pool_id); in->inode.update_backtrace(); - if (cmode & CEPH_FILE_MODE_WR) { + in->inode.rstat.rfiles = 1; + + SnapRealm *realm = diri->find_snaprealm(); + snapid_t follows = realm->get_newest_seq(); + + ceph_assert(dn->first == follows+1); + in->first = dn->first; + + // do the open + Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay()); + in->authlock.set_state(LOCK_EXCL); + in->xattrlock.set_state(LOCK_EXCL); + + if (cap && (cmode & CEPH_FILE_MODE_WR)) { in->inode.client_ranges[client].range.first = 0; in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment(); in->inode.client_ranges[client].follows = follows; + cap->mark_clientwriteable(); } - in->inode.rstat.rfiles = 1; - - assert(dn->first == follows+1); - in->first = dn->first; 
// prepare finisher mdr->ls = mdlog->get_current_segment(); @@ -3709,11 +3816,6 @@ mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); le->metablob.add_primary_dentry(dn, in, true, true, true); - // do the open - mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay()); - in->authlock.set_state(LOCK_EXCL); - in->xattrlock.set_state(LOCK_EXCL); - // make sure this inode gets into the journal le->metablob.add_opened_ino(in->ino()); LogSegment *ls = mds->mdlog->get_current_segment(); @@ -4278,7 +4380,7 @@ // adjust client's max_size? CInode::mempool_inode::client_range_map new_ranges; bool max_increased = false; - mds->locker->calc_new_client_ranges(cur, pi.inode.size, &new_ranges, &max_increased); + mds->locker->calc_new_client_ranges(cur, pi.inode.size, true, &new_ranges, &max_increased); if (pi.inode.client_ranges != new_ranges) { dout(10) << " client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl; pi.inode.client_ranges = new_ranges; @@ -4316,7 +4418,7 @@ dout(10) << "do_open_truncate " << *in << dendl; SnapRealm *realm = in->find_snaprealm(); - mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay()); + Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay()); mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "open_truncate"); @@ -4337,11 +4439,12 @@ } bool changed_ranges = false; - if (cmode & CEPH_FILE_MODE_WR) { + if (cap && (cmode & CEPH_FILE_MODE_WR)) { pi.inode.client_ranges[client].range.first = 0; pi.inode.client_ranges[client].range.last = pi.inode.get_layout_size_increment(); pi.inode.client_ranges[client].follows = in->find_snaprealm()->get_newest_seq(); changed_ranges = true; + cap->mark_clientwriteable(); } le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid()); @@ -4827,7 +4930,7 @@ pip = &pi.inode; client_t 
exclude_ct = mdr->get_client(); - mdcache->broadcast_quota_to_client(cur, exclude_ct); + mdcache->broadcast_quota_to_client(cur, exclude_ct, true); } else if (name.find("ceph.dir.pin") == 0) { if (!cur->is_dir() || cur->is_root()) { respond_to_request(mdr, -EINVAL); @@ -5223,11 +5326,6 @@ // if the client created a _regular_ file via MKNOD, it's highly likely they'll // want to write to it (e.g., if they are reexporting NFS) if (S_ISREG(newi->inode.mode)) { - dout(15) << " setting a client_range too, since this is a regular file" << dendl; - newi->inode.client_ranges[client].range.first = 0; - newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment(); - newi->inode.client_ranges[client].follows = follows; - // issue a cap on the file int cmode = CEPH_FILE_MODE_RDWR; Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay()); @@ -5238,6 +5336,12 @@ newi->filelock.set_state(LOCK_EXCL); newi->authlock.set_state(LOCK_EXCL); newi->xattrlock.set_state(LOCK_EXCL); + + dout(15) << " setting a client_range too, since this is a regular file" << dendl; + newi->inode.client_ranges[client].range.first = 0; + newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment(); + newi->inode.client_ranges[client].follows = follows; + cap->mark_clientwriteable(); } } diff -Nru ceph-12.2.11/src/mds/Server.h ceph-12.2.12/src/mds/Server.h --- ceph-12.2.11/src/mds/Server.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/Server.h 2019-04-11 12:33:50.000000000 +0000 @@ -17,6 +17,8 @@ #include +#include + #include "MDSRank.h" #include "Mutation.h" @@ -121,6 +123,9 @@ void dump_reconnect_status(Formatter *f) const; void handle_client_session(class MClientSession *m); + time last_recalled() const { + return last_recall_state; + } void _session_logged(Session *session, uint64_t state_seq, bool open, version_t pv, interval_set& inos,version_t piv); version_t prepare_force_open_sessions(map 
&cm, @@ -141,8 +146,12 @@ void reconnect_tick(); void recover_filelocks(CInode *in, bufferlist locks, int64_t client); - void recall_client_state(double ratio, bool flush_client_session, - MDSGatherBuilder *gather); + enum RecallFlags { + NONE = 0, + STEADY = (1<<0), + ENFORCE_MAX = (1<<1), + }; + std::pair recall_client_state(MDSGatherBuilder* gather, enum RecallFlags=RecallFlags::NONE); void force_clients_readonly(); // -- requests -- @@ -323,6 +332,9 @@ private: void reply_client_request(MDRequestRef& mdr, MClientReply *reply); void flush_session(Session *session, MDSGatherBuilder *gather); + + DecayCounter recall_throttle; + time last_recall_state; }; #endif diff -Nru ceph-12.2.11/src/mds/SessionMap.cc ceph-12.2.12/src/mds/SessionMap.cc --- ceph-12.2.11/src/mds/SessionMap.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/SessionMap.cc 2019-04-11 12:33:50.000000000 +0000 @@ -546,8 +546,8 @@ ::decode(n, p); while (n-- && !p.end()) { - bufferlist::iterator p2 = p; - Session *s = new Session; + auto p2 = p; + Session *s = new Session(ConnectionRef()); s->info.decode(p); if (session_map.count(s->info.inst.name)) { // eager client connected too fast! aie. 
@@ -859,11 +859,8 @@ */ void Session::notify_cap_release(size_t n_caps) { - if (recalled_at != time::min()) { - recall_release_count += n_caps; - if (recall_release_count >= recall_count) - clear_recalled_at(); - } + recall_caps.hit(ceph_clock_now(), -(double)n_caps); + release_caps.hit(ceph_clock_now(), n_caps); } /** @@ -872,32 +869,26 @@ * in order to generate health metrics if the session doesn't see * a commensurate number of calls to ::notify_cap_release */ -void Session::notify_recall_sent(const size_t new_limit) +uint64_t Session::notify_recall_sent(size_t new_limit) { - if (recalled_at == time::min()) { - // Entering recall phase, set up counters so we can later - // judge whether the client has respected the recall request - recalled_at = last_recall_sent = clock::now(); - assert (new_limit < caps.size()); // Behaviour of Server::recall_client_state - recall_count = caps.size() - new_limit; - recall_release_count = 0; + const auto num_caps = caps.size(); + ceph_assert(new_limit < num_caps); // Behaviour of Server::recall_client_state + const auto count = num_caps-new_limit; + uint64_t new_change; + if (recall_limit != new_limit) { + new_change = count; } else { - last_recall_sent = clock::now(); + new_change = 0; /* no change! */ } -} - -void Session::clear_recalled_at() -{ - recalled_at = last_recall_sent = time::min(); - recall_count = 0; - recall_release_count = 0; -} -void Session::set_client_metadata(map const &meta) -{ - info.client_metadata = meta; - - _update_human_name(); + /* Always hit the session counter as a RECALL message is still sent to the + * client and we do not want the MDS to burn its global counter tokens on a + * session that is not releasing caps (i.e. allow the session counter to + * throttle future RECALL messages). 
+ */ + recall_caps_throttle.hit(ceph_clock_now(), count); + recall_caps.hit(ceph_clock_now(), count); + return new_change; } /** @@ -990,23 +981,58 @@ } void SessionMap::handle_conf_change(const struct md_config_t *conf, - const std::set &changed) { + const std::set &changed) +{ + if (changed.count("mds_request_load_average_decay_rate")) { - decay_rate = conf->get_val("mds_request_load_average_decay_rate"); - dout(20) << __func__ << " decay rate changed to " << decay_rate << dendl; + auto d = g_conf->get_val("mds_request_load_average_decay_rate"); + dout(20) << __func__ << " decay rate changed to " << d << dendl; - total_load_avg_rate = DecayRate(decay_rate); + decay_rate = d; + total_load_avg = DecayCounter(ceph_clock_now(), d); - auto p = by_state.find(Session::STATE_OPEN); - if (p != by_state.end()) { - for (const auto &session : *(p->second)) { - session->set_load_avg_decay_rate(decay_rate); + auto it = by_state.find(Session::STATE_OPEN); + if (it != by_state.end()) { + for (const auto &session : *(it->second)) { + session->set_load_avg_decay_rate(d); + } + } + it = by_state.find(Session::STATE_STALE); + if (it != by_state.end()) { + for (const auto &session : *(it->second)) { + session->set_load_avg_decay_rate(d); + } + } + } + if (changed.count("mds_recall_max_decay_rate")) { + auto d = g_conf->get_val("mds_recall_max_decay_rate"); + auto it = by_state.find(Session::STATE_OPEN); + if (it != by_state.end()) { + for (const auto &session : *(it->second)) { + session->recall_caps_throttle = DecayCounter(ceph_clock_now(), d); + } + } + it = by_state.find(Session::STATE_STALE); + if (it != by_state.end()) { + for (const auto &session : *(it->second)) { + session->recall_caps_throttle = DecayCounter(ceph_clock_now(), d); + } + } + } + if (changed.count("mds_recall_warning_decay_rate")) { + auto d = g_conf->get_val("mds_recall_warning_decay_rate"); + auto it = by_state.find(Session::STATE_OPEN); + if (it != by_state.end()) { + for (const auto &session : 
*(it->second)) { + session->recall_caps = DecayCounter(ceph_clock_now(), d); + session->release_caps = DecayCounter(ceph_clock_now(), d); } } - p = by_state.find(Session::STATE_STALE); - if (p != by_state.end()) { - for (const auto &session : *(p->second)) { - session->set_load_avg_decay_rate(decay_rate); + it = by_state.find(Session::STATE_STALE); + if (it != by_state.end()) { + for (const auto &session : *(it->second)) { + session->recall_caps = DecayCounter(ceph_clock_now(), d); + session->release_caps = DecayCounter(ceph_clock_now(), d); } } } diff -Nru ceph-12.2.11/src/mds/SessionMap.h ceph-12.2.12/src/mds/SessionMap.h --- ceph-12.2.11/src/mds/SessionMap.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/SessionMap.h 2019-04-11 12:33:50.000000000 +0000 @@ -27,10 +27,10 @@ #include "mdstypes.h" #include "mds/MDSAuthCaps.h" #include "common/perf_counters.h" +#include "common/DecayCounter.h" class CInode; struct MDRequestImpl; -class DecayCounter; #include "CInode.h" #include "Capability.h" @@ -96,9 +96,9 @@ } private: - int state; - uint64_t state_seq; - int importing_count; + int state = STATE_CLOSED; + uint64_t state_seq = 0; + int importing_count = 0; friend class SessionMap; // Human (friendly) name is soft state generated from client metadata @@ -113,6 +113,16 @@ mutable DecayCounter load_avg; DecayRate load_avg_rate; + // Ephemeral state for tracking progress of capability recalls + // caps being recalled recently by this session; used for Beacon warnings + mutable DecayCounter recall_caps; + // caps that have been released + mutable DecayCounter release_caps; + // throttle on caps recalled + mutable DecayCounter recall_caps_throttle; + // New limit in SESSION_RECALL + uint32_t recall_limit = 0; + // session start time -- used to track average session time // note that this is initialized in the constructor rather // than at the time of adding a session to the sessionmap @@ -143,15 +153,14 @@ } } void decode(bufferlist::iterator &p); - void 
set_client_metadata(std::map const &meta); + template + void set_client_metadata(T&& meta) + { + info.client_metadata = std::forward(meta); + _update_human_name(); + } std::string get_human_name() const {return human_name;} - // Ephemeral state for tracking progress of capability recalls - time recalled_at = time::min(); // When was I asked to SESSION_RECALL? - time last_recall_sent = time::min(); - uint32_t recall_count; // How many caps was I asked to SESSION_RECALL? - uint32_t recall_release_count; // How many caps have I actually revoked? - session_info_t info; ///< durable bits MDSAuthCaps auth_caps; @@ -167,8 +176,16 @@ interval_set pending_prealloc_inos; // journaling prealloc, will be added to prealloc_inos void notify_cap_release(size_t n_caps); - void notify_recall_sent(const size_t new_limit); - void clear_recalled_at(); + uint64_t notify_recall_sent(size_t new_limit); + double get_recall_caps_throttle() const { + return recall_caps_throttle.get(ceph_clock_now()); + } + double get_recall_caps() const { + return recall_caps.get(ceph_clock_now()); + } + double get_release_caps() const { + return release_caps.get(ceph_clock_now()); + } inodeno_t next_ino() const { if (info.prealloc_inos.empty()) @@ -239,7 +256,8 @@ // -- caps -- private: - version_t cap_push_seq; // cap push seq # + uint32_t cap_gen = 0; + version_t cap_push_seq = 0; // cap push seq # map > waitfor_flush; // flush session messages public: @@ -248,7 +266,9 @@ time last_cap_renew = time::min(); time last_seen = time::min(); -public: + void inc_cap_gen() { ++cap_gen; } + uint32_t get_cap_gen() const { return cap_gen; } + version_t inc_push_seq() { return ++cap_push_seq; } version_t get_push_seq() const { return cap_push_seq; } @@ -265,7 +285,10 @@ } } - void add_cap(Capability *cap) { + void touch_cap(Capability *cap) { + caps.push_front(&cap->item_session_caps); + } + void touch_cap_bottom(Capability *cap) { caps.push_back(&cap->item_session_caps); } void touch_lease(ClientLease *r) { @@ 
-273,16 +296,16 @@ } // -- leases -- - uint32_t lease_seq; + uint32_t lease_seq = 0; // -- completed requests -- private: // Has completed_requests been modified since the last time we // wrote this session out? - bool completed_requests_dirty; + bool completed_requests_dirty = false; - unsigned num_trim_flushes_warnings; - unsigned num_trim_requests_warnings; + unsigned num_trim_flushes_warnings = 0; + unsigned num_trim_requests_warnings = 0; public: void add_completed_request(ceph_tid_t t, inodeno_t created) { info.completed_requests[t] = created; @@ -357,18 +380,18 @@ int check_access(CInode *in, unsigned mask, int caller_uid, int caller_gid, const vector *gid_list, int new_uid, int new_gid); - - Session() : - state(STATE_CLOSED), state_seq(0), importing_count(0), - birth_time(clock::now()), recall_count(0), - recall_release_count(0), auth_caps(g_ceph_context), - connection(NULL), item_session_list(this), - requests(0), // member_offset passed to front() manually - cap_push_seq(0), - lease_seq(0), - completed_requests_dirty(false), - num_trim_flushes_warnings(0), - num_trim_requests_warnings(0) { } + Session() = delete; + Session(ConnectionRef con) : + recall_caps(ceph_clock_now(), g_conf->get_val("mds_recall_warning_decay_rate")), + release_caps(ceph_clock_now(), g_conf->get_val("mds_recall_warning_decay_rate")), + recall_caps_throttle(ceph_clock_now(), g_conf->get_val("mds_recall_max_decay_rate")), + birth_time(clock::now()), + auth_caps(g_ceph_context), + item_session_list(this), + requests(0) // member_offset passed to front() manually + { + connection = std::move(con); + } ~Session() override { if (state == STATE_CLOSED) { item_session_list.remove_myself(); @@ -464,7 +487,7 @@ if (session_map_entry != session_map.end()) { s = session_map_entry->second; } else { - s = session_map[i.name] = new Session; + s = session_map[i.name] = new Session(ConnectionRef()); s->info.inst = i; s->last_cap_renew = Session::clock::now(); if (logger) { @@ -496,17 +519,15 @@ 
MDSRank *mds; protected: - version_t projected, committing, committed; + version_t projected = 0, committing = 0, committed = 0; public: map* > by_state; uint64_t set_state(Session *session, int state); map > commit_waiters; void update_average_session_age(); - explicit SessionMap(MDSRank *m) : mds(m), - projected(0), committing(0), committed(0), - loaded_legacy(false) - { } + SessionMap() = delete; + explicit SessionMap(MDSRank *m) : mds(m) {} ~SessionMap() override { @@ -600,12 +621,20 @@ void dump(); - void get_client_session_set(set& s) const { - for (ceph::unordered_map::const_iterator p = session_map.begin(); - p != session_map.end(); - ++p) - if (p->second->info.inst.name.is_client()) - s.insert(p->second); + template + void get_client_sessions(F&& f) const { + for (const auto& p : session_map) { + auto& session = p.second; + if (session->info.inst.name.is_client()) + f(session); + } + } + template + void get_client_session_set(C& c) const { + auto f = [&c](Session* s) { + c.insert(s); + }; + get_client_sessions(f); } void replay_open_sessions(map& client_map) { @@ -667,7 +696,7 @@ protected: std::set dirty_sessions; std::set null_sessions; - bool loaded_legacy; + bool loaded_legacy = false; void _mark_dirty(Session *session); public: diff -Nru ceph-12.2.11/src/mds/SimpleLock.h ceph-12.2.12/src/mds/SimpleLock.h --- ceph-12.2.11/src/mds/SimpleLock.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mds/SimpleLock.h 2019-04-11 12:33:50.000000000 +0000 @@ -506,8 +506,9 @@ } void put_xlock() { assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE || - state == LOCK_XLOCKSNAP || is_locallock() || - state == LOCK_LOCK /* if we are a master of a slave */); + state == LOCK_XLOCKSNAP || state == LOCK_LOCK_XLOCK || + state == LOCK_LOCK || /* if we are a master of a slave */ + is_locallock()); --more()->num_xlock; parent->put(MDSCacheObject::PIN_LOCK); if (more()->num_xlock == 0) { diff -Nru ceph-12.2.11/src/messages/MMDSFragmentNotifyAck.h 
ceph-12.2.12/src/messages/MMDSFragmentNotifyAck.h --- ceph-12.2.11/src/messages/MMDSFragmentNotifyAck.h 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/src/messages/MMDSFragmentNotifyAck.h 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MMDSFRAGMENTNOTIFYAck_H +#define CEPH_MMDSFRAGMENTNOTIFYAck_H + +#include "msg/Message.h" + +class MMDSFragmentNotifyAck : public Message { +private: + dirfrag_t base_dirfrag; + int8_t bits = 0; + +public: + dirfrag_t get_base_dirfrag() const { return base_dirfrag; } + int get_bits() const { return bits; } + + bufferlist basebl; + + MMDSFragmentNotifyAck() : Message(MSG_MDS_FRAGMENTNOTIFYACK) {} + MMDSFragmentNotifyAck(dirfrag_t df, int b, uint64_t tid) : + Message(MSG_MDS_FRAGMENTNOTIFYACK), + base_dirfrag(df), bits(b) { + set_tid(tid); + } +private: + ~MMDSFragmentNotifyAck() override {} + +public: + const char *get_type_name() const override { return "fragment_notify_ack"; } + void print(ostream& o) const override { + o << "fragment_notify_ack(" << base_dirfrag << " " << (int)bits << ")"; + } + + void encode_payload(uint64_t features) override { + ::encode(base_dirfrag, payload); + ::encode(bits, payload); + } + void decode_payload() override { + auto p = payload.begin(); + ::decode(base_dirfrag, p); + ::decode(bits, p); + } +}; + +#endif diff -Nru ceph-12.2.11/src/messages/MMDSFragmentNotify.h ceph-12.2.12/src/messages/MMDSFragmentNotify.h --- ceph-12.2.11/src/messages/MMDSFragmentNotify.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/messages/MMDSFragmentNotify.h 2019-04-11 
12:33:50.000000000 +0000 @@ -18,43 +18,52 @@ #include "msg/Message.h" class MMDSFragmentNotify : public Message { - inodeno_t ino; - frag_t basefrag; - int8_t bits; + static constexpr int HEAD_VERSION = 2; + static constexpr int COMPAT_VERSION = 1; + + dirfrag_t base_dirfrag; + int8_t bits = 0; + bool ack_wanted = false; public: - inodeno_t get_ino() { return ino; } - frag_t get_basefrag() { return basefrag; } + inodeno_t get_ino() { return base_dirfrag.ino; } + frag_t get_basefrag() { return base_dirfrag.frag; } + dirfrag_t get_base_dirfrag() const { return base_dirfrag; } int get_bits() { return bits; } + bool is_ack_wanted() const { return ack_wanted; } + void mark_ack_wanted() { ack_wanted = true; } bufferlist basebl; - MMDSFragmentNotify() : Message(MSG_MDS_FRAGMENTNOTIFY) {} - MMDSFragmentNotify(dirfrag_t df, int b) : - Message(MSG_MDS_FRAGMENTNOTIFY), - ino(df.ino), basefrag(df.frag), bits(b) { } + MMDSFragmentNotify() : + Message(MSG_MDS_FRAGMENTNOTIFY, HEAD_VERSION, COMPAT_VERSION) {} + MMDSFragmentNotify(dirfrag_t df, int b, uint64_t tid) : + Message(MSG_MDS_FRAGMENTNOTIFY, HEAD_VERSION, COMPAT_VERSION), + base_dirfrag(df), bits(b) { + set_tid(tid); + } private: ~MMDSFragmentNotify() override {} public: const char *get_type_name() const override { return "fragment_notify"; } void print(ostream& o) const override { - o << "fragment_notify(" << ino << "." 
<< basefrag - << " " << (int)bits << ")"; + o << "fragment_notify(" << base_dirfrag << " " << (int)bits << ")"; } void encode_payload(uint64_t features) override { - ::encode(ino, payload); - ::encode(basefrag, payload); + ::encode(base_dirfrag, payload); ::encode(bits, payload); ::encode(basebl, payload); + ::encode(ack_wanted, payload); } void decode_payload() override { bufferlist::iterator p = payload.begin(); - ::decode(ino, p); - ::decode(basefrag, p); + ::decode(base_dirfrag, p); ::decode(bits, p); ::decode(basebl, p); + if (header.version >= 2) + ::decode(ack_wanted, p); } }; diff -Nru ceph-12.2.11/src/mgr/ActivePyModules.cc ceph-12.2.12/src/mgr/ActivePyModules.cc --- ceph-12.2.11/src/mgr/ActivePyModules.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mgr/ActivePyModules.cc 2019-04-11 12:33:50.000000000 +0000 @@ -289,9 +289,9 @@ } else if (what == "df") { PyFormatter f; - cluster_state.with_osdmap([this, &f](const OSDMap &osd_map){ - cluster_state.with_pgmap( - [&osd_map, &f](const PGMap &pg_map) { + cluster_state.with_pgmap([this, &f](const PGMap &pg_map) { + cluster_state.with_osdmap( + [&pg_map, &f](const OSDMap &osd_map) { pg_map.dump_fs_stats(nullptr, &f, true); pg_map.dump_pool_stats_full(osd_map, nullptr, &f, true); }); @@ -420,15 +420,13 @@ bool ActivePyModules::get_config(const std::string &module_name, const std::string &key, std::string *val) const { - PyThreadState *tstate = PyEval_SaveThread(); - Mutex::Locker l(lock); - PyEval_RestoreThread(tstate); - const std::string global_key = PyModuleRegistry::config_prefix + module_name + "/" + key; dout(4) << __func__ << "key: " << global_key << dendl; + Mutex::Locker l(lock); + if (config_cache.count(global_key)) { *val = config_cache.at(global_key); return true; diff -Nru ceph-12.2.11/src/mgr/BaseMgrModule.cc ceph-12.2.12/src/mgr/BaseMgrModule.cc --- ceph-12.2.11/src/mgr/BaseMgrModule.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mgr/BaseMgrModule.cc 2019-04-11 
12:33:50.000000000 +0000 @@ -137,6 +137,9 @@ auto c = new MonCommandCompletion(self->py_modules, completion, tag, PyThreadState_Get()); + + PyThreadState *tstate = PyEval_SaveThread(); + if (std::string(type) == "mon") { self->py_modules->get_monc().start_mon_command( {cmd_json}, @@ -151,6 +154,7 @@ delete c; string msg("invalid osd_id: "); msg.append("\"").append(name).append("\""); + PyEval_RestoreThread(tstate); PyErr_SetString(PyExc_ValueError, msg.c_str()); return nullptr; } @@ -175,6 +179,7 @@ if (r != 0) { string msg("failed to send command to mds: "); msg.append(cpp_strerror(r)); + PyEval_RestoreThread(tstate); PyErr_SetString(PyExc_RuntimeError, msg.c_str()); return nullptr; } @@ -184,6 +189,7 @@ delete c; string msg("invalid pgid: "); msg.append("\"").append(name).append("\""); + PyEval_RestoreThread(tstate); PyErr_SetString(PyExc_ValueError, msg.c_str()); return nullptr; } @@ -197,15 +203,18 @@ &c->outbl, &c->outs, c); + PyEval_RestoreThread(tstate); return nullptr; } else { delete c; string msg("unknown service type: "); msg.append(type); + PyEval_RestoreThread(tstate); PyErr_SetString(PyExc_ValueError, msg.c_str()); return nullptr; } + PyEval_RestoreThread(tstate); Py_RETURN_NONE; } @@ -356,9 +365,13 @@ return nullptr; } + PyThreadState *tstate = PyEval_SaveThread(); std::string value; bool found = self->py_modules->get_config(self->this_module->get_name(), what, &value); + + PyEval_RestoreThread(tstate); + if (found) { dout(10) << "ceph_config_get " << what << " found: " << value.c_str() << dendl; return PyString_FromString(value.c_str()); diff -Nru ceph-12.2.11/src/mgr/DaemonServer.cc ceph-12.2.12/src/mgr/DaemonServer.cc --- ceph-12.2.11/src/mgr/DaemonServer.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mgr/DaemonServer.cc 2019-04-11 12:33:50.000000000 +0000 @@ -1464,6 +1464,9 @@ jf.dump_object("health_checks", m->health_checks); jf.flush(*_dout); *_dout << dendl; + if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) { + 
clog->debug() << "pgmap v" << pg_map.version << ": " << pg_map; + } }); }); diff -Nru ceph-12.2.11/src/mon/AuthMonitor.cc ceph-12.2.12/src/mon/AuthMonitor.cc --- ceph-12.2.11/src/mon/AuthMonitor.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mon/AuthMonitor.cc 2019-04-11 12:33:50.000000000 +0000 @@ -32,6 +32,9 @@ #include "include/stringify.h" #include "include/assert.h" +#include "mds/MDSAuthCaps.h" +#include "osd/OSDCap.h" + #define dout_subsys ceph_subsys_mon #undef dout_prefix #define dout_prefix _prefix(_dout, mon, get_last_committed()) @@ -1040,6 +1043,37 @@ return 0; } +bool AuthMonitor::valid_caps(const vector& caps, ostream *out) +{ + for (vector::const_iterator p = caps.begin(); + p != caps.end(); p += 2) { + if ((p+1) == caps.end()) { + *out << "cap '" << *p << "' has no value"; + return false; + } + if (*p == "mon" || *p == "mgr") { + MonCap tmp; + if (!tmp.parse(*(p+1), out)) { + return false; + } + } else if (*p == "osd") { + OSDCap ocap; + if (!ocap.parse(*(p+1), out)) { + return false; + } + } else if (*p == "mds") { + MDSAuthCaps mdscap; + if (!mdscap.parse(g_ceph_context, *(p+1), out)) { + return false; + } + } else { + *out << "unknown cap type '" << *p << "'"; + return false; + } + } + return true; +} + bool AuthMonitor::prepare_command(MonOpRequestRef op) { MMonCommand *m = static_cast(op->get_req()); @@ -1141,6 +1175,11 @@ } } + if (!valid_caps(caps_vec, &ss)) { + err = -EINVAL; + goto done; + } + // are we about to have it? if (entity_is_pending(entity)) { wait_for_finished_proposal(op, @@ -1209,7 +1248,7 @@ get_last_committed() + 1)); return true; } else if ((prefix == "auth get-or-create-key" || - prefix == "auth get-or-create") && + prefix == "auth get-or-create") && !entity_name.empty()) { // auth get-or-create [mon osdcapa osd osdcapb ...] 
@@ -1316,6 +1355,11 @@ string mds_cap_string, osd_cap_string; string osd_cap_wanted = "r"; + if (!valid_caps(caps_vec, &ss)) { + err = -EINVAL; + goto done; + } + for (auto it = caps_vec.begin(); it != caps_vec.end() && (it + 1) != caps_vec.end(); it += 2) { diff -Nru ceph-12.2.11/src/mon/AuthMonitor.h ceph-12.2.12/src/mon/AuthMonitor.h --- ceph-12.2.11/src/mon/AuthMonitor.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mon/AuthMonitor.h 2019-04-11 12:33:50.000000000 +0000 @@ -19,6 +19,7 @@ #include using namespace std; +#include "global/global_init.h" #include "include/ceph_features.h" #include "include/types.h" #include "mon/PaxosService.h" @@ -128,19 +129,9 @@ pending_auth.push_back(inc); } - /* validate mon caps ; don't care about caps for other services as + /* validate mon/osd/mds caps ; don't care about caps for other services as * we don't know how to validate them */ - bool valid_caps(const vector& caps, ostream *out) { - for (vector::const_iterator p = caps.begin(); - p != caps.end(); p += 2) { - if (!p->empty() && *p != "mon") - continue; - MonCap tmp; - if (!tmp.parse(*(p+1), out)) - return false; - } - return true; - } + bool valid_caps(const vector& caps, ostream *out); void on_active() override; bool should_propose(double& delay) override; diff -Nru ceph-12.2.11/src/mon/CMakeLists.txt ceph-12.2.12/src/mon/CMakeLists.txt --- ceph-12.2.11/src/mon/CMakeLists.txt 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mon/CMakeLists.txt 2019-04-11 12:33:50.000000000 +0000 @@ -21,6 +21,7 @@ PGMonitor.cc PGMap.cc ConfigKeyService.cc + ../mds/MDSAuthCaps.cc ../mgr/mgr_commands.cc ../osd/OSDCap.cc) add_library(mon STATIC diff -Nru ceph-12.2.11/src/mon/FSCommands.cc ceph-12.2.12/src/mon/FSCommands.cc --- ceph-12.2.11/src/mon/FSCommands.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mon/FSCommands.cc 2019-04-11 12:33:50.000000000 +0000 @@ -620,6 +620,13 @@ map &cmdmap, std::stringstream &ss) override { + /* We may need to blacklist 
ranks. */ + if (!mon->osdmon()->is_writeable()) { + // not allowed to write yet, so retry when we can + mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op)); + return -EAGAIN; + } + // Check caller has correctly named the FS to delete // (redundant while there is only one FS, but command // syntax should apply to multi-FS future) @@ -663,6 +670,9 @@ // wait for an osdmap propose here: ignore return value. mon->mdsmon()->fail_mds_gid(fsmap, gid); } + if (!to_fail.empty()) { + mon->osdmon()->propose_pending(); /* maybe new blacklists */ + } fsmap.erase_filesystem(fs->fscid); diff -Nru ceph-12.2.11/src/mon/MDSMonitor.cc ceph-12.2.12/src/mon/MDSMonitor.cc --- ceph-12.2.11/src/mon/MDSMonitor.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mon/MDSMonitor.cc 2019-04-11 12:33:50.000000000 +0000 @@ -1255,6 +1255,8 @@ const MDSMap::mds_info_t &info = fsmap.get_info_gid(gid); dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl; + ceph_assert(mon->osdmon()->is_writeable()); + epoch_t blacklist_epoch = 0; if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) { utime_t until = ceph_clock_now(); @@ -2173,9 +2175,10 @@ do_propose = true; } } - } else { + } else if (!fs->mds_map.is_degraded()) { // There were no failures to replace, so try using any available standbys - // as standby-replay daemons. + // as standby-replay daemons. Don't do this when the cluster is degraded + // as a standby-replay daemon may try to read a journal being migrated. 
// Take a copy of the standby GIDs so that we can iterate over // them while perhaps-modifying standby_daemons during the loop diff -Nru ceph-12.2.11/src/mon/MgrStatMonitor.cc ceph-12.2.12/src/mon/MgrStatMonitor.cc --- ceph-12.2.11/src/mon/MgrStatMonitor.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mon/MgrStatMonitor.cc 2019-04-11 12:33:50.000000000 +0000 @@ -75,6 +75,7 @@ dout(10) << __func__ << dendl; version = 0; service_map.epoch = 1; + pending_service_map_bl.clear(); ::encode(service_map, pending_service_map_bl, CEPH_FEATURES_ALL); } @@ -95,7 +96,8 @@ << " service_map e" << service_map.epoch << dendl; } catch (buffer::error& e) { - derr << "failed to decode mgrstat state; luminous dev version?" << dendl; + derr << "failed to decode mgrstat state; luminous dev version? " + << e.what() << dendl; } } check_subs(); diff -Nru ceph-12.2.11/src/mon/PGMap.cc ceph-12.2.12/src/mon/PGMap.cc --- ceph-12.2.11/src/mon/PGMap.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/mon/PGMap.cc 2019-04-11 12:33:50.000000000 +0000 @@ -2673,7 +2673,6 @@ // Immediate reports { PG_STATE_INCONSISTENT, {DAMAGED, {}} }, { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} }, - { PG_STATE_REPAIR, {DAMAGED, {}} }, { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} }, { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} }, { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} }, @@ -3270,45 +3269,92 @@ // PG_NOT_SCRUBBED // PG_NOT_DEEP_SCRUBBED - { - if (cct->_conf->mon_warn_not_scrubbed || + if (cct->_conf->mon_warn_not_scrubbed || cct->_conf->mon_warn_not_deep_scrubbed) { - list detail, deep_detail; - const double age = cct->_conf->mon_warn_not_scrubbed + - cct->_conf->mon_scrub_interval; - utime_t cutoff = now; - cutoff -= age; - const double deep_age = cct->_conf->mon_warn_not_deep_scrubbed + - cct->_conf->osd_deep_scrub_interval; - utime_t deep_cutoff = now; - deep_cutoff -= deep_age; - for (auto& p : pg_stat) { - if (cct->_conf->mon_warn_not_scrubbed && - p.second.last_scrub_stamp < cutoff) { - 
ostringstream ss; - ss << "pg " << p.first << " not scrubbed since " - << p.second.last_scrub_stamp; - detail.push_back(ss.str()); + list detail, deep_detail; + int detail_max = max, deep_detail_max = max; + int detail_more = 0, deep_detail_more = 0; + int detail_total = 0, deep_detail_total = 0; + for (auto& p : pg_stat) { + int64_t pnum = p.first.pool(); + auto pool = osdmap.get_pg_pool(pnum); + if (!pool) + continue; + if (cct->_conf->mon_warn_not_scrubbed) { + double scrub_max_interval = 0; + pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval); + if (scrub_max_interval <= 0) { + scrub_max_interval = cct->_conf->osd_scrub_max_interval; } - if (cct->_conf->mon_warn_not_deep_scrubbed && - p.second.last_deep_scrub_stamp < deep_cutoff) { - ostringstream ss; - ss << "pg " << p.first << " not deep-scrubbed since " - << p.second.last_deep_scrub_stamp; - deep_detail.push_back(ss.str()); + const double age = cct->_conf->mon_warn_not_scrubbed + + scrub_max_interval; + utime_t cutoff = now; + cutoff -= age; + if (p.second.last_scrub_stamp < cutoff) { + if (detail_max > 0) { + ostringstream ss; + ss << "pg " << p.first << " not scrubbed since " + << p.second.last_scrub_stamp; + detail.push_back(ss.str()); + --detail_max; + } else { + ++detail_more; + } + ++detail_total; + } + } + if (cct->_conf->mon_warn_not_deep_scrubbed) { + double deep_scrub_interval = 0; + pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval); + if (deep_scrub_interval <= 0) { + deep_scrub_interval = cct->_conf->osd_deep_scrub_interval; + } + double deep_age = cct->_conf->mon_warn_not_deep_scrubbed + + deep_scrub_interval; + utime_t deep_cutoff = now; + deep_cutoff -= deep_age; + if (p.second.last_deep_scrub_stamp < deep_cutoff) { + if (deep_detail_max > 0) { + ostringstream ss; + ss << "pg " << p.first << " not deep-scrubbed since " + << p.second.last_deep_scrub_stamp; + deep_detail.push_back(ss.str()); + --deep_detail_max; + } else { + ++deep_detail_more; + } + 
++deep_detail_total; } } + } + if (detail_total) { + ostringstream ss; + ss << detail_total << " pgs not scrubbed in time"; + auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str()); + if (!detail.empty()) { - ostringstream ss; - ss << detail.size() << " pgs not scrubbed for " << age; - auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str()); d.detail.swap(detail); + + if (detail_more) { + ostringstream ss; + ss << detail_more << " more pgs... "; + d.detail.push_back(ss.str()); + } } + } + if (deep_detail_total) { + ostringstream ss; + ss << deep_detail_total << " pgs not deep-scrubbed in time"; + auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str()); + if (!deep_detail.empty()) { - ostringstream ss; - ss << deep_detail.size() << " pgs not deep-scrubbed for " << deep_age; - auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str()); d.detail.swap(deep_detail); + + if (deep_detail_more) { + ostringstream ss; + ss << deep_detail_more << " more pgs... 
"; + d.detail.push_back(ss.str()); + } } } } diff -Nru ceph-12.2.11/src/msg/async/AsyncConnection.cc ceph-12.2.12/src/msg/async/AsyncConnection.cc --- ceph-12.2.11/src/msg/async/AsyncConnection.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/msg/async/AsyncConnection.cc 2019-04-11 12:33:50.000000000 +0000 @@ -129,7 +129,7 @@ recv_start(0), recv_end(0), last_active(ceph::coarse_mono_clock::now()), inactive_timeout_us(cct->_conf->ms_tcp_read_timeout*1000*1000), - got_bad_auth(false), authorizer(NULL), replacing(false), + authorizer(NULL), replacing(false), is_reset_from_peer(false), once_ready(false), state_buffer(NULL), state_offset(0), worker(w), center(&w->center) { @@ -879,7 +879,6 @@ assert(!policy.server); // reset connect state variables - got_bad_auth = false; delete authorizer; authorizer = NULL; authorizer_buf.clear(); @@ -1275,10 +1274,13 @@ } addr_bl.append(state_buffer+strlen(CEPH_BANNER), sizeof(ceph_entity_addr)); - { + try { bufferlist::iterator ti = addr_bl.begin(); ::decode(peer_addr, ti); - } + } catch (const buffer::error& e) { + lderr(async_msgr->cct) << __func__ << " decode peer_addr failed " << dendl; + goto fail; + } ldout(async_msgr->cct, 10) << __func__ << " accept peer addr is " << peer_addr << dendl; if (peer_addr.is_blank_ip()) { @@ -1423,12 +1425,7 @@ if (reply.tag == CEPH_MSGR_TAG_BADAUTHORIZER) { ldout(async_msgr->cct,0) << __func__ << " connect got BADAUTHORIZER" << dendl; - if (got_bad_auth) - goto fail; - got_bad_auth = true; - delete authorizer; - authorizer = async_msgr->get_authorizer(peer_type, true); // try harder - state = STATE_CONNECTING_SEND_CONNECT_MSG; + goto fail; } if (reply.tag == CEPH_MSGR_TAG_RESETSESSION) { ldout(async_msgr->cct, 0) << __func__ << " connect got RESETSESSION" << dendl; @@ -1542,6 +1539,14 @@ need_challenge ? 
&authorizer_challenge : nullptr) || !authorizer_valid) { lock.lock(); + if (state != STATE_ACCEPTING_WAIT_CONNECT_MSG_AUTH) { + ldout(async_msgr->cct, 1) << __func__ + << " state changed while verify_authorizer," + << " it must be mark_down" + << dendl; + ceph_assert(state == STATE_CLOSED); + return -1; + } char tag; if (need_challenge && !had_challenge && authorizer_challenge) { ldout(async_msgr->cct,10) << __func__ << ": challenging authorizer" @@ -1875,6 +1880,7 @@ if (state != STATE_ACCEPTING_WAIT_CONNECT_MSG_AUTH) { ldout(async_msgr->cct, 1) << __func__ << " state changed while accept_conn, it must be mark_down" << dendl; assert(state == STATE_CLOSED || state == STATE_NONE); + async_msgr->unregister_conn(this); goto fail_registered; } @@ -2337,7 +2343,6 @@ state <= STATE_CONNECTING_READY) { delete authorizer; authorizer = NULL; - got_bad_auth = false; } if (state > STATE_OPEN_MESSAGE_THROTTLE_MESSAGE && diff -Nru ceph-12.2.11/src/msg/async/AsyncConnection.h ceph-12.2.12/src/msg/async/AsyncConnection.h --- ceph-12.2.11/src/msg/async/AsyncConnection.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/msg/async/AsyncConnection.h 2019-04-11 12:33:50.000000000 +0000 @@ -348,7 +348,6 @@ bufferlist front, middle, data; ceph_msg_connect connect_msg; // Connecting state - bool got_bad_auth; AuthAuthorizer *authorizer; bufferlist authorizer_buf; ceph_msg_connect_reply connect_reply; diff -Nru ceph-12.2.11/src/msg/async/EventEpoll.cc ceph-12.2.12/src/msg/async/EventEpoll.cc --- ceph-12.2.11/src/msg/async/EventEpoll.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/msg/async/EventEpoll.cc 2019-04-11 12:33:50.000000000 +0000 @@ -83,16 +83,18 @@ { ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask << " delmask=" << delmask << " to " << epfd << dendl; - struct epoll_event ee; + struct epoll_event ee = {0}; int mask = cur_mask & (~delmask); int r = 0; - ee.events = 0; - if (mask & EVENT_READABLE) ee.events |= EPOLLIN; - if (mask 
& EVENT_WRITABLE) ee.events |= EPOLLOUT; - ee.data.u64 = 0; /* avoid valgrind warning */ - ee.data.fd = fd; if (mask != EVENT_NONE) { + ee.events = EPOLLET; + ee.data.fd = fd; + if (mask & EVENT_READABLE) + ee.events |= EPOLLIN; + if (mask & EVENT_WRITABLE) + ee.events |= EPOLLOUT; + if ((r = epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ee)) < 0) { lderr(cct) << __func__ << " epoll_ctl: modify fd=" << fd << " mask=" << mask << " failed." << cpp_strerror(errno) << dendl; diff -Nru ceph-12.2.11/src/msg/Message.cc ceph-12.2.12/src/msg/Message.cc --- ceph-12.2.11/src/msg/Message.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/msg/Message.cc 2019-04-11 12:33:50.000000000 +0000 @@ -138,6 +138,7 @@ #include "messages/MDiscoverReply.h" #include "messages/MMDSFragmentNotify.h" +#include "messages/MMDSFragmentNotifyAck.h" #include "messages/MExportDirDiscover.h" #include "messages/MExportDirDiscoverAck.h" @@ -678,6 +679,10 @@ m = new MMDSFragmentNotify; break; + case MSG_MDS_FRAGMENTNOTIFYACK: + m = new MMDSFragmentNotifyAck; + break; + case MSG_MDS_EXPORTDIRDISCOVER: m = new MExportDirDiscover(); break; diff -Nru ceph-12.2.11/src/msg/Message.h ceph-12.2.12/src/msg/Message.h --- ceph-12.2.11/src/msg/Message.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/msg/Message.h 2019-04-11 12:33:50.000000000 +0000 @@ -147,6 +147,7 @@ #define MSG_MDS_OPENINO 0x20f #define MSG_MDS_OPENINOREPLY 0x210 +#define MSG_MDS_FRAGMENTNOTIFYACK 0x212 #define MSG_MDS_LOCK 0x300 #define MSG_MDS_INODEFILECAPS 0x301 diff -Nru ceph-12.2.11/src/msg/msg_types.h ceph-12.2.12/src/msg/msg_types.h --- ceph-12.2.11/src/msg/msg_types.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/msg/msg_types.h 2019-04-11 12:33:50.000000000 +0000 @@ -435,7 +435,14 @@ __u32 elen = get_sockaddr_len(); ::encode(elen, bl); if (elen) { +#if (__FreeBSD__) || defined(__APPLE__) + __le16 ss_family = u.sa.sa_family; + ::encode(ss_family, bl); + bl.append(u.sa.sa_data, + elen - sizeof(u.sa.sa_len) - 
sizeof(u.sa.sa_family)); +#else bl.append((char*)get_sockaddr(), elen); +#endif } ENCODE_FINISH(bl); } @@ -454,7 +461,30 @@ __u32 elen; ::decode(elen, bl); if (elen) { - bl.copy(elen, (char*)get_sockaddr()); +#if defined(__FreeBSD__) || defined(__APPLE__) + u.sa.sa_len = 0; + __le16 ss_family; + if (elen < sizeof(ss_family)) { + throw buffer::malformed_input("elen smaller than family len"); + } + ::decode(ss_family, bl); + u.sa.sa_family = ss_family; + elen -= sizeof(ss_family); + if (elen > get_sockaddr_len() - sizeof(u.sa.sa_family)) { + throw buffer::malformed_input("elen exceeds sockaddr len"); + } + bl.copy(elen, u.sa.sa_data); +#else + if (elen < sizeof(u.sa.sa_family)) { + throw buffer::malformed_input("elen smaller than family len"); + } + bl.copy(sizeof(u.sa.sa_family), (char*)&u.sa.sa_family); + if (elen > get_sockaddr_len()) { + throw buffer::malformed_input("elen exceeds sockaddr len"); + } + elen -= sizeof(u.sa.sa_family); + bl.copy(elen, u.sa.sa_data); +#endif } DECODE_FINISH(bl); } diff -Nru ceph-12.2.11/src/msg/simple/Pipe.cc ceph-12.2.12/src/msg/simple/Pipe.cc --- ceph-12.2.11/src/msg/simple/Pipe.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/msg/simple/Pipe.cc 2019-04-11 12:33:50.000000000 +0000 @@ -412,9 +412,13 @@ ldout(msgr->cct,10) << "accept couldn't read peer_addr" << dendl; goto fail_unlocked; } - { + try { bufferlist::iterator ti = addrbl.begin(); ::decode(peer_addr, ti); + } catch (const buffer::error& e) { + ldout(msgr->cct,2) << __func__ << " decode peer_addr failed: " << e.what() + << dendl; + goto fail_unlocked; } ldout(msgr->cct,10) << "accept peer addr is " << peer_addr << dendl; diff -Nru ceph-12.2.11/src/os/bluestore/Allocator.cc ceph-12.2.12/src/os/bluestore/Allocator.cc --- ceph-12.2.11/src/os/bluestore/Allocator.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/Allocator.cc 2019-04-11 12:33:50.000000000 +0000 @@ -3,7 +3,7 @@ #include "Allocator.h" #include "StupidAllocator.h" -#include 
"BitMapAllocator.h" +#include "BitmapAllocator.h" #include "common/debug.h" #define dout_subsys ceph_subsys_bluestore @@ -14,9 +14,18 @@ if (type == "stupid") { return new StupidAllocator(cct); } else if (type == "bitmap") { - return new BitMapAllocator(cct, size, block_size); + return new BitmapAllocator(cct, size, block_size); } lderr(cct) << "Allocator::" << __func__ << " unknown alloc type " << type << dendl; return nullptr; } + +void Allocator::release(const PExtentVector& release_vec) +{ + interval_set release_set; + for (auto e : release_vec) { + release_set.insert(e.offset, e.length); + } + release(release_set); +} diff -Nru ceph-12.2.11/src/os/bluestore/Allocator.h ceph-12.2.12/src/os/bluestore/Allocator.h --- ceph-12.2.11/src/os/bluestore/Allocator.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/Allocator.h 2019-04-11 12:33:50.000000000 +0000 @@ -22,9 +22,6 @@ public: virtual ~Allocator() {} - virtual int reserve(uint64_t need) = 0; - virtual void unreserve(uint64_t unused) = 0; - /* * Allocate required number of blocks in n number of extents. * Min and Max number of extents are limited by: @@ -36,15 +33,17 @@ */ virtual int64_t allocate(uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, int64_t hint, - AllocExtentVector *extents) = 0; + PExtentVector *extents) = 0; int64_t allocate(uint64_t want_size, uint64_t alloc_unit, - int64_t hint, AllocExtentVector *extents) { + int64_t hint, PExtentVector *extents) { return allocate(want_size, alloc_unit, want_size, hint, extents); } - virtual void release( - uint64_t offset, uint64_t length) = 0; + /* Bulk release. Implementations may override this method to handle the whole + * set at once. This could save e.g. unnecessary mutex dance. 
*/ + virtual void release(const interval_set& release_set) = 0; + void release(const PExtentVector& release_set); virtual void dump() = 0; @@ -52,6 +51,10 @@ virtual void init_rm_free(uint64_t offset, uint64_t length) = 0; virtual uint64_t get_free() = 0; + virtual double get_fragmentation(uint64_t alloc_unit) + { + return 0.0; + } virtual void shutdown() = 0; static Allocator *create(CephContext* cct, string type, int64_t size, diff -Nru ceph-12.2.11/src/os/bluestore/BitAllocator.cc ceph-12.2.12/src/os/bluestore/BitAllocator.cc --- ceph-12.2.11/src/os/bluestore/BitAllocator.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/BitAllocator.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,1420 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- - -// vim: ts=8 sw=2 smarttab -/* - * Bitmap based in-memory allocator. - * Author: Ramesh Chander, Ramesh.Chander@sandisk.com - * - * BitMap Tree Design: - * Storage is divided into bitmap of blocks. Each bitmap has size of - * unsigned long. Group of bitmap creates a Zone. Zone is a unit where - * at a time single thread can be active as well as single biggest - * contiguous allocation that can be requested. - * - * Rest of the nodes are classified into three categories: - * root node or Allocator - * internal nodes or BitMapAreaIN - * final nodes that contains Zones called BitMapAreaLeaf - * This classification is according to their own implmentation of some - * of the interfaces defined in BitMapArea. 
- */ - -#include "BitAllocator.h" -#include -#include "bluestore_types.h" -#include "common/debug.h" -#include - -#define dout_context cct -#define dout_subsys ceph_subsys_bluestore -#undef dout_prefix -#define dout_prefix *_dout << "bitalloc:" - -MEMPOOL_DEFINE_OBJECT_FACTORY(BitMapArea, BitMapArea, bluestore_alloc); -MEMPOOL_DEFINE_OBJECT_FACTORY(BitMapAreaIN, BitMapAreaIN, bluestore_alloc); -MEMPOOL_DEFINE_OBJECT_FACTORY(BitMapAreaLeaf, BitMapAreaLeaf, bluestore_alloc); -MEMPOOL_DEFINE_OBJECT_FACTORY(BitMapZone, BitMapZone, bluestore_alloc); -MEMPOOL_DEFINE_OBJECT_FACTORY(BmapEntry, BmapEntry, bluestore_alloc); -MEMPOOL_DEFINE_OBJECT_FACTORY(BitAllocator, BitAllocator, bluestore_alloc); - -int64_t BitMapAreaLeaf::count = 0; -int64_t BitMapZone::count = 0; -int64_t BitMapZone::total_blocks = 0; - - - -int64_t BmapEntityListIter::index() -{ - return m_cur_idx; -} - -BmapEntry::BmapEntry(CephContext*, const bool full) -{ - if (full) { - m_bits = BmapEntry::full_bmask(); - } else { - m_bits = BmapEntry::empty_bmask(); - } -} - -BmapEntry::~BmapEntry() -{ - -} - -bool BmapEntry::check_bit(int bit) -{ - return (atomic_fetch() & bit_mask(bit)); -} - -bool BmapEntry::is_allocated(int64_t offset, int64_t num_bits) -{ - bmap_t bmask = BmapEntry::align_mask(num_bits) >> offset; - return ((m_bits & bmask) == bmask); -} - -void BmapEntry::clear_bit(int bit) -{ - bmap_t bmask = bit_mask(bit); - m_bits &= ~(bmask); -} - -void BmapEntry::clear_bits(int offset, int num_bits) -{ - if (num_bits == 0) { - return; - } - - bmap_t bmask = BmapEntry::align_mask(num_bits) >> offset; - m_bits &= ~(bmask); -} - -void BmapEntry::set_bits(int offset, int num_bits) -{ - if (num_bits == 0) { - return; - } - - bmap_t bmask = BmapEntry::align_mask(num_bits) >> offset; - m_bits |= bmask; -} - -/* - * Allocate a bit if it was free. - * Retruns true if it was free. 
- */ -bool BmapEntry::check_n_set_bit(int bit) -{ - bmap_t bmask = bit_mask(bit); - bool res = !(m_bits & bmask); - m_bits |= bmask; - return res; -} - -/* - * Find N cont free bits in BitMap starting from an offset. - * - * Returns number of continuous bits found. - */ -int BmapEntry::find_n_cont_bits(int start_offset, int64_t num_bits) -{ - int count = 0; - int i = 0; - - if (num_bits == 0) { - return 0; - } - - if (start_offset >= BmapEntry::size()) { - return 0; - } - - for (i = start_offset; i < BmapEntry::size() && count < num_bits; i++) { - if (!check_n_set_bit(i)) { - break; - } - count++; - } - - return count; -} - -/* - * Find N free bits starting search from a given offset. - * - * Returns number of bits found, start bit and end of - * index next to bit where our search ended + 1. - */ -int BmapEntry::find_n_free_bits(int start_idx, int64_t max_bits, - int *free_bit, int *end_idx) -{ - int i = 0; - int count = 0; - - *free_bit = 0; - alloc_assert(max_bits > 0); - - /* - * Find free bit aligned to bit_align return the bit_num in free_bit. - */ - if (atomic_fetch() == BmapEntry::full_bmask()) { - /* - * All bits full, return fail. - */ - *end_idx = BmapEntry::size(); - return 0; - } - - /* - * Do a serial scan on bitmap. - */ - for (i = start_idx; i < BmapEntry::size(); i++) { - if (check_n_set_bit(i)) { - /* - * Found first free bit - */ - *free_bit = i; - count++; - break; - } - } - count += find_n_cont_bits(i + 1, max_bits - 1); - - (*end_idx) = i + count; - return count; -} - -/* - * Find first series of contiguous bits free in bitmap starting - * from start offset that either - * satisfy our need or are touching right edge of bitmap. - * - * Returns allocated bits, start bit of allocated, number of bits - * scanned from start offset. 
- */ -int -BmapEntry::find_first_set_bits(int64_t required_blocks, - int bit_offset, int *start_offset, - int64_t *scanned) -{ - int allocated = 0; - int conti = 0; - int end_idx = 0; - - *scanned = 0; - - while (bit_offset < BmapEntry::size()) { - conti = find_n_free_bits(bit_offset, required_blocks, - start_offset, &end_idx); - - *scanned += end_idx - bit_offset; - /* - * Either end of bitmap or got required. - */ - if (conti == required_blocks || - (conti + *start_offset == BmapEntry::size())) { - allocated += conti; - break; - } - - /* - * Did not get expected, search from next index again. - */ - clear_bits(*start_offset, conti); - allocated = 0; - - bit_offset = end_idx; - } - - return allocated; -} - -void BmapEntry::dump_state(CephContext* const cct, const int& count) -{ - dout(0) << count << ":: 0x" << std::hex << m_bits << std::dec << dendl; -} - -/* - * Zone related functions. - */ -void BitMapZone::init(CephContext* const cct, - const int64_t zone_num, - const int64_t total_blocks, - const bool def) -{ - m_area_index = zone_num; - BitMapZone::total_blocks = total_blocks; - alloc_assert(size() > 0); - - m_used_blocks = def? total_blocks: 0; - - int64_t num_bmaps = total_blocks / BmapEntry::size(); - alloc_assert(num_bmaps < std::numeric_limits::max()); - alloc_assert(total_blocks < std::numeric_limits::max()); - alloc_assert(!(total_blocks % BmapEntry::size())); - - m_bmap_vec.resize(num_bmaps, BmapEntry(cct, def)); - incr_count(); -} - -int64_t BitMapZone::sub_used_blocks(int64_t num_blocks) -{ - return std::atomic_fetch_sub(&m_used_blocks, (int32_t) num_blocks); -} - -int64_t BitMapZone::add_used_blocks(int64_t num_blocks) -{ - return std::atomic_fetch_add(&m_used_blocks, (int32_t)num_blocks) + num_blocks; -} - -/* Intensionally hinted because BitMapAreaLeaf::child_check_n_lock. 
*/ -inline int64_t BitMapZone::get_used_blocks() -{ - return std::atomic_load(&m_used_blocks); -} - -bool BitMapZone::reserve_blocks(int64_t num_blocks) -{ - ceph_abort(); - return false; -} - -void BitMapZone::unreserve(int64_t num_blocks, int64_t allocated) -{ - ceph_abort(); -} - -int64_t BitMapZone::get_reserved_blocks() -{ - ceph_abort(); - return 0; -} - -BitMapZone::BitMapZone(CephContext* cct, int64_t total_blocks, - int64_t zone_num) - : BitMapArea(cct) -{ - init(cct, zone_num, total_blocks, false); -} - -BitMapZone::BitMapZone(CephContext* cct, int64_t total_blocks, - int64_t zone_num, bool def) - : BitMapArea(cct) -{ - init(cct, zone_num, total_blocks, def); -} - -void BitMapZone::shutdown() -{ -} - -BitMapZone::~BitMapZone() -{ -} - -/* - * Check if some search took zone marker to end. - * - * The inline hint has been added intensionally because of importance of this - * method for BitMapAreaLeaf::child_check_n_lock, and thus for the overall - * allocator's performance. Examination of disassemblies coming from GCC 5.4.0 - * showed that the compiler really needs that hint. - */ -inline bool BitMapZone::is_exhausted() -{ - /* BitMapZone::get_used_blocks operates atomically. No need for lock. 
*/ - return BitMapZone::get_used_blocks() == BitMapZone::size(); -} - -bool BitMapZone::is_allocated(int64_t start_block, int64_t num_blocks) -{ - BmapEntry *bmap = NULL; - int bit = 0; - int64_t falling_in_bmap = 0; - - while (num_blocks) { - bit = start_block % BmapEntry::size(); - bmap = &m_bmap_vec[start_block / BmapEntry::size()]; - falling_in_bmap = MIN(num_blocks, BmapEntry::size() - bit); - - if (!bmap->is_allocated(bit, falling_in_bmap)) { - return false; - } - - start_block += falling_in_bmap; - num_blocks -= falling_in_bmap; - } - - return true; -} - -void BitMapZone::set_blocks_used(int64_t start_block, int64_t num_blocks) -{ - BmapEntry *bmap = NULL; - int bit = 0; - int64_t falling_in_bmap = 0; - int64_t blks = num_blocks; - - while (blks) { - bit = start_block % BmapEntry::size(); - bmap = &m_bmap_vec[start_block / BmapEntry::size()]; - falling_in_bmap = MIN(blks, BmapEntry::size() - bit); - - bmap->set_bits(bit, falling_in_bmap); - - start_block += falling_in_bmap; - blks -= falling_in_bmap; - } - add_used_blocks(num_blocks); -} - -void BitMapZone::free_blocks_int(int64_t start_block, int64_t num_blocks) -{ - BmapEntry *bmap = NULL; - int bit = 0; - int64_t falling_in_bmap = 0; - int64_t count = num_blocks; - int64_t first_blk = start_block; - - if (num_blocks == 0) { - return; - } - alloc_dbg_assert(is_allocated(start_block, num_blocks)); - - while (count) { - bit = first_blk % BmapEntry::size(); - bmap = &m_bmap_vec[first_blk / BmapEntry::size()]; - falling_in_bmap = MIN(count, BmapEntry::size() - bit); - - bmap->clear_bits(bit, falling_in_bmap); - - first_blk += falling_in_bmap; - count -= falling_in_bmap; - } - alloc_dbg_assert(!is_allocated(start_block, num_blocks)); -} - -void BitMapZone::lock_excl() -{ - m_lock.lock(); -} - -bool BitMapZone::lock_excl_try() -{ - return m_lock.try_lock(); -} - -void BitMapZone::unlock() -{ - m_lock.unlock(); -} - -bool BitMapZone::check_locked() -{ - return !lock_excl_try(); -} - -void 
BitMapZone::free_blocks(int64_t start_block, int64_t num_blocks) -{ - free_blocks_int(start_block, num_blocks); - sub_used_blocks(num_blocks); - alloc_assert(get_used_blocks() >= 0); -} - -int64_t BitMapZone::alloc_blocks_dis(int64_t num_blocks, - int64_t min_alloc, - int64_t hint, - int64_t zone_blk_off, - ExtentList *alloc_blocks) -{ - int64_t bmap_idx = hint / BmapEntry::size(); - int bit = hint % BmapEntry::size(); - BmapEntry *bmap = NULL; - int64_t allocated = 0; - int64_t blk_off = 0; - int64_t alloc_cont = 0; - int64_t last_cont = 0; - int64_t last_running_ext = 0; - int search_idx = bit; - int64_t scanned = 0; - int start_off = 0; - - - alloc_assert(check_locked()); - - BitMapEntityIter iter = BitMapEntityIter( - &m_bmap_vec, bmap_idx); - bmap = iter.next(); - if (!bmap) { - return 0; - } - - while (allocated < num_blocks) { - blk_off = zone_blk_off + bmap_idx * bmap->size(); - if (last_cont) { - /* - * We had bits free at end of last bitmap, try to complete required - * min alloc size using that. - */ - alloc_cont = bmap->find_n_cont_bits(0, min_alloc - last_cont); - allocated += alloc_cont; - last_cont += alloc_cont; - - if (!alloc_cont) { - if (last_cont) { - this->free_blocks_int(last_running_ext - zone_blk_off, last_cont); - } - allocated -= last_cont; - last_cont = 0; - } else if (last_cont / min_alloc) { - /* - * Got contiguous min_alloc_size across bitmaps. - */ - alloc_blocks->add_extents(last_running_ext, last_cont); - last_cont = 0; - last_running_ext = 0; - } - search_idx = alloc_cont; - } else { - /* - * Try to allocate min_alloc_size bits from given bmap. - */ - alloc_cont = bmap->find_first_set_bits(min_alloc, search_idx, &start_off, &scanned); - search_idx = search_idx + scanned; - allocated += alloc_cont; - if (alloc_cont / min_alloc) { - /* - * Got contiguous min_alloc_size within a bitmap. 
- */ - alloc_blocks->add_extents(blk_off + start_off, min_alloc); - } - - if (alloc_cont % min_alloc) { - /* - * Got some bits at end of bitmap, carry them to try match with - * start bits from next bitmap. - */ - if (!last_cont) { - last_running_ext = blk_off + start_off; - } - last_cont += alloc_cont % min_alloc; - } - } - - - if (search_idx == BmapEntry::size()) { - search_idx = 0; - bmap_idx = iter.index(); - if ((bmap = iter.next()) == NULL) { - if (last_cont) { - this->free_blocks_int(last_running_ext - zone_blk_off, last_cont); - } - allocated -= last_cont; - break; - } - } - } - - add_used_blocks(allocated); - return allocated; -} - - - -void BitMapZone::dump_state(CephContext* const cct, int& count) -{ - BmapEntry *bmap = NULL; - int bmap_idx = 0; - BitMapEntityIter iter = BitMapEntityIter( - &m_bmap_vec, 0); - dout(0) << __func__ << " zone " << count << " dump start " << dendl; - while ((bmap = static_cast(iter.next()))) { - bmap->dump_state(cct, bmap_idx); - bmap_idx++; - } - dout(0) << __func__ << " zone " << count << " dump end " << dendl; - count++; -} - - -/* - * BitMapArea Leaf and non-Leaf functions. 
- */ -int64_t BitMapArea::get_zone_size(CephContext* cct) -{ - return cct->_conf->bluestore_bitmapallocator_blocks_per_zone; -} - -int64_t BitMapArea::get_span_size(CephContext* cct) -{ - return cct->_conf->bluestore_bitmapallocator_span_size; -} - -int BitMapArea::get_level(CephContext* cct, int64_t total_blocks) -{ - int level = 1; - int64_t zone_size_block = get_zone_size(cct); - int64_t span_size = get_span_size(cct); - int64_t spans = zone_size_block * span_size; - while (spans < total_blocks) { - spans *= span_size; - level++; - } - return level; -} - -int64_t BitMapArea::get_level_factor(CephContext* cct, int level) -{ - alloc_assert(level > 0); - - int64_t zone_size = get_zone_size(cct); - if (level == 1) { - return zone_size; - } - - int64_t level_factor = zone_size; - int64_t span_size = get_span_size(cct); - while (--level) { - level_factor *= span_size; - } - - return level_factor; -} - -int64_t BitMapArea::get_index() -{ - return m_area_index; -} - -/* - * BitMapArea Leaf and Internal - */ -BitMapAreaIN::BitMapAreaIN(CephContext* cct) - : BitMapArea(cct) -{ - // nothing -} - -void BitMapAreaIN::init_common(CephContext* const cct, - const int64_t total_blocks, - const int64_t area_idx, - const bool def) -{ - m_area_index = area_idx; - m_total_blocks = total_blocks; - m_level = BitMapArea::get_level(cct, total_blocks); - m_reserved_blocks = 0; - - m_used_blocks = def? 
total_blocks: 0; -} - -void BitMapAreaIN::init(CephContext* const cct, - int64_t total_blocks, - const int64_t area_idx, - const bool def) -{ - int64_t num_child = 0; - alloc_assert(!(total_blocks % BmapEntry::size())); - - init_common(cct, total_blocks, area_idx, def); - int64_t level_factor = BitMapArea::get_level_factor(cct, m_level); - - num_child = (total_blocks + level_factor - 1) / level_factor; - alloc_assert(num_child < std::numeric_limits::max()); - - m_child_size_blocks = level_factor; - - std::vector children; - children.reserve(num_child); - int i = 0; - for (i = 0; i < num_child - 1; i++) { - if (m_level <= 2) { - children.push_back(new BitMapAreaLeaf(cct, m_child_size_blocks, i, def)); - } else { - children.push_back(new BitMapAreaIN(cct, m_child_size_blocks, i, def)); - } - total_blocks -= m_child_size_blocks; - } - - int last_level = BitMapArea::get_level(cct, total_blocks); - if (last_level == 1) { - children.push_back(new BitMapAreaLeaf(cct, total_blocks, i, def)); - } else { - children.push_back(new BitMapAreaIN(cct, total_blocks, i, def)); - } - m_child_list = BitMapAreaList(std::move(children)); -} - -BitMapAreaIN::BitMapAreaIN(CephContext* cct,int64_t total_blocks, - int64_t area_idx) - : BitMapArea(cct) -{ - init(cct, total_blocks, area_idx, false); -} - -BitMapAreaIN::BitMapAreaIN(CephContext* cct, int64_t total_blocks, - int64_t area_idx, bool def) - : BitMapArea(cct) -{ - init(cct, total_blocks, area_idx, def); -} - -BitMapAreaIN::~BitMapAreaIN() -{ -} - -void BitMapAreaIN::shutdown() -{ - lock_excl(); - m_total_blocks = -1; - m_area_index = -2; - unlock(); -} - -bool BitMapAreaIN::child_check_n_lock(BitMapArea *child, int64_t required) -{ - child->lock_shared(); - - if (child->is_exhausted()) { - child->unlock(); - return false; - } - - int64_t child_used_blocks = child->get_used_blocks(); - int64_t child_total_blocks = child->size(); - if ((child_total_blocks - child_used_blocks) < required) { - child->unlock(); - return false; - } - - 
return true; -} - -void BitMapAreaIN::child_unlock(BitMapArea *child) -{ - child->unlock(); -} - -bool BitMapAreaIN::is_exhausted() -{ - return get_used_blocks() == size(); -} - -int64_t BitMapAreaIN::add_used_blocks(int64_t blks) -{ - std::lock_guard l(m_blocks_lock); - m_used_blocks += blks; - return m_used_blocks; -} - -int64_t BitMapAreaIN::sub_used_blocks(int64_t num_blocks) -{ - std::lock_guard l(m_blocks_lock); - - int64_t used_blks = m_used_blocks; - m_used_blocks -= num_blocks; - alloc_assert(m_used_blocks >= 0); - return used_blks; -} - -int64_t BitMapAreaIN::get_used_blocks() -{ - std::lock_guard l(m_blocks_lock); - return m_used_blocks; -} - -int64_t BitMapAreaIN::get_used_blocks_adj() -{ - std::lock_guard l(m_blocks_lock); - return m_used_blocks - m_reserved_blocks; -} - -bool BitMapAreaIN::reserve_blocks(int64_t num) -{ - bool res = false; - std::lock_guard u_l(m_blocks_lock); - if (m_used_blocks + num <= size()) { - m_used_blocks += num; - m_reserved_blocks += num; - res = true; - } - alloc_assert(m_used_blocks <= size()); - return res; -} - -void BitMapAreaIN::unreserve(int64_t needed, int64_t allocated) -{ - std::lock_guard l(m_blocks_lock); - m_used_blocks -= (needed - allocated); - m_reserved_blocks -= needed; - alloc_assert(m_used_blocks >= 0); - alloc_assert(m_reserved_blocks >= 0); -} -int64_t BitMapAreaIN::get_reserved_blocks() -{ - std::lock_guard l(m_blocks_lock); - return m_reserved_blocks; -} - -bool BitMapAreaIN::is_allocated(int64_t start_block, int64_t num_blocks) -{ - BitMapArea *area = NULL; - int64_t area_block_offset = 0; - int64_t falling_in_area = 0; - - alloc_assert(start_block >= 0 && - (start_block + num_blocks <= size())); - - if (num_blocks == 0) { - return true; - } - - while (num_blocks) { - area = static_cast(m_child_list.get_nth_item( - start_block / m_child_size_blocks)); - - area_block_offset = start_block % m_child_size_blocks; - falling_in_area = MIN(m_child_size_blocks - area_block_offset, - num_blocks); - if 
(!area->is_allocated(area_block_offset, falling_in_area)) { - return false; - } - start_block += falling_in_area; - num_blocks -= falling_in_area; - } - return true; -} - -int64_t BitMapAreaIN::alloc_blocks_dis_int_work(bool wrap, int64_t num_blocks, int64_t min_alloc, - int64_t hint, int64_t area_blk_off, ExtentList *block_list) -{ - BitMapArea *child = NULL; - int64_t allocated = 0; - int64_t blk_off = 0; - - BmapEntityListIter iter = BmapEntityListIter( - &m_child_list, hint / m_child_size_blocks, wrap); - - while ((child = static_cast(iter.next()))) { - if (!child_check_n_lock(child, 1)) { - hint = 0; - continue; - } - - blk_off = child->get_index() * m_child_size_blocks + area_blk_off; - allocated += child->alloc_blocks_dis(num_blocks - allocated, min_alloc, - hint % m_child_size_blocks, blk_off, block_list); - hint = 0; - child_unlock(child); - if (allocated == num_blocks) { - break; - } - } - - return allocated; -} - -int64_t BitMapAreaIN::alloc_blocks_dis_int(int64_t num_blocks, int64_t min_alloc, - int64_t hint, int64_t area_blk_off, ExtentList *block_list) -{ - return alloc_blocks_dis_int_work(false, num_blocks, min_alloc, hint, - area_blk_off, block_list); -} - -int64_t BitMapAreaIN::alloc_blocks_dis(int64_t num_blocks, int64_t min_alloc, - int64_t hint, int64_t blk_off, ExtentList *block_list) -{ - int64_t allocated = 0; - - lock_shared(); - allocated += alloc_blocks_dis_int(num_blocks, min_alloc, hint, blk_off, block_list); - add_used_blocks(allocated); - - unlock(); - return allocated; -} - - -void BitMapAreaIN::set_blocks_used_int(int64_t start_block, int64_t num_blocks) -{ - BitMapArea *child = NULL; - int64_t child_block_offset = 0; - int64_t falling_in_child = 0; - int64_t blks = num_blocks; - int64_t start_blk = start_block; - - alloc_assert(start_block >= 0); - - while (blks) { - child = static_cast(m_child_list.get_nth_item( - start_blk / m_child_size_blocks)); - - child_block_offset = start_blk % child->size(); - falling_in_child = 
MIN(m_child_size_blocks - child_block_offset, - blks); - child->set_blocks_used(child_block_offset, falling_in_child); - start_blk += falling_in_child; - blks -= falling_in_child; - } - - add_used_blocks(num_blocks); - alloc_dbg_assert(is_allocated(start_block, num_blocks)); -} - -void BitMapAreaIN::set_blocks_used(int64_t start_block, int64_t num_blocks) -{ - if (num_blocks == 0) { - return; - } - - lock_shared(); - set_blocks_used_int(start_block, num_blocks); - unlock(); -} - -void BitMapAreaIN::free_blocks_int(int64_t start_block, int64_t num_blocks) -{ - BitMapArea *child = NULL; - int64_t child_block_offset = 0; - int64_t falling_in_child = 0; - - alloc_assert(start_block >= 0 && - (start_block + num_blocks) <= size()); - - if (num_blocks == 0) { - return; - } - - while (num_blocks) { - child = static_cast(m_child_list.get_nth_item( - start_block / m_child_size_blocks)); - - child_block_offset = start_block % m_child_size_blocks; - - falling_in_child = MIN(m_child_size_blocks - child_block_offset, - num_blocks); - child->free_blocks(child_block_offset, falling_in_child); - start_block += falling_in_child; - num_blocks -= falling_in_child; - } - -} -void BitMapAreaIN::free_blocks(int64_t start_block, int64_t num_blocks) -{ - if (num_blocks == 0) { - return; - } - lock_shared(); - alloc_dbg_assert(is_allocated(start_block, num_blocks)); - - free_blocks_int(start_block, num_blocks); - (void) sub_used_blocks(num_blocks); - - unlock(); -} - -void BitMapAreaIN::dump_state(CephContext* const cct, int& count) -{ - BitMapArea *child = NULL; - - BmapEntityListIter iter = BmapEntityListIter( - &m_child_list, 0, false); - - while ((child = static_cast(iter.next()))) { - child->dump_state(cct, count); - } -} - -/* - * BitMapArea Leaf - */ -BitMapAreaLeaf::BitMapAreaLeaf(CephContext* cct, int64_t total_blocks, - int64_t area_idx) - : BitMapAreaIN(cct) -{ - init(cct, total_blocks, area_idx, false); -} - -BitMapAreaLeaf::BitMapAreaLeaf(CephContext* cct, int64_t total_blocks, 
- int64_t area_idx, bool def) - : BitMapAreaIN(cct) -{ - init(cct, total_blocks, area_idx, def); -} - -void BitMapAreaLeaf::init(CephContext* const cct, - const int64_t total_blocks, - const int64_t area_idx, - const bool def) -{ - int64_t num_child = 0; - alloc_assert(!(total_blocks % BmapEntry::size())); - - init_common(cct, total_blocks, area_idx, def); - alloc_assert(m_level == 1); - int zone_size_block = get_zone_size(cct); - alloc_assert(zone_size_block > 0); - num_child = (total_blocks + zone_size_block - 1) / zone_size_block; - alloc_assert(num_child); - m_child_size_blocks = total_blocks / num_child; - - std::vector children; - children.reserve(num_child); - for (int i = 0; i < num_child; i++) { - children.emplace_back(new BitMapZone(cct, m_child_size_blocks, i, def)); - } - - m_child_list = BitMapAreaList(std::move(children)); - - BitMapAreaLeaf::incr_count(); -} - -BitMapAreaLeaf::~BitMapAreaLeaf() -{ - lock_excl(); - - for (int64_t i = 0; i < m_child_list.size(); i++) { - auto child = static_cast(m_child_list.get_nth_item(i)); - delete child; - } - - unlock(); -} - -/* Intensionally hinted because BitMapAreaLeaf::alloc_blocks_dis_int. */ -inline bool BitMapAreaLeaf::child_check_n_lock(BitMapZone* const child, - const int64_t required, - const bool lock) -{ - /* The exhausted check can be performed without acquiring the lock. This - * is because 1) BitMapZone::is_exhausted() actually operates atomically - * and 2) it's followed by the exclusive, required-aware re-verification. 
*/ - if (child->BitMapZone::is_exhausted()) { - return false; - } - - if (lock) { - child->lock_excl(); - } else if (!child->lock_excl_try()) { - return false; - } - - int64_t child_used_blocks = child->get_used_blocks(); - int64_t child_total_blocks = child->size(); - if ((child_total_blocks - child_used_blocks) < required) { - child->unlock(); - return false; - } - - return true; -} - -int64_t BitMapAreaLeaf::alloc_blocks_dis_int(int64_t num_blocks, int64_t min_alloc, - int64_t hint, int64_t area_blk_off, ExtentList *block_list) -{ - BitMapZone* child = nullptr; - int64_t allocated = 0; - int64_t blk_off = 0; - - BmapEntityListIter iter = BmapEntityListIter( - &m_child_list, hint / m_child_size_blocks, false); - - /* We're sure the only element type we aggregate is BitMapZone, - * so there is no business to go through vptr and thus prohibit - * compiler to inline the stuff. Consult BitMapAreaLeaf::init. */ - while ((child = static_cast(iter.next()))) { - if (!child_check_n_lock(child, 1, false)) { - hint = 0; - continue; - } - - blk_off = child->get_index() * m_child_size_blocks + area_blk_off; - allocated += child->alloc_blocks_dis(num_blocks - allocated, min_alloc, - hint % m_child_size_blocks, blk_off, block_list); - child->unlock(); - if (allocated == num_blocks) { - break; - } - hint = 0; - } - return allocated; -} - -void BitMapAreaLeaf::free_blocks_int(int64_t start_block, int64_t num_blocks) -{ - BitMapArea *child = NULL; - int64_t child_block_offset = 0; - int64_t falling_in_child = 0; - - alloc_assert(start_block >= 0 && - (start_block + num_blocks) <= size()); - - if (num_blocks == 0) { - return; - } - - while (num_blocks) { - child = static_cast(m_child_list.get_nth_item( - start_block / m_child_size_blocks)); - - child_block_offset = start_block % m_child_size_blocks; - - falling_in_child = MIN(m_child_size_blocks - child_block_offset, - num_blocks); - - child->lock_excl(); - child->free_blocks(child_block_offset, falling_in_child); - 
child->unlock(); - start_block += falling_in_child; - num_blocks -= falling_in_child; - } -} - -/* - * Main allocator functions. - */ -BitAllocator::BitAllocator(CephContext* cct, int64_t total_blocks, - int64_t zone_size_block, bmap_alloc_mode_t mode) - : BitMapAreaIN(cct), - cct(cct) -{ - init_check(total_blocks, zone_size_block, mode, false, false); -} - -BitAllocator::BitAllocator(CephContext* cct, int64_t total_blocks, - int64_t zone_size_block, bmap_alloc_mode_t mode, - bool def) - : BitMapAreaIN(cct), - cct(cct) -{ - init_check(total_blocks, zone_size_block, mode, def, false); -} - -BitAllocator::BitAllocator(CephContext* cct, int64_t total_blocks, - int64_t zone_size_block, bmap_alloc_mode_t mode, - bool def, bool stats_on) - : BitMapAreaIN(cct), - cct(cct) -{ - init_check(total_blocks, zone_size_block, mode, def, stats_on); -} - -void BitAllocator::init_check(int64_t total_blocks, int64_t zone_size_block, - bmap_alloc_mode_t mode, bool def, bool stats_on) -{ - int64_t unaligned_blocks = 0; - - if (mode != SERIAL && mode != CONCURRENT) { - ceph_abort(); - } - - if (total_blocks <= 0) { - ceph_abort(); - } - - if (zone_size_block == 0 || - zone_size_block < BmapEntry::size()) { - ceph_abort(); - } - - zone_size_block = (zone_size_block / BmapEntry::size()) * - BmapEntry::size(); - - unaligned_blocks = total_blocks % zone_size_block; - m_extra_blocks = unaligned_blocks? zone_size_block - unaligned_blocks: 0; - total_blocks = ROUND_UP_TO(total_blocks, zone_size_block); - - m_alloc_mode = mode; - m_is_stats_on = stats_on; - if (m_is_stats_on) { - m_stats = new BitAllocatorStats(); - } - - pthread_rwlock_init(&m_rw_lock, NULL); - init(cct, total_blocks, 0, def); - if (!def && unaligned_blocks) { - /* - * Mark extra padded blocks used from beginning. 
- */ - set_blocks_used(total_blocks - m_extra_blocks, m_extra_blocks); - } -} - -void BitAllocator::lock_excl() -{ - pthread_rwlock_wrlock(&m_rw_lock); -} - -void BitAllocator::lock_shared() -{ - pthread_rwlock_rdlock(&m_rw_lock); -} - -bool BitAllocator::try_lock() -{ - bool get_lock = false; - if (pthread_rwlock_trywrlock(&m_rw_lock) == 0) { - get_lock = true; - } - - return get_lock; -} - -void BitAllocator::unlock() -{ - pthread_rwlock_unlock(&m_rw_lock); -} - -BitAllocator::~BitAllocator() -{ - lock_excl(); - - for (int64_t i = 0; i < m_child_list.size(); i++) { - auto child = static_cast(m_child_list.get_nth_item(i)); - delete child; - } - - unlock(); - pthread_rwlock_destroy(&m_rw_lock); -} - -void -BitAllocator::shutdown() -{ - bool get_lock = try_lock(); - assert(get_lock); - bool get_serial_lock = try_serial_lock(); - assert(get_serial_lock); - serial_unlock(); - unlock(); -} - -void BitAllocator::unreserve_blocks(int64_t unused) -{ - unreserve(unused, 0); -} - -void BitAllocator::serial_lock() -{ - if (m_alloc_mode == SERIAL) { - m_serial_mutex.lock(); - } -} - -void BitAllocator::serial_unlock() -{ - if (m_alloc_mode == SERIAL) { - m_serial_mutex.unlock(); - } -} - -bool BitAllocator::try_serial_lock() -{ - bool get_lock = false; - if (m_alloc_mode == SERIAL) { - if (m_serial_mutex.try_lock() == 0) { - get_lock = true; - } - } else { - get_lock = true; - } - return get_lock; -} - -bool BitAllocator::child_check_n_lock(BitMapArea *child, int64_t required) -{ - child->lock_shared(); - - if (child->is_exhausted()) { - child->unlock(); - return false; - } - - int64_t child_used_blocks = child->get_used_blocks(); - int64_t child_total_blocks = child->size(); - if ((child_total_blocks - child_used_blocks) < required) { - child->unlock(); - return false; - } - - return true; -} - -void BitAllocator::child_unlock(BitMapArea *child) -{ - child->unlock(); -} - -bool BitAllocator::check_input_dis(int64_t num_blocks) -{ - if (num_blocks == 0 || num_blocks > size()) 
{ - return false; - } - return true; -} - -bool BitAllocator::check_input(int64_t num_blocks) -{ - if (num_blocks == 0 || num_blocks > get_zone_size(cct)) { - return false; - } - return true; -} - -void BitAllocator::free_blocks(int64_t start_block, int64_t num_blocks) -{ - if (num_blocks == 0) { - return; - } - - alloc_assert(start_block + num_blocks <= size()); - if (is_stats_on()) { - m_stats->add_free_calls(1); - m_stats->add_freed(num_blocks); - } - - lock_shared(); - alloc_dbg_assert(is_allocated(start_block, num_blocks)); - - free_blocks_int(start_block, num_blocks); - (void) sub_used_blocks(num_blocks); - - unlock(); -} - - -void BitAllocator::set_blocks_used(int64_t start_block, int64_t num_blocks) -{ - if (num_blocks == 0) { - return; - } - - alloc_assert(start_block + num_blocks <= size()); - lock_shared(); - serial_lock(); - set_blocks_used_int(start_block, num_blocks); - - serial_unlock(); - unlock(); -} - -/* - * Allocate N dis-contiguous blocks. - */ -int64_t BitAllocator::alloc_blocks_dis_int(int64_t num_blocks, int64_t min_alloc, - int64_t hint, int64_t area_blk_off, ExtentList *block_list) -{ - return alloc_blocks_dis_int_work(true, num_blocks, min_alloc, hint, - area_blk_off, block_list); -} - -int64_t BitAllocator::alloc_blocks_dis_res(int64_t num_blocks, int64_t min_alloc, - int64_t hint, ExtentList *block_list) -{ - return alloc_blocks_dis_work(num_blocks, min_alloc, hint, block_list, true); -} - -int64_t BitAllocator::alloc_blocks_dis_work(int64_t num_blocks, int64_t min_alloc, - int64_t hint, ExtentList *block_list, bool reserved) -{ - int scans = 1; - int64_t allocated = 0; - /* - * This is root so offset is 0 yet. 
- */ - int64_t blk_off = 0; - - if (!check_input_dis(num_blocks)) { - return 0; - } - - if (is_stats_on()) { - m_stats->add_alloc_calls(1); - m_stats->add_allocated(num_blocks); - } - - lock_shared(); - serial_lock(); - if (!reserved && !reserve_blocks(num_blocks)) { - goto exit; - } - - if (is_stats_on()) { - m_stats->add_concurrent_scans(scans); - } - - while (scans && allocated < num_blocks) { - allocated += alloc_blocks_dis_int(num_blocks - allocated, min_alloc, hint + allocated, blk_off, block_list); - scans--; - } - - if (allocated < num_blocks) { - /* - * Could not find anything in concurrent scan. - * Go in serial manner to get something for sure - * if available. - */ - serial_unlock(); - unlock(); - lock_excl(); - serial_lock(); - allocated += alloc_blocks_dis_int(num_blocks - allocated, min_alloc, hint + allocated, - blk_off, block_list); - if (is_stats_on()) { - m_stats->add_serial_scans(1); - } - } - - unreserve(num_blocks, allocated); - alloc_dbg_assert(is_allocated_dis(block_list, allocated)); - -exit: - serial_unlock(); - unlock(); - - return allocated; -} - -bool BitAllocator::is_allocated_dis(ExtentList *blocks, int64_t num_blocks) -{ - int64_t count = 0; - for (int64_t j = 0; j < blocks->get_extent_count(); j++) { - auto p = blocks->get_nth_extent(j); - count += p.second; - if (!is_allocated(p.first, p.second)) { - return false; - } - } - - alloc_assert(count == num_blocks); - return true; -} - -void BitAllocator::free_blocks_dis(int64_t num_blocks, ExtentList *block_list) -{ - int64_t freed = 0; - lock_shared(); - if (is_stats_on()) { - m_stats->add_free_calls(1); - m_stats->add_freed(num_blocks); - } - - for (int64_t i = 0; i < block_list->get_extent_count(); i++) { - free_blocks_int(block_list->get_nth_extent(i).first, - block_list->get_nth_extent(i).second); - freed += block_list->get_nth_extent(i).second; - } - - alloc_assert(num_blocks == freed); - sub_used_blocks(num_blocks); - alloc_assert(get_used_blocks() >= 0); - unlock(); -} - -void 
BitAllocator::dump() -{ - int count = 0; - serial_lock(); - dump_state(cct, count); - serial_unlock(); -} diff -Nru ceph-12.2.11/src/os/bluestore/BitAllocator.h ceph-12.2.12/src/os/bluestore/BitAllocator.h --- ceph-12.2.11/src/os/bluestore/BitAllocator.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/BitAllocator.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,569 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Bitmap based in memory allocator. - * Author: Ramesh Chander, Ramesh.Chander@sandisk.com - */ - -#ifndef CEPH_OS_BLUESTORE_BITALLOCATOR_H -#define CEPH_OS_BLUESTORE_BITALLOCATOR_H - - -#include -#include -#include -#include -#include -#include -#include "include/intarith.h" -#include "os/bluestore/bluestore_types.h" - -#define alloc_assert assert - -#ifdef BIT_ALLOCATOR_DEBUG -#define alloc_dbg_assert(x) assert(x) -#else -#define alloc_dbg_assert(x) (static_cast (0)) -#endif - - -class BitAllocatorStats { -public: - std::atomic m_total_alloc_calls; - std::atomic m_total_free_calls; - std::atomic m_total_allocated; - std::atomic m_total_freed; - std::atomic m_total_serial_scans; - std::atomic m_total_concurrent_scans; - std::atomic m_total_node_scanned; - - BitAllocatorStats() { - m_total_alloc_calls = 0; - m_total_free_calls = 0; - m_total_allocated = 0; - m_total_freed = 0; - m_total_serial_scans = 0; - m_total_concurrent_scans = 0; - m_total_node_scanned = 0; - } - - void add_alloc_calls(int64_t val) { - std::atomic_fetch_add(&m_total_alloc_calls, val); - } - void add_free_calls(int64_t val) { - std::atomic_fetch_add(&m_total_free_calls, val); - } - void add_allocated(int64_t val) { - std::atomic_fetch_add(&m_total_allocated, val); - } - void add_freed(int64_t val) { - std::atomic_fetch_add(&m_total_freed, val); - } - void add_serial_scans(int64_t val) { - std::atomic_fetch_add(&m_total_serial_scans, val); - } - void add_concurrent_scans(int64_t val) { - 
std::atomic_fetch_add(&m_total_concurrent_scans, val); - } - void add_node_scanned(int64_t val) { - std::atomic_fetch_add(&m_total_node_scanned, val); - } -}; - -template -class BitMapEntityIter { - typedef mempool::bluestore_alloc::vector BitMapEntityVector; - BitMapEntityVector *m_list; - int64_t m_start_idx; - int64_t m_cur_idx; - bool m_wrap; - bool m_wrapped; - bool m_end; -public: - - void init(BitMapEntityVector *list, bool wrap, int64_t start_idx) { - m_list = list; - m_wrap = wrap; - m_start_idx = start_idx; - m_cur_idx = m_start_idx; - m_wrapped = false; - m_end = false; - } - - BitMapEntityIter(BitMapEntityVector *list, int64_t start_idx) { - init(list, false, start_idx); - } - BitMapEntityIter(BitMapEntityVector *list, int64_t start_idx, bool wrap) { - init(list, wrap, start_idx); - } - - BitMapEntity *next() { - int64_t cur_idx = m_cur_idx; - - if (m_wrapped && - cur_idx == m_start_idx) { - /* - * End of wrap cycle + 1 - */ - if (!m_end) { - m_end = true; - return &(*m_list)[cur_idx]; - } - return NULL; - } - m_cur_idx++; - - if (m_cur_idx == (int64_t)m_list->size() && - m_wrap) { - m_cur_idx = 0; - m_wrapped = true; - } - - if (cur_idx == (int64_t)m_list->size()) { - /* - * End of list - */ - return NULL; - } - - alloc_assert(cur_idx < (int64_t)m_list->size()); - return &(*m_list)[cur_idx]; - } - - int64_t index() { - return m_cur_idx; - } -}; - -typedef unsigned long bmap_t; -typedef mempool::bluestore_alloc::vector bmap_mask_vec_t; - -class BmapEntry { -private: - bmap_t m_bits; - -public: - MEMPOOL_CLASS_HELPERS(); - static bmap_t full_bmask() { - return (bmap_t) -1; - } - static int64_t size() { - return (sizeof(bmap_t) * 8); - } - static bmap_t empty_bmask() { - return (bmap_t) 0; - } - static bmap_t align_mask(int x) { - return ((x) >= BmapEntry::size()? 
(bmap_t) -1 : (~(((bmap_t) -1) >> (x)))); - } - static bmap_t bit_mask(int bit_num) { - return (bmap_t) 0x1 << ((BmapEntry::size() - 1) - bit_num); - } - bmap_t atomic_fetch() { - return m_bits; - } - BmapEntry(CephContext*, bool val); - BmapEntry(CephContext*) { - m_bits = 0; - } - BmapEntry(const BmapEntry& bmap) { - m_bits = bmap.m_bits; - } - - void clear_bit(int bit); - void clear_bits(int offset, int num_bits); - void set_bits(int offset, int num_bits); - bool check_n_set_bit(int bit); - bool check_bit(int bit); - bool is_allocated(int64_t start_bit, int64_t num_bits); - - int find_n_cont_bits(int start_offset, int64_t num_bits); - int find_n_free_bits(int start_idx, int64_t max_bits, - int *free_bit, int *end_idx); - int find_first_set_bits(int64_t required_blocks, int bit_offset, - int *start_offset, int64_t *scanned); - - void dump_state(CephContext* cct, const int& count); - ~BmapEntry(); - -}; - -class BitMapArea { -protected: - int16_t m_area_index; - -public: - MEMPOOL_CLASS_HELPERS(); - static int64_t get_zone_size(CephContext* cct); - static int64_t get_span_size(CephContext* cct); - static int get_level(CephContext* cct, int64_t total_blocks); - static int64_t get_level_factor(CephContext* cct, int level); - virtual bool is_allocated(int64_t start_block, int64_t num_blocks) = 0; - virtual bool is_exhausted() = 0; - virtual bool child_check_n_lock(BitMapArea *child, int64_t required) { - ceph_abort(); - return true; - } - virtual bool child_check_n_lock(BitMapArea *child, int64_t required, bool lock) { - ceph_abort(); - return true; - } - virtual void child_unlock(BitMapArea *child) { - ceph_abort(); - } - - virtual void lock_excl() = 0; - virtual bool lock_excl_try() { - ceph_abort(); - return false; - } - virtual void lock_shared() { - ceph_abort(); - return; - } - virtual void unlock() = 0; - - virtual int64_t sub_used_blocks(int64_t num_blocks) = 0; - virtual int64_t add_used_blocks(int64_t num_blocks) = 0; - virtual bool reserve_blocks(int64_t 
num_blocks) = 0; - virtual void unreserve(int64_t num_blocks, int64_t allocated) = 0; - virtual int64_t get_reserved_blocks() = 0; - virtual int64_t get_used_blocks() = 0; - - virtual void shutdown() = 0; - - virtual int64_t alloc_blocks_dis(int64_t num_blocks, int64_t min_alloc, - int64_t hint, int64_t blk_off, ExtentList *block_list) { - ceph_abort(); - return 0; - } - - virtual void set_blocks_used(int64_t start_block, int64_t num_blocks) = 0; - virtual void free_blocks(int64_t start_block, int64_t num_blocks) = 0; - virtual int64_t size() = 0; - - int64_t child_count(); - int64_t get_index(); - int64_t get_level(); - virtual void dump_state(CephContext* cct, int& count) = 0; - BitMapArea(CephContext*) { } - virtual ~BitMapArea() { } -}; - -class BitMapAreaList { - -private: - std::vector m_items; - -public: - /* Must be DefaultConstructible as BitMapAreaIN and derivates employ - * a deferred init, sorry. */ - BitMapAreaList() = default; - - BitMapAreaList(std::vector&& m_items) - : m_items(std::move(m_items)) { - } - - BitMapArea *get_nth_item(const int64_t idx) { - return m_items[idx]; - } - - /* FIXME: we really should use size_t. */ - int64_t size() const { - return m_items.size(); - } -}; - -/* Intensionally inlined for the sake of BitMapAreaLeaf::alloc_blocks_dis_int. 
*/ -class BmapEntityListIter { - BitMapAreaList* m_list; - int64_t m_start_idx; - int64_t m_cur_idx; - bool m_wrap; - bool m_wrapped; - bool m_end; - -public: - BmapEntityListIter(BitMapAreaList* const list, - const int64_t start_idx, - const bool wrap = false) - : m_list(list), - m_start_idx(start_idx), - m_cur_idx(start_idx), - m_wrap(wrap), - m_wrapped(false), - m_end(false) { - } - - BitMapArea* next() { - int64_t cur_idx = m_cur_idx; - - if (m_wrapped && - cur_idx == m_start_idx) { - /* - * End of wrap cycle + 1 - */ - if (!m_end) { - m_end = true; - return m_list->get_nth_item(cur_idx); - } - return NULL; - } - m_cur_idx++; - - if (m_cur_idx == m_list->size() && - m_wrap) { - m_cur_idx = 0; - m_wrapped = true; - } - if (cur_idx == m_list->size()) { - /* - * End of list - */ - return NULL; - } - - /* This method should be *really* fast as it's being executed over - * and over during traversal of allocators indexes. */ - alloc_dbg_assert(cur_idx < m_list->size()); - return m_list->get_nth_item(cur_idx); - } - - int64_t index(); -}; - -typedef mempool::bluestore_alloc::vector BmapEntryVector; - -class BitMapZone: public BitMapArea { - -private: - std::atomic m_used_blocks; - BmapEntryVector m_bmap_vec; - std::mutex m_lock; - -public: - MEMPOOL_CLASS_HELPERS(); - static int64_t count; - static int64_t total_blocks; - static void incr_count() { count++;} - static int64_t get_total_blocks() {return total_blocks;} - bool is_allocated(int64_t start_block, int64_t num_blocks) override; - bool is_exhausted() override final; - void reset_marker(); - - int64_t sub_used_blocks(int64_t num_blocks) override; - int64_t add_used_blocks(int64_t num_blocks) override; - bool reserve_blocks(int64_t num_blocks) override; - void unreserve(int64_t num_blocks, int64_t allocated) override; - int64_t get_reserved_blocks() override; - int64_t get_used_blocks() override final; - int64_t size() override final { - return get_total_blocks(); - } - - void lock_excl() override; - bool 
lock_excl_try() override; - void unlock() override; - bool check_locked(); - - void free_blocks_int(int64_t start_block, int64_t num_blocks); - void init(CephContext* cct, int64_t zone_num, int64_t total_blocks, bool def); - - BitMapZone(CephContext* cct, int64_t total_blocks, int64_t zone_num); - BitMapZone(CephContext* cct, int64_t total_blocks, int64_t zone_num, bool def); - - ~BitMapZone() override; - void shutdown() override; - int64_t alloc_blocks_dis(int64_t num_blocks, int64_t min_alloc, int64_t hint, - int64_t blk_off, ExtentList *block_list) override; - void set_blocks_used(int64_t start_block, int64_t num_blocks) override; - - void free_blocks(int64_t start_block, int64_t num_blocks) override; - void dump_state(CephContext* cct, int& count) override; -}; - -class BitMapAreaIN: public BitMapArea{ - -protected: - int64_t m_child_size_blocks; - int64_t m_total_blocks; - int16_t m_level; - - int64_t m_used_blocks; - int64_t m_reserved_blocks; - std::mutex m_blocks_lock; - BitMapAreaList m_child_list; - - bool is_allocated(int64_t start_block, int64_t num_blocks) override; - bool is_exhausted() override; - - bool child_check_n_lock(BitMapArea *child, int64_t required, bool lock) override { - ceph_abort(); - return false; - } - - bool child_check_n_lock(BitMapArea *child, int64_t required) override; - void child_unlock(BitMapArea *child) override; - - void lock_excl() override { - return; - } - void lock_shared() override { - return; - } - void unlock() override { - return; - } - - void init(CephContext* cct, int64_t total_blocks, int64_t zone_size_block, bool def); - void init_common(CephContext* cct, - int64_t total_blocks, - int64_t zone_size_block, - bool def); - int64_t alloc_blocks_dis_int_work(bool wrap, int64_t num_blocks, int64_t min_alloc, int64_t hint, - int64_t blk_off, ExtentList *block_list); - - int64_t alloc_blocks_int_work(bool wait, bool wrap, - int64_t num_blocks, int64_t hint, int64_t *start_block); - -public: - MEMPOOL_CLASS_HELPERS(); - 
BitMapAreaIN(CephContext* cct); - BitMapAreaIN(CephContext* cct, int64_t zone_num, int64_t total_blocks); - BitMapAreaIN(CephContext* cct, int64_t zone_num, int64_t total_blocks, - bool def); - - ~BitMapAreaIN() override; - void shutdown() override; - int64_t sub_used_blocks(int64_t num_blocks) override; - int64_t add_used_blocks(int64_t num_blocks) override; - bool reserve_blocks(int64_t num_blocks) override; - void unreserve(int64_t num_blocks, int64_t allocated) override; - int64_t get_reserved_blocks() override; - int64_t get_used_blocks() override; - virtual int64_t get_used_blocks_adj(); - int64_t size() override { - return m_total_blocks; - } - using BitMapArea::alloc_blocks_dis; //non-wait version - - virtual int64_t alloc_blocks_dis_int(int64_t num_blocks, int64_t min_alloc, int64_t hint, - int64_t blk_off, ExtentList *block_list); - int64_t alloc_blocks_dis(int64_t num_blocks, int64_t min_alloc, int64_t hint, - int64_t blk_off, ExtentList *block_list) override; - virtual void set_blocks_used_int(int64_t start_block, int64_t num_blocks); - void set_blocks_used(int64_t start_block, int64_t num_blocks) override; - - virtual void free_blocks_int(int64_t start_block, int64_t num_blocks); - void free_blocks(int64_t start_block, int64_t num_blocks) override; - void dump_state(CephContext* cct, int& count) override; -}; - -class BitMapAreaLeaf: public BitMapAreaIN{ - -private: - void init(CephContext* cct, int64_t total_blocks, int64_t zone_size_block, - bool def); - -public: - MEMPOOL_CLASS_HELPERS(); - static int64_t count; - static void incr_count() { count++;} - BitMapAreaLeaf(CephContext* cct) : BitMapAreaIN(cct) { } - BitMapAreaLeaf(CephContext* cct, int64_t zone_num, int64_t total_blocks); - BitMapAreaLeaf(CephContext* cct, int64_t zone_num, int64_t total_blocks, - bool def); - - using BitMapAreaIN::child_check_n_lock; - bool child_check_n_lock(BitMapArea *child, int64_t required) override { - ceph_abort(); - return false; - } - - bool 
child_check_n_lock(BitMapZone* child, int64_t required, bool lock); - - int64_t alloc_blocks_int(int64_t num_blocks, int64_t hint, int64_t *start_block); - int64_t alloc_blocks_dis_int(int64_t num_blocks, int64_t min_alloc, int64_t hint, - int64_t blk_off, ExtentList *block_list) override; - void free_blocks_int(int64_t start_block, int64_t num_blocks) override; - - ~BitMapAreaLeaf() override; -}; - - -typedef enum bmap_alloc_mode { - SERIAL = 1, - CONCURRENT = 2, -} bmap_alloc_mode_t; - -class BitAllocator:public BitMapAreaIN{ -private: - CephContext* const cct; - bmap_alloc_mode_t m_alloc_mode; - std::mutex m_serial_mutex; - pthread_rwlock_t m_rw_lock; - BitAllocatorStats *m_stats; - bool m_is_stats_on; - int64_t m_extra_blocks; - - bool is_stats_on() { - return m_is_stats_on; - } - - using BitMapArea::child_check_n_lock; - bool child_check_n_lock(BitMapArea *child, int64_t required) override; - void child_unlock(BitMapArea *child) override; - - void serial_lock(); - bool try_serial_lock(); - void serial_unlock(); - void lock_excl() override; - void lock_shared() override; - bool try_lock(); - void unlock() override; - - bool check_input(int64_t num_blocks); - bool check_input_dis(int64_t num_blocks); - void init_check(int64_t total_blocks, int64_t zone_size_block, - bmap_alloc_mode_t mode, bool def, bool stats_on); - int64_t alloc_blocks_dis_work(int64_t num_blocks, int64_t min_alloc, int64_t hint, ExtentList *block_list, bool reserved); - - int64_t alloc_blocks_dis_int(int64_t num_blocks, int64_t min_alloc, - int64_t hint, int64_t area_blk_off, ExtentList *block_list) override; - -public: - MEMPOOL_CLASS_HELPERS(); - - BitAllocator(CephContext* cct, int64_t total_blocks, - int64_t zone_size_block, bmap_alloc_mode_t mode); - BitAllocator(CephContext* cct, int64_t total_blocks, int64_t zone_size_block, - bmap_alloc_mode_t mode, bool def); - BitAllocator(CephContext* cct, int64_t total_blocks, int64_t zone_size_block, - bmap_alloc_mode_t mode, bool def, bool 
stats_on); - ~BitAllocator() override; - void shutdown() override; - using BitMapAreaIN::alloc_blocks_dis; //Wait version - - void free_blocks(int64_t start_block, int64_t num_blocks) override; - void set_blocks_used(int64_t start_block, int64_t num_blocks) override; - void unreserve_blocks(int64_t blocks); - - int64_t alloc_blocks_dis_res(int64_t num_blocks, int64_t min_alloc, int64_t hint, ExtentList *block_list); - - void free_blocks_dis(int64_t num_blocks, ExtentList *block_list); - bool is_allocated_dis(ExtentList *blocks, int64_t num_blocks); - - int64_t total_blocks() const { - return m_total_blocks - m_extra_blocks; - } - int64_t get_used_blocks() override { - return (BitMapAreaIN::get_used_blocks_adj() - m_extra_blocks); - } - - BitAllocatorStats *get_stats() { - return m_stats; - } - void dump(); -}; - -#endif //End of file diff -Nru ceph-12.2.11/src/os/bluestore/BitmapAllocator.cc ceph-12.2.12/src/os/bluestore/BitmapAllocator.cc --- ceph-12.2.11/src/os/bluestore/BitmapAllocator.cc 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/BitmapAllocator.cc 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,101 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "BitmapAllocator.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bluestore +#undef dout_prefix +#define dout_prefix *_dout << "fbmap_alloc " << this << " " + +BitmapAllocator::BitmapAllocator(CephContext* _cct, + int64_t capacity, + int64_t alloc_unit) : + cct(_cct) +{ + ldout(cct, 10) << __func__ << " 0x" << std::hex << capacity << "/" + << alloc_unit << std::dec << dendl; + _init(capacity, alloc_unit, false); +} + +int64_t BitmapAllocator::allocate( + uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, + int64_t hint, PExtentVector *extents) +{ + uint64_t allocated = 0; + + ldout(cct, 10) << __func__ << std::hex << " 0x" << want_size + << "/" << alloc_unit << "," << max_alloc_size << "," << hint 
+ << std::dec << dendl; + + + _allocate_l2(want_size, alloc_unit, max_alloc_size, hint, + &allocated, extents); + if (!allocated) { + return -ENOSPC; + } + for (auto e : *extents) { + ldout(cct, 10) << __func__ + << " 0x" << std::hex << e.offset << "~" << e.length + << "/" << alloc_unit << "," << max_alloc_size << "," << hint + << std::dec << dendl; + } + return int64_t(allocated); +} + +void BitmapAllocator::release( + const interval_set& release_set) +{ + for (auto r : release_set) { + ldout(cct, 10) << __func__ << " 0x" << std::hex << r.first << "~" << r.second + << std::dec << dendl; + } + _free_l2(release_set); + ldout(cct, 10) << __func__ << " done" << dendl; +} + + +void BitmapAllocator::init_add_free(uint64_t offset, uint64_t length) +{ + ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + + auto mas = get_min_alloc_size(); + uint64_t offs = ROUND_UP_TO(offset, mas); + uint64_t l = P2ALIGN(offset + length - offs, mas); + + _mark_free(offs, l); + ldout(cct, 10) << __func__ << " done" << dendl; +} +void BitmapAllocator::init_rm_free(uint64_t offset, uint64_t length) +{ + ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + auto mas = get_min_alloc_size(); + uint64_t offs = ROUND_UP_TO(offset, mas); + uint64_t l = P2ALIGN(offset + length - offs, mas); + _mark_allocated(offs, l); + ldout(cct, 10) << __func__ << " done" << dendl; +} + +void BitmapAllocator::shutdown() +{ + ldout(cct, 1) << __func__ << dendl; + _shutdown(); +} + +void BitmapAllocator::dump() +{ + // bin -> interval count + std::map bins_overall; + collect_stats(bins_overall); + auto it = bins_overall.begin(); + while (it != bins_overall.end()) { + ldout(cct, 0) << __func__ + << " bin " << it->first + << "(< " << byte_u_t((1 << (it->first + 1)) * get_min_alloc_size()) << ")" + << " : " << it->second << " extents" + << dendl; + ++it; + } +} diff -Nru ceph-12.2.11/src/os/bluestore/BitMapAllocator.cc 
ceph-12.2.12/src/os/bluestore/BitMapAllocator.cc --- ceph-12.2.11/src/os/bluestore/BitMapAllocator.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/BitMapAllocator.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,220 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Bitmap based in-memory allocator. - * Author: Ramesh Chander, Ramesh.Chander@sandisk.com - * - */ - -#include "BitAllocator.h" - -#include "BitMapAllocator.h" -#include "bluestore_types.h" -#include "common/debug.h" - -#define dout_context cct -#define dout_subsys ceph_subsys_bluestore -#undef dout_prefix -#define dout_prefix *_dout << "bitmapalloc:" - - -BitMapAllocator::BitMapAllocator(CephContext* cct, int64_t device_size, - int64_t block_size) - : cct(cct) -{ - if (!ISP2(block_size)) { - derr << __func__ << " block_size " << block_size - << " not power of 2 aligned!" - << dendl; - assert(ISP2(block_size)); - return; - } - - int64_t zone_size_blks = cct->_conf->bluestore_bitmapallocator_blocks_per_zone; - if (!ISP2(zone_size_blks)) { - derr << __func__ << " zone_size " << zone_size_blks - << " not power of 2 aligned!" - << dendl; - assert(ISP2(zone_size_blks)); - return; - } - - int64_t span_size = cct->_conf->bluestore_bitmapallocator_span_size; - if (!ISP2(span_size)) { - derr << __func__ << " span_size " << span_size - << " not power of 2 aligned!" 
- << dendl; - assert(ISP2(span_size)); - return; - } - - m_block_size = block_size; - m_total_size = P2ALIGN(device_size, block_size); - m_bit_alloc = new BitAllocator(cct, device_size / block_size, - zone_size_blks, CONCURRENT, true); - if (!m_bit_alloc) { - derr << __func__ << " Unable to intialize Bit Allocator" << dendl; - assert(m_bit_alloc); - } - dout(10) << __func__ << " instance " << (uint64_t) this - << " size 0x" << std::hex << device_size << std::dec - << dendl; -} - -BitMapAllocator::~BitMapAllocator() -{ - delete m_bit_alloc; -} - -void BitMapAllocator::insert_free(uint64_t off, uint64_t len) -{ - dout(20) << __func__ << " instance " << (uint64_t) this - << " off 0x" << std::hex << off - << " len 0x" << len << std::dec - << dendl; - - assert(!(off % m_block_size)); - assert(!(len % m_block_size)); - - m_bit_alloc->free_blocks(off / m_block_size, - len / m_block_size); -} - -int BitMapAllocator::reserve(uint64_t need) -{ - int nblks = need / m_block_size; // apply floor - assert(!(need % m_block_size)); - dout(10) << __func__ << " instance " << (uint64_t) this - << " num_used " << m_bit_alloc->get_used_blocks() - << " total " << m_bit_alloc->total_blocks() - << dendl; - - if (!m_bit_alloc->reserve_blocks(nblks)) { - return -ENOSPC; - } - return 0; -} - -void BitMapAllocator::unreserve(uint64_t unused) -{ - int nblks = unused / m_block_size; - assert(!(unused % m_block_size)); - - dout(10) << __func__ << " instance " << (uint64_t) this - << " unused " << nblks - << " num used " << m_bit_alloc->get_used_blocks() - << " total " << m_bit_alloc->total_blocks() - << dendl; - - m_bit_alloc->unreserve_blocks(nblks); -} - -int64_t BitMapAllocator::allocate( - uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, - int64_t hint, mempool::bluestore_alloc::vector *extents) -{ - - assert(!(alloc_unit % m_block_size)); - assert(alloc_unit); - - assert(!max_alloc_size || max_alloc_size >= alloc_unit); - - dout(10) << __func__ <<" instance "<< (uint64_t) 
this - << " want_size " << want_size - << " alloc_unit " << alloc_unit - << " hint " << hint - << dendl; - hint = hint % m_total_size; // make hint error-tolerant - return allocate_dis(want_size, alloc_unit / m_block_size, - max_alloc_size, hint / m_block_size, extents); -} - -int64_t BitMapAllocator::allocate_dis( - uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, - int64_t hint, mempool::bluestore_alloc::vector *extents) -{ - ExtentList block_list = ExtentList(extents, m_block_size, max_alloc_size); - int64_t nblks = (want_size + m_block_size - 1) / m_block_size; - int64_t num = 0; - - num = m_bit_alloc->alloc_blocks_dis_res(nblks, alloc_unit, hint, &block_list); - if (num == 0) { - return -ENOSPC; - } - - return num * m_block_size; -} - -void BitMapAllocator::release( - uint64_t offset, uint64_t length) -{ - dout(10) << __func__ << " 0x" - << std::hex << offset << "~" << length << std::dec - << dendl; - insert_free(offset, length); -} - -uint64_t BitMapAllocator::get_free() -{ - assert(m_bit_alloc->total_blocks() >= m_bit_alloc->get_used_blocks()); - return (( - m_bit_alloc->total_blocks() - m_bit_alloc->get_used_blocks()) * - m_block_size); -} - -void BitMapAllocator::dump() -{ - dout(0) << __func__ << " instance " << this << dendl; - m_bit_alloc->dump(); -} - -void BitMapAllocator::init_add_free(uint64_t offset, uint64_t length) -{ - dout(10) << __func__ << " instance " << (uint64_t) this - << " offset 0x" << std::hex << offset - << " length 0x" << length << std::dec - << dendl; - uint64_t size = m_bit_alloc->size() * m_block_size; - - uint64_t offset_adj = ROUND_UP_TO(offset, m_block_size); - uint64_t length_adj = ((length - (offset_adj - offset)) / - m_block_size) * m_block_size; - - if ((offset_adj + length_adj) > size) { - assert(((offset_adj + length_adj) - m_block_size) < size); - length_adj = size - offset_adj; - } - - insert_free(offset_adj, length_adj); -} - -void BitMapAllocator::init_rm_free(uint64_t offset, uint64_t length) -{ - 
dout(10) << __func__ << " instance " << (uint64_t) this - << " offset 0x" << std::hex << offset - << " length 0x" << length << std::dec - << dendl; - - // we use the same adjustment/alignment that init_add_free does - // above so that we can yank back some of the space. - uint64_t offset_adj = ROUND_UP_TO(offset, m_block_size); - uint64_t length_adj = ((length - (offset_adj - offset)) / - m_block_size) * m_block_size; - - assert(!(offset_adj % m_block_size)); - assert(!(length_adj % m_block_size)); - - int64_t first_blk = offset_adj / m_block_size; - int64_t count = length_adj / m_block_size; - - if (count) - m_bit_alloc->set_blocks_used(first_blk, count); -} - - -void BitMapAllocator::shutdown() -{ - dout(10) << __func__ << " instance " << (uint64_t) this << dendl; - m_bit_alloc->shutdown(); -} - diff -Nru ceph-12.2.11/src/os/bluestore/BitmapAllocator.h ceph-12.2.12/src/os/bluestore/BitmapAllocator.h --- ceph-12.2.11/src/os/bluestore/BitmapAllocator.h 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/BitmapAllocator.h 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OS_BLUESTORE_BITMAPFASTALLOCATOR_H +#define CEPH_OS_BLUESTORE_BITMAPFASTALLOCATOR_H + +#include + +#include "Allocator.h" +#include "os/bluestore/bluestore_types.h" +#include "fastbmap_allocator_impl.h" +#include "include/mempool.h" +#include "common/debug.h" + +class BitmapAllocator : public Allocator, + public AllocatorLevel02 { + CephContext* cct; + +public: + BitmapAllocator(CephContext* _cct, int64_t capacity, int64_t alloc_unit); + ~BitmapAllocator() override + { + } + + + int64_t allocate( + uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, + int64_t hint, PExtentVector *extents) override; + + void release( + const interval_set& release_set) override; + + uint64_t get_free() override + { + return get_available(); + } + + void dump() override; 
+ double get_fragmentation(uint64_t) override + { + return _get_fragmentation(); + } + + void init_add_free(uint64_t offset, uint64_t length) override; + void init_rm_free(uint64_t offset, uint64_t length) override; + + void shutdown() override; +}; + +#endif diff -Nru ceph-12.2.11/src/os/bluestore/BitMapAllocator.h ceph-12.2.12/src/os/bluestore/BitMapAllocator.h --- ceph-12.2.11/src/os/bluestore/BitMapAllocator.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/BitMapAllocator.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#ifndef CEPH_OS_BLUESTORE_BITMAPALLOCATOR_H -#define CEPH_OS_BLUESTORE_BITMAPALLOCATOR_H - -#include - -#include "Allocator.h" -#include "BitAllocator.h" - -class BitMapAllocator : public Allocator { - CephContext* cct; - - int64_t m_block_size; - int64_t m_total_size; - - BitAllocator *m_bit_alloc; // Bit allocator instance - - void insert_free(uint64_t offset, uint64_t len); - - int64_t allocate_dis( - uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, - int64_t hint, mempool::bluestore_alloc::vector *extents); - -public: - BitMapAllocator(CephContext* cct, int64_t device_size, int64_t block_size); - ~BitMapAllocator() override; - - int reserve(uint64_t need) override; - void unreserve(uint64_t unused) override; - - int64_t allocate( - uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, - int64_t hint, mempool::bluestore_alloc::vector *extents) override; - - void release( - uint64_t offset, uint64_t length) override; - - uint64_t get_free() override; - - void dump() override; - - void init_add_free(uint64_t offset, uint64_t length) override; - void init_rm_free(uint64_t offset, uint64_t length) override; - - void shutdown() override; -}; - -#endif diff -Nru ceph-12.2.11/src/os/bluestore/BlockDevice.h ceph-12.2.12/src/os/bluestore/BlockDevice.h --- 
ceph-12.2.11/src/os/bluestore/BlockDevice.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/BlockDevice.h 2019-04-11 12:33:50.000000000 +0000 @@ -64,13 +64,13 @@ void aio_wait(); void try_aio_wake() { + std::lock_guard l(lock); if (num_running == 1) { // we might have some pending IOs submitted after the check // as there is no lock protection for aio_submit. // Hence we might have false conditional trigger. // aio_wait has to handle that hence do not care here. - std::lock_guard l(lock); cond.notify_all(); --num_running; assert(num_running >= 0); diff -Nru ceph-12.2.11/src/os/bluestore/BlueFS.cc ceph-12.2.12/src/os/bluestore/BlueFS.cc --- ceph-12.2.11/src/os/bluestore/BlueFS.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/BlueFS.cc 2019-04-11 12:33:50.000000000 +0000 @@ -186,23 +186,19 @@ } int BlueFS::reclaim_blocks(unsigned id, uint64_t want, - AllocExtentVector *extents) + PExtentVector *extents) { std::unique_lock l(lock); dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want << std::dec << dendl; assert(id < alloc.size()); assert(alloc[id]); - int r = alloc[id]->reserve(want); - assert(r == 0); // caller shouldn't ask for more than they can get + int64_t got = alloc[id]->allocate(want, cct->_conf->bluefs_alloc_size, 0, extents); - if (got < (int64_t)want) { - alloc[id]->unreserve(want - MAX(0, got)); - } - if (got <= 0) { + if (got < 0) { derr << __func__ << " failed to allocate space to return to bluestore" - << dendl; + << dendl; alloc[id]->dump(); return got; } @@ -214,7 +210,7 @@ } flush_bdev(); - r = _flush_and_sync_log(l); + int r = _flush_and_sync_log(l); assert(r == 0); if (logger) @@ -1406,6 +1402,9 @@ return 0; } + vector> to_release(pending_release.size()); + to_release.swap(pending_release); + uint64_t seq = log_t.seq = ++log_seq; assert(want_seq == 0 || want_seq <= seq); log_t.uuid = super.uuid; @@ -1498,6 +1497,14 @@ << " already >= out seq " << seq << ", we lost a race against 
another log flush, done" << dendl; } + + for (unsigned i = 0; i < to_release.size(); ++i) { + if (!to_release[i].empty()) { + /* OK, now we have the guarantee alloc[i] won't be null. */ + alloc[i]->release(to_release[i]); + } + } + _update_logger_stats(); return 0; @@ -1852,15 +1859,10 @@ uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size; uint64_t left = ROUND_UP_TO(len, min_alloc_size); - int r = -ENOSPC; int64_t alloc_len = 0; - AllocExtentVector extents; + PExtentVector extents; if (alloc[id]) { - r = alloc[id]->reserve(left); - } - - if (r == 0) { uint64_t hint = 0; if (!node->extents.empty() && node->extents.back().bdev == id) { hint = node->extents.back().end(); @@ -1868,12 +1870,9 @@ extents.reserve(4); // 4 should be (more than) enough for most allocations alloc_len = alloc[id]->allocate(left, min_alloc_size, hint, &extents); } - if (r < 0 || (alloc_len < (int64_t)left)) { - if (r == 0) { - alloc[id]->unreserve(left - alloc_len); - for (auto& p : extents) { - alloc[id]->release(p.offset, p.length); - } + if (alloc_len < (int64_t)left) { + if (alloc_len != 0) { + alloc[id]->release(extents); } if (id != BDEV_SLOW) { if (bdev[id]) { @@ -1933,15 +1932,9 @@ } dout(10) << __func__ << dendl; utime_t start = ceph_clock_now(); - vector> to_release(pending_release.size()); - to_release.swap(pending_release); flush_bdev(); // FIXME? 
_flush_and_sync_log(l); - for (unsigned i = 0; i < to_release.size(); ++i) { - for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) { - alloc[i]->release(p.get_start(), p.get_len()); - } - } + dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl; if (_should_compact_log()) { if (cct->_conf->bluefs_compact_log_sync) { diff -Nru ceph-12.2.11/src/os/bluestore/BlueFS.h ceph-12.2.12/src/os/bluestore/BlueFS.h --- ceph-12.2.11/src/os/bluestore/BlueFS.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/BlueFS.h 2019-04-11 12:33:50.000000000 +0000 @@ -403,7 +403,7 @@ /// reclaim block space int reclaim_blocks(unsigned bdev, uint64_t want, - AllocExtentVector *extents); + PExtentVector *extents); void flush(FileWriter *h) { std::lock_guard l(lock); diff -Nru ceph-12.2.11/src/os/bluestore/bluefs_types.h ceph-12.2.12/src/os/bluestore/bluefs_types.h --- ceph-12.2.11/src/os/bluestore/bluefs_types.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/bluefs_types.h 2019-04-11 12:33:50.000000000 +0000 @@ -8,13 +8,16 @@ #include "include/encoding.h" #include "include/denc.h" -class bluefs_extent_t : public AllocExtent{ +class bluefs_extent_t { public: uint8_t bdev; + uint64_t offset = 0; + uint32_t length = 0; bluefs_extent_t(uint8_t b = 0, uint64_t o = 0, uint32_t l = 0) - : AllocExtent(o, l), bdev(b) {} + : bdev(b), offset(o), length(l) {} + uint64_t end() const { return offset + length; } DENC(bluefs_extent_t, v, p) { DENC_START(1, 1, p); denc_lba(v.offset, p); diff -Nru ceph-12.2.11/src/os/bluestore/BlueStore.cc ceph-12.2.12/src/os/bluestore/BlueStore.cc --- ceph-12.2.11/src/os/bluestore/BlueStore.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/BlueStore.cc 2019-04-11 12:33:50.000000000 +0000 @@ -4192,6 +4192,8 @@ "Read EIO errors propagated to high level callers"); b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries", "Read operations that required at 
least one retry due to failed checksum validation"); + b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros", + "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000"); logger = b.create_perf_counters(); cct->get_perfcounters_collection()->add(logger); } @@ -5171,18 +5173,12 @@ dout(10) << __func__ << " gifting " << gift << " (" << byte_u_t(gift) << ")" << dendl; - // fixme: just do one allocation to start... - int r = alloc->reserve(gift); - assert(r == 0); - - AllocExtentVector exts; int64_t alloc_len = alloc->allocate(gift, cct->_conf->bluefs_alloc_size, - 0, 0, &exts); + 0, 0, extents); if (alloc_len <= 0) { dout(0) << __func__ << " no allocate on 0x" << std::hex << gift << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl; - alloc->unreserve(gift); _dump_alloc_on_rebalance_failure(); return 0; } else if (alloc_len < (int64_t)gift) { @@ -5190,13 +5186,10 @@ << " min_alloc_size 0x" << min_alloc_size << " allocated 0x" << alloc_len << std::dec << dendl; - alloc->unreserve(gift - alloc_len); _dump_alloc_on_rebalance_failure(); } - for (auto& p : exts) { - bluestore_pextent_t e = bluestore_pextent_t(p); + for (auto& e : *extents) { dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl; - extents->push_back(e); } gift = 0; @@ -5215,7 +5208,7 @@ while (reclaim > 0) { // NOTE: this will block and do IO. 
- AllocExtentVector extents; + PExtentVector extents; int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim, &extents); if (r < 0) { @@ -6184,12 +6177,23 @@ } if (deep) { bufferlist bl; - int r = _do_read(c.get(), o, 0, o->onode.size, bl, 0); - if (r < 0) { - ++errors; - derr << "fsck error: " << oid << " error during read: " - << cpp_strerror(r) << dendl; - } + uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap; + uint64_t offset = 0; + do { + uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block); + int r = _do_read(c.get(), o, offset, l, bl, + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + if (r < 0) { + ++errors; + derr << "fsck error: " << oid << std::hex + << " error during read: " + << " " << offset << "~" << l + << " " << cpp_strerror(r) << std::dec + << dendl; + break; + } + offset += l; + } while (offset < o->onode.size); } // omap if (o->onode.has_omap()) { @@ -8525,21 +8529,24 @@ dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl; osr->_unregister(); } + logger->set(l_bluestore_fragmentation, + (uint64_t)(alloc->get_fragmentation(min_alloc_size) * 1000)); } void BlueStore::_txc_release_alloc(TransContext *txc) { - // update allocator with full released set + interval_set bulk_release_extents; + // it's expected we're called with lazy_release_lock already taken! if (!cct->_conf->bluestore_debug_no_reuse_blocks) { dout(10) << __func__ << " " << txc << " " << std::hex << txc->released << std::dec << dendl; - for (interval_set::iterator p = txc->released.begin(); - p != txc->released.end(); - ++p) { - alloc->release(p.get_start(), p.get_len()); - } + // interval_set seems to be too costly for inserting things in + // bstore_kv_final. We could serialize in simpler format and perform + // the merge separately, maybe even in a dedicated thread. 
+ bulk_release_extents.insert(txc->released); } + alloc->release(bulk_release_extents); txc->allocated.clear(); txc->released.clear(); } @@ -8692,18 +8699,24 @@ dout(10) << __func__ << " start" << dendl; std::unique_lock l(kv_lock); assert(!kv_sync_started); + bool bluefs_do_check_balance = false; kv_sync_started = true; kv_cond.notify_all(); while (true) { assert(kv_committing.empty()); if (kv_queue.empty() && ((deferred_done_queue.empty() && deferred_stable_queue.empty()) || - !deferred_aggressive)) { + !deferred_aggressive) && + (bluefs_do_check_balance == false)) { if (kv_stop) break; dout(20) << __func__ << " sleep" << dendl; - kv_cond.wait(l); + std::cv_status status = kv_cond.wait_for(l, + std::chrono::milliseconds(int64_t(cct->_conf->bluestore_bluefs_balance_interval * 1000))); dout(20) << __func__ << " wake" << dendl; + if (status == std::cv_status::timeout) { + bluefs_do_check_balance = true; + } } else { deque kv_submitting; deque deferred_done, deferred_stable; @@ -8838,6 +8851,7 @@ synct->set(PREFIX_SUPER, "bluefs_extents", bl); } } + bluefs_do_check_balance = false; // cleanup sync deferred keys for (auto b : deferred_stable) { @@ -8890,14 +8904,9 @@ if (!bluefs_gift_extents.empty()) { _commit_bluefs_freespace(bluefs_gift_extents); } - for (auto p = bluefs_extents_reclaiming.begin(); - p != bluefs_extents_reclaiming.end(); - ++p) { - dout(20) << __func__ << " releasing old bluefs 0x" << std::hex - << p.get_start() << "~" << p.get_len() << std::dec - << dendl; - alloc->release(p.get_start(), p.get_len()); - } + dout(20) << __func__ << " releasing old bluefs 0x" << std::hex + << bluefs_extents_reclaiming << std::dec << dendl; + alloc->release(bluefs_extents_reclaiming); bluefs_extents_reclaiming.clear(); } @@ -10273,36 +10282,58 @@ // FIXME: memory alignment here is bad bufferlist t; int r = c->compress(wi.bl, t); - assert(r == 0); - - bluestore_compression_header_t chdr; - chdr.type = c->get_type(); - chdr.length = t.length(); - ::encode(chdr, 
wi.compressed_bl); - wi.compressed_bl.claim_append(t); - wi.compressed_len = wi.compressed_bl.length(); - uint64_t newlen = P2ROUNDUP(wi.compressed_len, min_alloc_size); uint64_t want_len_raw = wi.blob_length * crr; uint64_t want_len = P2ROUNDUP(want_len_raw, min_alloc_size); - if (newlen <= want_len && newlen < wi.blob_length) { - // Cool. We compressed at least as much as we were hoping to. - // pad out to min_alloc_size - wi.compressed_bl.append_zero(newlen - wi.compressed_len); - logger->inc(l_bluestore_write_pad_bytes, newlen - wi.compressed_len); - dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length - << " -> 0x" << wi.compressed_len << " => 0x" << newlen - << " with " << c->get_type() - << std::dec << dendl; - txc->statfs_delta.compressed() += wi.compressed_len; - txc->statfs_delta.compressed_original() += wi.blob_length; - txc->statfs_delta.compressed_allocated() += newlen; - logger->inc(l_bluestore_compress_success_count); - wi.compressed = true; - need += newlen; + bool rejected = false; + uint64_t compressed_len = t.length(); + // do an approximate (fast) estimation for resulting blob size + // that doesn't take header overhead into account + uint64_t result_len = P2ROUNDUP(compressed_len, min_alloc_size); + if (r == 0 && result_len <= want_len && result_len < wi.blob_length) { + bluestore_compression_header_t chdr; + chdr.type = c->get_type(); + chdr.length = t.length(); + encode(chdr, wi.compressed_bl); + wi.compressed_bl.claim_append(t); + + compressed_len = wi.compressed_bl.length(); + result_len = P2ROUNDUP(compressed_len, min_alloc_size); + if (result_len <= want_len && result_len < wi.blob_length) { + // Cool. We compressed at least as much as we were hoping to. 
+ // pad out to min_alloc_size + wi.compressed_bl.append_zero(result_len - compressed_len); + wi.compressed_len = compressed_len; + wi.compressed = true; + logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len); + dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length + << " -> 0x" << compressed_len << " => 0x" << result_len + << " with " << c->get_type() + << std::dec << dendl; + txc->statfs_delta.compressed() += compressed_len; + txc->statfs_delta.compressed_original() += wi.blob_length; + txc->statfs_delta.compressed_allocated() += result_len; + logger->inc(l_bluestore_compress_success_count); + need += result_len; + } else { + rejected = true; + } + } else if (r != 0) { + dout(5) << __func__ << std::hex << " 0x" << wi.blob_length + << " bytes compressed using " << c->get_type_name() + << std::dec + << " failed with errcode = " << r + << ", leaving uncompressed" + << dendl; + logger->inc(l_bluestore_compress_rejected_count); + need += wi.blob_length; } else { + rejected = true; + } + + if (rejected) { dout(20) << __func__ << std::hex << " 0x" << wi.blob_length - << " compressed to 0x" << wi.compressed_len << " -> 0x" << newlen + << " compressed to 0x" << compressed_len << " -> 0x" << result_len << " with " << c->get_type() << ", which is more than required 0x" << want_len_raw << " -> 0x" << want_len @@ -10317,19 +10348,19 @@ need += wi.blob_length; } } - int r = alloc->reserve(need); - if (r < 0) { - derr << __func__ << " failed to reserve 0x" << std::hex << need << std::dec - << dendl; - return r; - } - AllocExtentVector prealloc; + PExtentVector prealloc; prealloc.reserve(2 * wctx->writes.size());; int prealloc_left = 0; prealloc_left = alloc->allocate( need, min_alloc_size, need, 0, &prealloc); + if (prealloc_left < 0) { + derr << __func__ << " failed to allocate 0x" << std::hex << need << std::dec + << dendl; + return -ENOSPC; + } assert(prealloc_left == (int64_t)need); + dout(20) << __func__ << " prealloc " << prealloc << 
dendl; auto prealloc_pos = prealloc.begin(); @@ -10381,7 +10412,7 @@ } } - AllocExtentVector extents; + PExtentVector extents; int64_t left = final_length; while (left > 0) { assert(prealloc_left > 0); @@ -10963,7 +10994,7 @@ ); } txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size()); - txc->removed(o); + txc->note_removed_object(o); o->extent_map.clear(); o->onode = bluestore_onode_t(); _debug_obj_on_delete(o->oid); diff -Nru ceph-12.2.11/src/os/bluestore/BlueStore.h ceph-12.2.12/src/os/bluestore/BlueStore.h --- ceph-12.2.11/src/os/bluestore/BlueStore.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/BlueStore.h 2019-04-11 12:33:50.000000000 +0000 @@ -119,6 +119,7 @@ l_bluestore_gc_merged, l_bluestore_read_eio, l_bluestore_reads_with_retries, + l_bluestore_fragmentation, l_bluestore_last }; @@ -942,7 +943,7 @@ uint64_t min_alloc_size); /// return a collection of extents to perform GC on - const vector& get_extents_to_collect() const { + const vector& get_extents_to_collect() const { return extents_to_collect; } GarbageCollector(CephContext* _cct) : cct(_cct) {} @@ -972,8 +973,8 @@ ///< copies that are affected by the ///< specific write - vector extents_to_collect; ///< protrusive extents that should - ///< be collected if GC takes place + ///< protrusive extents that should be collected if GC takes place + vector extents_to_collect; boost::optional used_alloc_unit; ///< last processed allocation ///< unit when traversing @@ -1605,9 +1606,9 @@ // onode itself isn't written, though modified_objects.insert(o); } - void removed(OnodeRef& o) { + void note_removed_object(OnodeRef& o) { onodes.erase(o); - modified_objects.erase(o); + modified_objects.insert(o); } void aio_finish(BlueStore *store) override { diff -Nru ceph-12.2.11/src/os/bluestore/bluestore_types.cc ceph-12.2.12/src/os/bluestore/bluestore_types.cc --- ceph-12.2.11/src/os/bluestore/bluestore_types.cc 2019-01-30 15:51:26.000000000 +0000 +++ 
ceph-12.2.12/src/os/bluestore/bluestore_types.cc 2019-04-11 12:33:50.000000000 +0000 @@ -17,28 +17,6 @@ #include "common/Checksummer.h" #include "include/stringify.h" -void ExtentList::add_extents(int64_t start, int64_t count) { - AllocExtent *last_extent = NULL; - bool can_merge = false; - - if (!m_extents->empty()) { - last_extent = &(m_extents->back()); - uint64_t last_offset = last_extent->end() / m_block_size; - uint32_t last_length = last_extent->length / m_block_size; - if ((last_offset == (uint64_t) start) && - (!m_max_blocks || (last_length + count) <= m_max_blocks)) { - can_merge = true; - } - } - - if (can_merge) { - last_extent->length += (count * m_block_size); - } else { - m_extents->emplace_back(AllocExtent(start * m_block_size, - count * m_block_size)); - } -} - // bluestore_bdev_label_t void bluestore_bdev_label_t::encode(bufferlist& bl) const @@ -767,7 +745,7 @@ return 0; } -void bluestore_blob_t::allocated(uint32_t b_off, uint32_t length, const AllocExtentVector& allocs) +void bluestore_blob_t::allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs) { if (extents.size() == 0) { // if blob is compressed then logical length to be already configured @@ -779,6 +757,7 @@ if (b_off) { extents.emplace_back( bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, b_off)); + } uint32_t new_len = b_off; for (auto& a : allocs) { @@ -859,7 +838,8 @@ void flush() { if (invalid) { v.emplace_back(bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, - invalid)); + invalid)); + invalid = 0; } } @@ -869,7 +849,7 @@ } else { flush(); - v.emplace_back(bluestore_pextent_t(offset, length)); + v.emplace_back(offset, length); } } }; diff -Nru ceph-12.2.11/src/os/bluestore/bluestore_types.h ceph-12.2.12/src/os/bluestore/bluestore_types.h --- ceph-12.2.11/src/os/bluestore/bluestore_types.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/bluestore_types.h 2019-04-11 12:33:50.000000000 +0000 @@ -66,93 +66,37 @@ ostream& 
operator<<(ostream& out, const bluestore_cnode_t& l); -class AllocExtent; -typedef mempool::bluestore_alloc::vector AllocExtentVector; -class AllocExtent { -public: - uint64_t offset; - uint32_t length; - - AllocExtent() { - offset = 0; - length = 0; - } - - AllocExtent(int64_t off, int32_t len) : offset(off), length(len) { } - uint64_t end() const { - return offset + length; - } - bool operator==(const AllocExtent& other) const { - return offset == other.offset && length == other.length; - } -}; - -inline static ostream& operator<<(ostream& out, const AllocExtent& e) { - return out << "0x" << std::hex << e.offset << "~" << e.length << std::dec; -} - -class ExtentList { - AllocExtentVector *m_extents; - int64_t m_block_size; - int64_t m_max_blocks; +template +struct bluestore_interval_t +{ + static const uint64_t INVALID_OFFSET = ~0ull; -public: - void init(AllocExtentVector *extents, int64_t block_size, - uint64_t max_alloc_size) { - m_extents = extents; - m_block_size = block_size; - m_max_blocks = max_alloc_size / block_size; - assert(m_extents->empty()); - } + OFFS_TYPE offset = 0; + LEN_TYPE length = 0; - ExtentList(AllocExtentVector *extents, int64_t block_size) { - init(extents, block_size, 0); - } + bluestore_interval_t(){} + bluestore_interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {} - ExtentList(AllocExtentVector *extents, int64_t block_size, - uint64_t max_alloc_size) { - init(extents, block_size, max_alloc_size); - } - - void reset() { - m_extents->clear(); + bool is_valid() const { + return offset != INVALID_OFFSET; } - - void add_extents(int64_t start, int64_t count); - - AllocExtentVector *get_extents() { - return m_extents; + uint64_t end() const { + return offset != INVALID_OFFSET ? 
offset + length : INVALID_OFFSET; } - std::pair get_nth_extent(int index) { - return std::make_pair - ((*m_extents)[index].offset / m_block_size, - (*m_extents)[index].length / m_block_size); + bool operator==(const bluestore_interval_t& other) const { + return offset == other.offset && length == other.length; } - int64_t get_extent_count() { - return m_extents->size(); - } }; - /// pextent: physical extent -struct bluestore_pextent_t : public AllocExtent { - const static uint64_t INVALID_OFFSET = ~0ull; - - bluestore_pextent_t() : AllocExtent() {} - bluestore_pextent_t(uint64_t o, uint64_t l) : AllocExtent(o, l) {} - bluestore_pextent_t(const AllocExtent &ext) : - AllocExtent(ext.offset, ext.length) { } - - bluestore_pextent_t& operator=(const AllocExtent &ext) { - offset = ext.offset; - length = ext.length; - return *this; - } - bool is_valid() const { - return offset != INVALID_OFFSET; - } +struct bluestore_pextent_t : public bluestore_interval_t +{ + bluestore_pextent_t() {} + bluestore_pextent_t(uint64_t o, uint64_t l) : bluestore_interval_t(o, l) {} + bluestore_pextent_t(const bluestore_interval_t &ext) : + bluestore_interval_t(ext.offset, ext.length) {} DENC(bluestore_pextent_t, v, p) { denc_lba(v.offset, p); @@ -201,7 +145,6 @@ } }; - /// extent_map: a map of reference counted extents struct bluestore_extent_ref_map_t { struct record_t { @@ -890,7 +833,7 @@ } void split(uint32_t blob_offset, bluestore_blob_t& rb); - void allocated(uint32_t b_off, uint32_t length, const AllocExtentVector& allocs); + void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs); void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only /// updates blob's pextents container and return unused pextents eligible diff -Nru ceph-12.2.11/src/os/bluestore/fastbmap_allocator_impl.cc ceph-12.2.12/src/os/bluestore/fastbmap_allocator_impl.cc --- ceph-12.2.11/src/os/bluestore/fastbmap_allocator_impl.cc 1970-01-01 00:00:00.000000000 +0000 +++ 
ceph-12.2.12/src/os/bluestore/fastbmap_allocator_impl.cc 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,544 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Bitmap based in-memory allocator implementation. + * Author: Igor Fedotov, ifedotov@suse.com + * + */ + +#include "fastbmap_allocator_impl.h" + +uint64_t AllocatorLevel::l0_dives = 0; +uint64_t AllocatorLevel::l0_iterations = 0; +uint64_t AllocatorLevel::l0_inner_iterations = 0; +uint64_t AllocatorLevel::alloc_fragments = 0; +uint64_t AllocatorLevel::alloc_fragments_fast = 0; +uint64_t AllocatorLevel::l2_allocs = 0; + +inline interval_t _align2units(uint64_t offset, uint64_t len, uint64_t min_length) +{ + interval_t res; + if (len >= min_length) { + res.offset = P2ROUNDUP(offset, min_length); + auto delta_off = res.offset - offset; + if (len > delta_off) { + res.length = len - delta_off; + res.length = P2ALIGN(res.length, min_length); + if (res.length) { + return res; + } + } + } + return interval_t(); +} + +interval_t AllocatorLevel01Loose::_get_longest_from_l0(uint64_t pos0, + uint64_t pos1, uint64_t min_length, interval_t* tail) const +{ + interval_t res; + if (pos0 >= pos1) { + return res; + } + auto pos = pos0; + + interval_t res_candidate; + if (tail->length != 0) { + assert((tail->offset % l0_granularity) == 0); + assert((tail->length % l0_granularity) == 0); + res_candidate.offset = tail->offset / l0_granularity; + res_candidate.length = tail->length / l0_granularity; + } + *tail = interval_t(); + + auto d = bits_per_slot; + slot_t bits = l0[pos / d]; + bits >>= pos % d; + bool end_loop = false; + auto min_granules = min_length / l0_granularity; + + do { + if ((pos % d) == 0) { + bits = l0[pos / d]; + if (pos1 - pos >= d) { + switch(bits) { + case all_slot_set: + // slot is totally free + if (!res_candidate.length) { + res_candidate.offset = pos; + } + res_candidate.length += d; + pos += d; + end_loop = pos >= pos1; + if (end_loop) { + *tail 
= res_candidate; + res_candidate = _align2units(res_candidate.offset, + res_candidate.length, min_granules); + if(res.length < res_candidate.length) { + res = res_candidate; + } + } + continue; + case all_slot_clear: + // slot is totally allocated + res_candidate = _align2units(res_candidate.offset, + res_candidate.length, min_granules); + if (res.length < res_candidate.length) { + res = res_candidate; + } + res_candidate = interval_t(); + pos += d; + end_loop = pos >= pos1; + continue; + } + } + } //if ((pos % d) == 0) + + end_loop = ++pos >= pos1; + if (bits & 1) { + // item is free + if (!res_candidate.length) { + res_candidate.offset = pos - 1; + } + ++res_candidate.length; + if (end_loop) { + *tail = res_candidate; + res_candidate = _align2units(res_candidate.offset, + res_candidate.length, min_granules); + if (res.length < res_candidate.length) { + res = res_candidate; + } + } + } else { + res_candidate = _align2units(res_candidate.offset, + res_candidate.length, min_granules); + if (res.length < res_candidate.length) { + res = res_candidate; + } + res_candidate = interval_t(); + } + bits >>= 1; + } while (!end_loop); + res.offset *= l0_granularity; + res.length *= l0_granularity; + tail->offset *= l0_granularity; + tail->length *= l0_granularity; + return res; +} + +void AllocatorLevel01Loose::_analyze_partials(uint64_t pos_start, + uint64_t pos_end, uint64_t length, uint64_t min_length, int mode, + search_ctx_t* ctx) +{ + auto d = CHILD_PER_SLOT; + assert((pos_start % d) == 0); + assert((pos_end % d) == 0); + + uint64_t l0_w = slotset_width * CHILD_PER_SLOT_L0; + + uint64_t l1_pos = pos_start; + const interval_t empty_tail; + interval_t prev_tail; + + uint64_t next_free_l1_pos = 0; + for (auto pos = pos_start / d; pos < pos_end / d; ++pos) { + slot_t slot_val = l1[pos]; + // FIXME minor: code below can be optimized to check slot_val against + // all_slot_set(_clear) value + + for (auto c = 0; c < d; c++) { + switch (slot_val & L1_ENTRY_MASK) { + case 
L1_ENTRY_FREE: + prev_tail = empty_tail; + if (!ctx->free_count) { + ctx->free_l1_pos = l1_pos; + } else if (l1_pos != next_free_l1_pos){ + auto o = ctx->free_l1_pos * l1_granularity; + auto l = ctx->free_count * l1_granularity; + // check if already found extent fits min_length after alignment + if (_align2units(o, l, min_length).length >= min_length) { + break; + } + // if not - proceed with the next one + ctx->free_l1_pos = l1_pos; + ctx->free_count = 0; + } + next_free_l1_pos = l1_pos + 1; + ++ctx->free_count; + if (mode == STOP_ON_EMPTY) { + return; + } + break; + case L1_ENTRY_FULL: + prev_tail = empty_tail; + break; + case L1_ENTRY_PARTIAL: + interval_t longest; + ++ctx->partial_count; + + longest = _get_longest_from_l0(l1_pos * l0_w, (l1_pos + 1) * l0_w, min_length, &prev_tail); + + if (longest.length >= length) { + if ((ctx->affordable_len == 0) || + ((ctx->affordable_len != 0) && + (longest.length < ctx->affordable_len))) { + ctx->affordable_len = longest.length; + ctx->affordable_offs = longest.offset; + } + } + if (longest.length >= min_length && + (ctx->min_affordable_len == 0 || + (longest.length < ctx->min_affordable_len))) { + + ctx->min_affordable_len = P2ALIGN(longest.length, min_length); + ctx->min_affordable_offs = longest.offset; + } + if (mode == STOP_ON_PARTIAL) { + return; + } + break; + } + slot_val >>= L1_ENTRY_WIDTH; + ++l1_pos; + } + } + ctx->fully_processed = true; +} + +void AllocatorLevel01Loose::_mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end) +{ + if (l0_pos == l0_pos_end) { + return; + } + auto d0 = bits_per_slotset; + uint64_t l1_w = CHILD_PER_SLOT; + // this should be aligned with slotset boundaries + assert(0 == (l0_pos % d0)); + assert(0 == (l0_pos_end % d0)); + + int64_t idx = l0_pos / bits_per_slot; + int64_t idx_end = l0_pos_end / bits_per_slot; + slot_t mask_to_apply = L1_ENTRY_NOT_USED; + + auto l1_pos = l0_pos / d0; + + while (idx < idx_end) { + if (l0[idx] == all_slot_clear) { + // if not all prev slots are allocated 
then no need to check the + // current slot set, it's partial + ++idx; + if (mask_to_apply == L1_ENTRY_NOT_USED) { + mask_to_apply = L1_ENTRY_FULL; + } else if (mask_to_apply != L1_ENTRY_FULL) { + idx = P2ROUNDUP(idx, int64_t(slotset_width)); + mask_to_apply = L1_ENTRY_PARTIAL; + } + } else if (l0[idx] == all_slot_set) { + // if not all prev slots are free then no need to check the + // current slot set, it's partial + ++idx; + if (mask_to_apply == L1_ENTRY_NOT_USED) { + mask_to_apply = L1_ENTRY_FREE; + } else if (mask_to_apply != L1_ENTRY_FREE) { + idx = P2ROUNDUP(idx, int64_t(slotset_width)); + mask_to_apply = L1_ENTRY_PARTIAL; + } + } else { + // no need to check the current slot set, it's partial + mask_to_apply = L1_ENTRY_PARTIAL; + ++idx; + idx = P2ROUNDUP(idx, int64_t(slotset_width)); + } + if ((idx % slotset_width) == 0) { + assert(mask_to_apply != L1_ENTRY_NOT_USED); + uint64_t shift = (l1_pos % l1_w) * L1_ENTRY_WIDTH; + slot_t& slot_val = l1[l1_pos / l1_w]; + auto mask = slot_t(L1_ENTRY_MASK) << shift; + + slot_t old_mask = (slot_val & mask) >> shift; + switch(old_mask) { + case L1_ENTRY_FREE: + unalloc_l1_count--; + break; + case L1_ENTRY_PARTIAL: + partial_l1_count--; + break; + } + slot_val &= ~mask; + slot_val |= slot_t(mask_to_apply) << shift; + switch(mask_to_apply) { + case L1_ENTRY_FREE: + unalloc_l1_count++; + break; + case L1_ENTRY_PARTIAL: + partial_l1_count++; + break; + } + mask_to_apply = L1_ENTRY_NOT_USED; + ++l1_pos; + } + } +} + +void AllocatorLevel01Loose::_mark_alloc_l0(int64_t l0_pos_start, + int64_t l0_pos_end) +{ + auto d0 = CHILD_PER_SLOT_L0; + + int64_t pos = l0_pos_start; + slot_t bits = (slot_t)1 << (l0_pos_start % d0); + + while (pos < std::min(l0_pos_end, (int64_t)P2ROUNDUP(l0_pos_start, d0))) { + l0[pos / d0] &= ~bits; + bits <<= 1; + pos++; + } + + while (pos < std::min(l0_pos_end, (int64_t)P2ALIGN(l0_pos_end, d0))) { + l0[pos / d0] = all_slot_clear; + pos += d0; + } + bits = 1; + while (pos < l0_pos_end) { + l0[pos / d0] &= 
~bits; + bits <<= 1; + pos++; + } +} + +interval_t AllocatorLevel01Loose::_allocate_l1_contiguous(uint64_t length, + uint64_t min_length, uint64_t max_length, + uint64_t pos_start, uint64_t pos_end) +{ + interval_t res = { 0, 0 }; + uint64_t l0_w = slotset_width * CHILD_PER_SLOT_L0; + + if (unlikely(length <= l0_granularity)) { + search_ctx_t ctx; + _analyze_partials(pos_start, pos_end, l0_granularity, l0_granularity, + STOP_ON_PARTIAL, &ctx); + + // check partially free slot sets first (including neighboring), + // full length match required. + if (ctx.affordable_len) { + // allocate as specified + assert(ctx.affordable_len >= length); + auto pos = ctx.affordable_offs / l0_granularity; + _mark_alloc_l1_l0(pos, pos + 1); + res = interval_t(ctx.affordable_offs, length); + return res; + } + + // allocate from free slot sets + if (ctx.free_count) { + auto l = std::min(length, ctx.free_count * l1_granularity); + assert((l % l0_granularity) == 0); + auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity; + + _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end); + res = interval_t(ctx.free_l1_pos * l1_granularity, l); + return res; + } + } else if (unlikely(length == l1_granularity)) { + search_ctx_t ctx; + _analyze_partials(pos_start, pos_end, length, min_length, STOP_ON_EMPTY, &ctx); + + // allocate using contiguous extent found at l1 if any + if (ctx.free_count) { + + auto l = std::min(length, ctx.free_count * l1_granularity); + assert((l % l0_granularity) == 0); + auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity; + + _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end); + res = interval_t(ctx.free_l1_pos * l1_granularity, l); + + return res; + } + + // we can terminate earlier on free entry only + assert(ctx.fully_processed); + + // check partially free slot sets first (including neighboring), + // full length match required. 
+ if (ctx.affordable_len) { + assert(ctx.affordable_len >= length); + assert((length % l0_granularity) == 0); + auto pos_start = ctx.affordable_offs + length / l0_granularity; + auto pos_end = (ctx.affordable_offs + length) / l0_granularity; + _mark_alloc_l1_l0(pos_start, pos_end); + res = interval_t(ctx.affordable_offs, length); + return res; + } + if (ctx.min_affordable_len) { + auto pos_start = ctx.min_affordable_offs / l0_granularity; + auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity; + _mark_alloc_l1_l0(pos_start, pos_end); + return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len); + } + } else { + search_ctx_t ctx; + _analyze_partials(pos_start, pos_end, length, min_length, NO_STOP, &ctx); + assert(ctx.fully_processed); + // check partially free slot sets first (including neighboring), + // full length match required. + if (ctx.affordable_len) { + assert(ctx.affordable_len >= length); + assert((length % l0_granularity) == 0); + auto pos_start = ctx.affordable_offs / l0_granularity; + auto pos_end = (ctx.affordable_offs + length) / l0_granularity; + _mark_alloc_l1_l0(pos_start, pos_end); + res = interval_t(ctx.affordable_offs, length); + return res; + } + // allocate using contiguous extent found at l1 if affordable + // align allocated extent with min_length + if (ctx.free_count) { + auto o = ctx.free_l1_pos * l1_granularity; + auto l = ctx.free_count * l1_granularity; + interval_t aligned_extent = _align2units(o, l, min_length); + if (aligned_extent.length > 0) { + aligned_extent.length = std::min(length, + uint64_t(aligned_extent.length)); + assert((aligned_extent.offset % l0_granularity) == 0); + assert((aligned_extent.length % l0_granularity) == 0); + + auto pos_start = aligned_extent.offset / l0_granularity; + auto pos_end = (aligned_extent.offset + aligned_extent.length) / l0_granularity; + + _mark_alloc_l1_l0(pos_start, pos_end); + return aligned_extent; + } + } + if (ctx.min_affordable_len) { + auto 
pos_start = ctx.min_affordable_offs / l0_granularity; + auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity; + _mark_alloc_l1_l0(pos_start, pos_end); + return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len); + } + } + return res; +} + +bool AllocatorLevel01Loose::_allocate_l1(uint64_t length, + uint64_t min_length, uint64_t max_length, + uint64_t l1_pos_start, uint64_t l1_pos_end, + uint64_t* allocated, + interval_vector_t* res) +{ + uint64_t d0 = CHILD_PER_SLOT_L0; + uint64_t d1 = CHILD_PER_SLOT; + + assert(0 == (l1_pos_start % (slotset_width * d1))); + assert(0 == (l1_pos_end % (slotset_width * d1))); + if (min_length != l0_granularity) { + // probably not the most effecient way but + // don't care much about that at the moment + bool has_space = true; + while (length > *allocated && has_space) { + interval_t i = + _allocate_l1_contiguous(length - *allocated, min_length, max_length, + l1_pos_start, l1_pos_end); + if (i.length == 0) { + has_space = false; + } else { + _fragment_and_emplace(max_length, i.offset, i.length, res); + *allocated += i.length; + } + } + } else { + uint64_t l0_w = slotset_width * d0; + + for (auto idx = l1_pos_start / d1; + idx < l1_pos_end / d1 && length > *allocated; + ++idx) { + slot_t& slot_val = l1[idx]; + if (slot_val == all_slot_clear) { + continue; + } else if (slot_val == all_slot_set) { + uint64_t to_alloc = std::min(length - *allocated, + l1_granularity * d1); + *allocated += to_alloc; + ++alloc_fragments_fast; + _fragment_and_emplace(max_length, idx * d1 * l1_granularity, to_alloc, + res); + _mark_alloc_l1_l0(idx * d1 * bits_per_slotset, + idx * d1 * bits_per_slotset + to_alloc / l0_granularity); + continue; + } + auto free_pos = find_next_set_bit(slot_val, 0); + assert(free_pos < bits_per_slot); + do { + assert(length > *allocated); + + bool empty; + empty = _allocate_l0(length, max_length, + (idx * d1 + free_pos / L1_ENTRY_WIDTH) * l0_w, + (idx * d1 + free_pos / L1_ENTRY_WIDTH + 1) * 
l0_w, + allocated, + res); + + auto mask = slot_t(L1_ENTRY_MASK) << free_pos; + + slot_t old_mask = (slot_val & mask) >> free_pos; + switch(old_mask) { + case L1_ENTRY_FREE: + unalloc_l1_count--; + break; + case L1_ENTRY_PARTIAL: + partial_l1_count--; + break; + } + slot_val &= ~mask; + if (empty) { + // the next line is no op with the current L1_ENTRY_FULL but left + // as-is for the sake of uniformity and to avoid potential errors + // in future + slot_val |= slot_t(L1_ENTRY_FULL) << free_pos; + } else { + slot_val |= slot_t(L1_ENTRY_PARTIAL) << free_pos; + partial_l1_count++; + } + if (length <= *allocated || slot_val == all_slot_clear) { + break; + } + free_pos = find_next_set_bit(slot_val, free_pos + L1_ENTRY_WIDTH); + } while (free_pos < bits_per_slot); + } + } + return _is_empty_l1(l1_pos_start, l1_pos_end); +} + +void AllocatorLevel01Loose::collect_stats( + std::map& bins_overall) +{ + size_t free_seq_cnt = 0; + for (auto slot : l0) { + if (slot == all_slot_set) { + free_seq_cnt += CHILD_PER_SLOT_L0; + } else if(slot != all_slot_clear) { + size_t pos = 0; + do { + auto pos1 = find_next_set_bit(slot, pos); + if (pos1 == pos) { + free_seq_cnt++; + pos = pos1 + 1; + } else { + if (free_seq_cnt) { + bins_overall[cbits(free_seq_cnt) - 1]++; + free_seq_cnt = 0; + } + if (pos1 < bits_per_slot) { + free_seq_cnt = 1; + } + pos = pos1 + 1; + } + } while (pos < bits_per_slot); + } else if (free_seq_cnt) { + bins_overall[cbits(free_seq_cnt) - 1]++; + free_seq_cnt = 0; + } + } + if (free_seq_cnt) { + bins_overall[cbits(free_seq_cnt) - 1]++; + } +} diff -Nru ceph-12.2.11/src/os/bluestore/fastbmap_allocator_impl.h ceph-12.2.12/src/os/bluestore/fastbmap_allocator_impl.h --- ceph-12.2.11/src/os/bluestore/fastbmap_allocator_impl.h 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/fastbmap_allocator_impl.h 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,774 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 
smarttab +/* + * Bitmap based in-memory allocator implementation. + * Author: Igor Fedotov, ifedotov@suse.com + * + */ + +#ifndef __FAST_BITMAP_ALLOCATOR_IMPL_H +#define __FAST_BITMAP_ALLOCATOR_IMPL_H +#include +#include "include/intarith.h" + +#include +#include +#include + +typedef uint64_t slot_t; + +#ifdef NON_CEPH_BUILD +#include +struct interval_t +{ + uint64_t offset = 0; + uint64_t length = 0; + + interval_t() {} + interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {} + interval_t(const interval_t &ext) : + offset(ext.offset), length(ext.length) {} +}; +typedef std::vector interval_vector_t; +typedef std::vector slot_vector_t; +#else +#include "include/assert.h" +#include "common/likely.h" +#include "os/bluestore/bluestore_types.h" +#include "include/mempool.h" + +typedef bluestore_interval_t interval_t; +typedef PExtentVector interval_vector_t; + +typedef mempool::bluestore_alloc::vector slot_vector_t; + +#endif + +// fitting into cache line on x86_64 +static const size_t slotset_width = 8; // 8 slots per set +static const size_t slotset_bytes = sizeof(slot_t) * slotset_width; +static const size_t bits_per_slot = sizeof(slot_t) * 8; +static const size_t bits_per_slotset = slotset_bytes * 8; +static const slot_t all_slot_set = 0xffffffffffffffff; +static const slot_t all_slot_clear = 0; + +inline size_t find_next_set_bit(slot_t slot_val, size_t start_pos) +{ +#ifdef __GNUC__ + if (start_pos == 0) { + start_pos = __builtin_ffsll(slot_val); + return start_pos ? 
start_pos - 1 : bits_per_slot; + } +#endif + slot_t mask = slot_t(1) << start_pos; + while (start_pos < bits_per_slot && !(slot_val & mask)) { + mask <<= 1; + ++start_pos; + } + return start_pos; +} + + +class AllocatorLevel +{ +protected: + + virtual uint64_t _children_per_slot() const = 0; + virtual uint64_t _level_granularity() const = 0; + +public: + static uint64_t l0_dives; + static uint64_t l0_iterations; + static uint64_t l0_inner_iterations; + static uint64_t alloc_fragments; + static uint64_t alloc_fragments_fast; + static uint64_t l2_allocs; + + virtual ~AllocatorLevel() + {} + + virtual void collect_stats( + std::map& bins_overall) = 0; + +}; + +class AllocatorLevel01 : public AllocatorLevel +{ +protected: + slot_vector_t l0; // set bit means free entry + slot_vector_t l1; + uint64_t l0_granularity = 0; // space per entry + uint64_t l1_granularity = 0; // space per entry + + size_t partial_l1_count = 0; + size_t unalloc_l1_count = 0; + + double get_fragmentation() const { + double res = 0.0; + auto total = unalloc_l1_count + partial_l1_count; + if (total) { + res = double(partial_l1_count) / double(total); + } + return res; + } + + uint64_t _level_granularity() const override + { + return l1_granularity; + } + + inline bool _is_slot_fully_allocated(uint64_t idx) const { + return l1[idx] == all_slot_clear; + } +public: + inline uint64_t get_min_alloc_size() const + { + return l0_granularity; + } + +}; + +template +class AllocatorLevel02; + +class AllocatorLevel01Loose : public AllocatorLevel01 +{ + enum { + L1_ENTRY_WIDTH = 2, + L1_ENTRY_MASK = (1 << L1_ENTRY_WIDTH) - 1, + L1_ENTRY_FULL = 0x00, + L1_ENTRY_PARTIAL = 0x01, + L1_ENTRY_NOT_USED = 0x02, + L1_ENTRY_FREE = 0x03, + CHILD_PER_SLOT = bits_per_slot / L1_ENTRY_WIDTH, // 32 + CHILD_PER_SLOT_L0 = bits_per_slot, // 64 + }; + uint64_t _children_per_slot() const override + { + return CHILD_PER_SLOT; + } + + interval_t _get_longest_from_l0(uint64_t pos0, uint64_t pos1, + uint64_t min_length, interval_t* 
tail) const; + + inline void _fragment_and_emplace(uint64_t max_length, uint64_t offset, + uint64_t len, + interval_vector_t* res) + { + auto it = res->rbegin(); + if (max_length) { + if (it != res->rend() && it->offset + it->length == offset) { + auto l = max_length - it->length; + if (l >= len) { + it->length += len; + return; + } else { + offset += l; + len -= l; + it->length += l; + } + } + + while (len > max_length) { + res->emplace_back(offset, max_length); + offset += max_length; + len -= max_length; + } + res->emplace_back(offset, len); + return; + } + + if (it != res->rend() && it->offset + it->length == offset) { + it->length += len; + } else { + res->emplace_back(offset, len); + } + } + + bool _allocate_l0(uint64_t length, + uint64_t max_length, + uint64_t l0_pos0, uint64_t l0_pos1, + uint64_t* allocated, + interval_vector_t* res) + { + uint64_t d0 = CHILD_PER_SLOT_L0; + + ++l0_dives; + + assert(l0_pos0 < l0_pos1); + assert(length > *allocated); + assert(0 == (l0_pos0 % (slotset_width * d0))); + assert(0 == (l0_pos1 % (slotset_width * d0))); + assert(((length - *allocated) % l0_granularity) == 0); + + uint64_t need_entries = (length - *allocated) / l0_granularity; + + for (auto idx = l0_pos0 / d0; (idx < l0_pos1 / d0) && (length > *allocated); + ++idx) { + ++l0_iterations; + slot_t& slot_val = l0[idx]; + auto base = idx * d0; + if (slot_val == all_slot_clear) { + continue; + } else if (slot_val == all_slot_set) { + uint64_t to_alloc = std::min(need_entries, d0); + *allocated += to_alloc * l0_granularity; + ++alloc_fragments; + need_entries -= to_alloc; + + _fragment_and_emplace(max_length, base * l0_granularity, + to_alloc * l0_granularity, res); + + if (to_alloc == d0) { + slot_val = all_slot_clear; + } else { + _mark_alloc_l0(base, base + to_alloc); + } + continue; + } + + auto free_pos = find_next_set_bit(slot_val, 0); + assert(free_pos < bits_per_slot); + auto next_pos = free_pos + 1; + while (next_pos < bits_per_slot && + (next_pos - free_pos) < 
need_entries) { + ++l0_inner_iterations; + + if (0 == (slot_val & (slot_t(1) << next_pos))) { + auto to_alloc = (next_pos - free_pos); + *allocated += to_alloc * l0_granularity; + ++alloc_fragments; + need_entries -= to_alloc; + _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity, + to_alloc * l0_granularity, res); + _mark_alloc_l0(base + free_pos, base + next_pos); + free_pos = find_next_set_bit(slot_val, next_pos + 1); + next_pos = free_pos + 1; + } else { + ++next_pos; + } + } + if (need_entries && free_pos < bits_per_slot) { + auto to_alloc = std::min(need_entries, d0 - free_pos); + *allocated += to_alloc * l0_granularity; + ++alloc_fragments; + need_entries -= to_alloc; + _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity, + to_alloc * l0_granularity, res); + _mark_alloc_l0(base + free_pos, base + free_pos + to_alloc); + } + } + return _is_empty_l0(l0_pos0, l0_pos1); + } + +protected: + + friend class AllocatorLevel02; + + void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true) + { + l0_granularity = _alloc_unit; + // 512 bits at L0 mapped to L1 entry + l1_granularity = l0_granularity * bits_per_slotset; + + // capacity to have slot alignment at l1 + auto aligned_capacity = + P2ROUNDUP((int64_t)capacity, + int64_t(l1_granularity * slotset_width * _children_per_slot())); + size_t slot_count = + aligned_capacity / l1_granularity / _children_per_slot(); + // we use set bit(s) as a marker for (partially) free entry + l1.resize(slot_count, mark_as_free ? all_slot_set : all_slot_clear); + + // l0 slot count + size_t slot_count_l0 = aligned_capacity / _alloc_unit / bits_per_slot; + // we use set bit(s) as a marker for (partially) free entry + l0.resize(slot_count_l0, mark_as_free ? 
all_slot_set : all_slot_clear); + + partial_l1_count = unalloc_l1_count = 0; + if (mark_as_free) { + unalloc_l1_count = slot_count * _children_per_slot(); + auto l0_pos_no_use = P2ROUNDUP((int64_t)capacity, (int64_t)l0_granularity) / l0_granularity; + _mark_alloc_l1_l0(l0_pos_no_use, aligned_capacity / l0_granularity); + } + } + + struct search_ctx_t + { + size_t partial_count = 0; + size_t free_count = 0; + uint64_t free_l1_pos = 0; + + uint64_t min_affordable_len = 0; + uint64_t min_affordable_offs = 0; + uint64_t affordable_len = 0; + uint64_t affordable_offs = 0; + + bool fully_processed = false; + + void reset() + { + *this = search_ctx_t(); + } + }; + enum { + NO_STOP, + STOP_ON_EMPTY, + STOP_ON_PARTIAL, + }; + void _analyze_partials(uint64_t pos_start, uint64_t pos_end, + uint64_t length, uint64_t min_length, int mode, + search_ctx_t* ctx); + + void _mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end); + void _mark_alloc_l0(int64_t l0_pos_start, int64_t l0_pos_end); + + void _mark_alloc_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end) + { + _mark_alloc_l0(l0_pos_start, l0_pos_end); + l0_pos_start = P2ALIGN(l0_pos_start, int64_t(bits_per_slotset)); + l0_pos_end = P2ROUNDUP(l0_pos_end, int64_t(bits_per_slotset)); + _mark_l1_on_l0(l0_pos_start, l0_pos_end); + } + + void _mark_free_l0(int64_t l0_pos_start, int64_t l0_pos_end) + { + auto d0 = CHILD_PER_SLOT_L0; + + auto pos = l0_pos_start; + slot_t bits = (slot_t)1 << (l0_pos_start % d0); + slot_t& val_s = l0[pos / d0]; + int64_t pos_e = std::min(l0_pos_end, (int64_t)P2ROUNDUP(l0_pos_start + 1, d0)); + while (pos < pos_e) { + val_s |= bits; + bits <<= 1; + pos++; + } + pos_e = std::min(l0_pos_end, (int64_t)P2ALIGN(l0_pos_end, d0)); + auto idx = pos / d0; + while (pos < pos_e) { + l0[idx++] = all_slot_set; + pos += d0; + } + bits = 1; + uint64_t& val_e = l0[pos / d0]; + while (pos < l0_pos_end) { + val_e |= bits; + bits <<= 1; + pos++; + } + } + + void _mark_free_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end) + { + 
_mark_free_l0(l0_pos_start, l0_pos_end); + l0_pos_start = P2ALIGN(l0_pos_start, int64_t(bits_per_slotset)); + l0_pos_end = P2ROUNDUP(l0_pos_end, int64_t(bits_per_slotset)); + _mark_l1_on_l0(l0_pos_start, l0_pos_end); + } + + bool _is_empty_l0(uint64_t l0_pos, uint64_t l0_pos_end) + { + bool no_free = true; + uint64_t d = slotset_width * CHILD_PER_SLOT_L0; + assert(0 == (l0_pos % d)); + assert(0 == (l0_pos_end % d)); + + auto idx = l0_pos / CHILD_PER_SLOT_L0; + auto idx_end = l0_pos_end / CHILD_PER_SLOT_L0; + while (idx < idx_end && no_free) { + no_free = l0[idx] == all_slot_clear; + ++idx; + } + return no_free; + } + bool _is_empty_l1(uint64_t l1_pos, uint64_t l1_pos_end) + { + bool no_free = true; + uint64_t d = slotset_width * _children_per_slot(); + assert(0 == (l1_pos % d)); + assert(0 == (l1_pos_end % d)); + + auto idx = l1_pos / CHILD_PER_SLOT; + auto idx_end = l1_pos_end / CHILD_PER_SLOT; + while (idx < idx_end && no_free) { + no_free = _is_slot_fully_allocated(idx); + ++idx; + } + return no_free; + } + + interval_t _allocate_l1_contiguous(uint64_t length, + uint64_t min_length, uint64_t max_length, + uint64_t pos_start, uint64_t pos_end); + + bool _allocate_l1(uint64_t length, + uint64_t min_length, uint64_t max_length, + uint64_t l1_pos_start, uint64_t l1_pos_end, + uint64_t* allocated, + interval_vector_t* res); + + uint64_t _mark_alloc_l1(const interval_t& r) + { + uint64_t l0_pos_start = r.offset / l0_granularity; + uint64_t l0_pos_end = P2ROUNDUP(r.offset + r.length, l0_granularity) / l0_granularity; + _mark_alloc_l1_l0(l0_pos_start, l0_pos_end); + return l0_granularity * (l0_pos_end - l0_pos_start); + } + + uint64_t _free_l1(uint64_t offs, uint64_t len) + { + uint64_t l0_pos_start = offs / l0_granularity; + uint64_t l0_pos_end = P2ROUNDUP(offs + len, l0_granularity) / l0_granularity; + _mark_free_l1_l0(l0_pos_start, l0_pos_end); + return l0_granularity * (l0_pos_end - l0_pos_start); + } + +public: + uint64_t debug_get_allocated(uint64_t pos0 = 0, 
uint64_t pos1 = 0) + { + if (pos1 == 0) { + pos1 = l1.size() * CHILD_PER_SLOT; + } + auto avail = debug_get_free(pos0, pos1); + return (pos1 - pos0) * l1_granularity - avail; + } + + uint64_t debug_get_free(uint64_t l1_pos0 = 0, uint64_t l1_pos1 = 0) + { + assert(0 == (l1_pos0 % CHILD_PER_SLOT)); + assert(0 == (l1_pos1 % CHILD_PER_SLOT)); + + auto idx0 = l1_pos0 * slotset_width; + auto idx1 = l1_pos1 * slotset_width; + + if (idx1 == 0) { + idx1 = l0.size(); + } + + uint64_t res = 0; + for (uint64_t i = idx0; i < idx1; ++i) { + auto v = l0[i]; + if (v == all_slot_set) { + res += CHILD_PER_SLOT_L0; + } else if (v != all_slot_clear) { + size_t cnt = 0; +#ifdef __GNUC__ + cnt = __builtin_popcountll(v); +#else + // Kernighan's Alg to count set bits + while (v) { + v &= (v - 1); + cnt++; + } +#endif + res += cnt; + } + } + return res * l0_granularity; + } + void collect_stats( + std::map& bins_overall) override; +}; + +class AllocatorLevel01Compact : public AllocatorLevel01 +{ + uint64_t _children_per_slot() const override + { + return 8; + } +public: + void collect_stats( + std::map& bins_overall) override + { + // not implemented + } +}; + +template +class AllocatorLevel02 : public AllocatorLevel +{ +public: + uint64_t debug_get_free(uint64_t pos0 = 0, uint64_t pos1 = 0) + { + std::lock_guard l(lock); + return l1.debug_get_free(pos0 * l1._children_per_slot() * bits_per_slot, + pos1 * l1._children_per_slot() * bits_per_slot); + } + uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0) + { + std::lock_guard l(lock); + return l1.debug_get_allocated(pos0 * l1._children_per_slot() * bits_per_slot, + pos1 * l1._children_per_slot() * bits_per_slot); + } + + uint64_t get_available() + { + std::lock_guard l(lock); + return available; + } + inline uint64_t get_min_alloc_size() const + { + return l1.get_min_alloc_size(); + } + void collect_stats( + std::map& bins_overall) override { + + std::lock_guard l(lock); + l1.collect_stats(bins_overall); + } + +protected: + 
std::mutex lock; + L1 l1; + slot_vector_t l2; + uint64_t l2_granularity = 0; // space per entry + uint64_t available = 0; + uint64_t last_pos = 0; + + enum { + CHILD_PER_SLOT = bits_per_slot, // 64 + }; + + uint64_t _children_per_slot() const override + { + return CHILD_PER_SLOT; + } + uint64_t _level_granularity() const override + { + return l2_granularity; + } + + void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true) + { + assert(ISP2(_alloc_unit)); + l1._init(capacity, _alloc_unit, mark_as_free); + + l2_granularity = + l1._level_granularity() * l1._children_per_slot() * slotset_width; + + // capacity to have slot alignment at l2 + auto aligned_capacity = + P2ROUNDUP((int64_t)capacity, (int64_t)l2_granularity * CHILD_PER_SLOT); + size_t elem_count = aligned_capacity / l2_granularity / CHILD_PER_SLOT; + // we use set bit(s) as a marker for (partially) free entry + l2.resize(elem_count, mark_as_free ? all_slot_set : all_slot_clear); + + if (mark_as_free) { + // capacity to have slotset alignment at l1 + auto l2_pos_no_use = + P2ROUNDUP((int64_t)capacity, (int64_t)l2_granularity) / l2_granularity; + _mark_l2_allocated(l2_pos_no_use, aligned_capacity / l2_granularity); + available = P2ALIGN(capacity, _alloc_unit); + } else { + available = 0; + } + } + + void _mark_l2_allocated(int64_t l2_pos, int64_t l2_pos_end) + { + auto d = CHILD_PER_SLOT; + assert(0 <= l2_pos_end); + assert((int64_t)l2.size() >= (l2_pos_end / d)); + + while (l2_pos < l2_pos_end) { + l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d)); + ++l2_pos; + } + } + + void _mark_l2_free(int64_t l2_pos, int64_t l2_pos_end) + { + auto d = CHILD_PER_SLOT; + assert(0 <= l2_pos_end); + assert((int64_t)l2.size() >= (l2_pos_end / d)); + + while (l2_pos < l2_pos_end) { + l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d)); + ++l2_pos; + } + } + + void _mark_l2_on_l1(int64_t l2_pos, int64_t l2_pos_end) + { + auto d = CHILD_PER_SLOT; + assert(0 <= l2_pos_end); + assert((int64_t)l2.size() >= (l2_pos_end 
/ d)); + + auto idx = l2_pos * slotset_width; + auto idx_end = l2_pos_end * slotset_width; + bool all_allocated = true; + while (idx < idx_end) { + if (!l1._is_slot_fully_allocated(idx)) { + all_allocated = false; + idx = P2ROUNDUP(int64_t(++idx), int64_t(slotset_width)); + } + else { + ++idx; + } + if ((idx % slotset_width) == 0) { + if (all_allocated) { + l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d)); + } + else { + l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d)); + } + all_allocated = true; + ++l2_pos; + } + } + } + + void _allocate_l2(uint64_t length, + uint64_t min_length, + uint64_t max_length, + uint64_t hint, + + uint64_t* allocated, + interval_vector_t* res) + { + uint64_t prev_allocated = *allocated; + uint64_t d = CHILD_PER_SLOT; + assert(ISP2(min_length)); + assert(min_length <= l2_granularity); + assert(max_length == 0 || max_length >= min_length); + assert(max_length == 0 || (max_length % min_length) == 0); + assert(length >= min_length); + assert((length % min_length) == 0); + + uint64_t cap = 1ull << 31; + if (max_length == 0 || max_length >= cap) { + max_length = cap; + } + + uint64_t l1_w = slotset_width * l1._children_per_slot(); + + std::lock_guard l(lock); + + if (available < min_length) { + return; + } + if (hint != 0) { + last_pos = (hint / d) < l2.size() ? P2ALIGN(hint, d) : 0; + } + auto l2_pos = last_pos; + auto last_pos0 = last_pos; + auto pos = last_pos / d; + auto pos_end = l2.size(); + // outer loop below is intended to optimize the performance by + // avoiding 'modulo' operations inside the internal loop. 
+ // Looks like they have negative impact on the performance + for (auto i = 0; i < 2; ++i) { + for(; length > *allocated && pos < pos_end; ++pos) { + slot_t& slot_val = l2[pos]; + size_t free_pos = 0; + bool all_set = false; + if (slot_val == all_slot_clear) { + l2_pos += d; + last_pos = l2_pos; + continue; + } else if (slot_val == all_slot_set) { + free_pos = 0; + all_set = true; + } else { + free_pos = find_next_set_bit(slot_val, 0); + assert(free_pos < bits_per_slot); + } + do { + assert(length > *allocated); + bool empty = l1._allocate_l1(length, + min_length, + max_length, + (l2_pos + free_pos) * l1_w, + (l2_pos + free_pos + 1) * l1_w, + allocated, + res); + if (empty) { + slot_val &= ~(slot_t(1) << free_pos); + } + if (length <= *allocated || slot_val == all_slot_clear) { + break; + } + ++free_pos; + if (!all_set) { + free_pos = find_next_set_bit(slot_val, free_pos); + } + } while (free_pos < bits_per_slot); + last_pos = l2_pos; + l2_pos += d; + } + l2_pos = 0; + pos = 0; + pos_end = last_pos0 / d; + } + + ++l2_allocs; + auto allocated_here = *allocated - prev_allocated; + assert(available >= allocated_here); + available -= allocated_here; + } + +#ifndef NON_CEPH_BUILD + // to provide compatibility with BlueStore's allocator interface + void _free_l2(const interval_set & rr) + { + uint64_t released = 0; + std::lock_guard l(lock); + for (auto r : rr) { + released += l1._free_l1(r.first, r.second); + uint64_t l2_pos = r.first / l2_granularity; + uint64_t l2_pos_end = P2ROUNDUP(int64_t(r.first + r.second), int64_t(l2_granularity)) / l2_granularity; + + _mark_l2_free(l2_pos, l2_pos_end); + } + available += released; + } +#endif + + template + void _free_l2(const T& rr) + { + uint64_t released = 0; + std::lock_guard l(lock); + for (auto r : rr) { + released += l1._free_l1(r.offset, r.length); + uint64_t l2_pos = r.offset / l2_granularity; + uint64_t l2_pos_end = P2ROUNDUP(int64_t(r.offset + r.length), int64_t(l2_granularity)) / l2_granularity; + + 
_mark_l2_free(l2_pos, l2_pos_end); + } + available += released; + } + + void _mark_allocated(uint64_t o, uint64_t len) + { + uint64_t l2_pos = o / l2_granularity; + uint64_t l2_pos_end = P2ROUNDUP(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity; + + std::lock_guard l(lock); + auto allocated = l1._mark_alloc_l1(interval_t(o, len)); + assert(available >= allocated); + available -= allocated; + _mark_l2_on_l1(l2_pos, l2_pos_end); + } + + void _mark_free(uint64_t o, uint64_t len) + { + uint64_t l2_pos = o / l2_granularity; + uint64_t l2_pos_end = P2ROUNDUP(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity; + + std::lock_guard l(lock); + available += l1._free_l1(o, len); + _mark_l2_free(l2_pos, l2_pos_end); + } + void _shutdown() + { + last_pos = 0; + } + double _get_fragmentation() { + std::lock_guard l(lock); + return l1.get_fragmentation(); + } +}; + +#endif diff -Nru ceph-12.2.11/src/os/bluestore/KernelDevice.cc ceph-12.2.12/src/os/bluestore/KernelDevice.cc --- ceph-12.2.11/src/os/bluestore/KernelDevice.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/KernelDevice.cc 2019-04-11 12:33:50.000000000 +0000 @@ -697,7 +697,11 @@ int r = ::pread(buffered ? 
fd_buffered : fd_direct, p.c_str(), len, off); if (r < 0) { - r = -errno; + if (ioc->allow_eio && is_expected_ioerr(r)) { + r = -EIO; + } else { + r = -errno; + } goto out; } assert((uint64_t)r == len); diff -Nru ceph-12.2.11/src/os/bluestore/StupidAllocator.cc ceph-12.2.12/src/os/bluestore/StupidAllocator.cc --- ceph-12.2.11/src/os/bluestore/StupidAllocator.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/StupidAllocator.cc 2019-04-11 12:33:50.000000000 +0000 @@ -12,7 +12,6 @@ StupidAllocator::StupidAllocator(CephContext* cct) : cct(cct), num_free(0), - num_reserved(0), free(10), last_alloc(0) { @@ -48,28 +47,6 @@ } } -int StupidAllocator::reserve(uint64_t need) -{ - std::lock_guard l(lock); - dout(10) << __func__ << " need 0x" << std::hex << need - << " num_free 0x" << num_free - << " num_reserved 0x" << num_reserved << std::dec << dendl; - if ((int64_t)need > num_free - num_reserved) - return -ENOSPC; - num_reserved += need; - return 0; -} - -void StupidAllocator::unreserve(uint64_t unused) -{ - std::lock_guard l(lock); - dout(10) << __func__ << " unused 0x" << std::hex << unused - << " num_free 0x" << num_free - << " num_reserved 0x" << num_reserved << std::dec << dendl; - assert(num_reserved >= (int64_t)unused); - num_reserved -= unused; -} - /// return the effective length of the extent if we align to alloc_unit uint64_t StupidAllocator::_aligned_len( StupidAllocator::interval_set_t::iterator p, @@ -195,9 +172,7 @@ } num_free -= *length; - num_reserved -= *length; assert(num_free >= 0); - assert(num_reserved >= 0); last_alloc = *offset + *length; return 0; } @@ -207,7 +182,7 @@ uint64_t alloc_unit, uint64_t max_alloc_size, int64_t hint, - mempool::bluestore_alloc::vector *extents) + PExtentVector *extents) { uint64_t allocated_size = 0; uint64_t offset = 0; @@ -218,8 +193,6 @@ max_alloc_size = want_size; } - ExtentList block_list = ExtentList(extents, 1, max_alloc_size); - while (allocated_size < want_size) { res = 
allocate_int(MIN(max_alloc_size, (want_size - allocated_size)), alloc_unit, hint, &offset, &length); @@ -229,7 +202,19 @@ */ break; } - block_list.add_extents(offset, length); + bool can_append = true; + if (!extents->empty()) { + bluestore_pextent_t &last_extent = extents->back(); + if ((last_extent.end() == offset) && + ((last_extent.length + length) <= max_alloc_size)) { + can_append = false; + last_extent.length += length; + } + } + if (can_append) { + extents->emplace_back(bluestore_pextent_t(offset, length)); + } + allocated_size += length; hint = offset + length; } @@ -241,13 +226,19 @@ } void StupidAllocator::release( - uint64_t offset, uint64_t length) + const interval_set& release_set) { std::lock_guard l(lock); - dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length - << std::dec << dendl; - _insert_free(offset, length); - num_free += length; + for (interval_set::const_iterator p = release_set.begin(); + p != release_set.end(); + ++p) { + const auto offset = p.get_start(); + const auto length = p.get_len(); + ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length + << std::dec << dendl; + _insert_free(offset, length); + num_free += length; + } } uint64_t StupidAllocator::get_free() @@ -256,6 +247,31 @@ return num_free; } +double StupidAllocator::get_fragmentation(uint64_t alloc_unit) +{ + assert(alloc_unit); + double res; + uint64_t max_intervals = 0; + uint64_t intervals = 0; + { + std::lock_guard l(lock); + max_intervals = num_free / alloc_unit; + for (unsigned bin = 0; bin < free.size(); ++bin) { + intervals += free[bin].num_intervals(); + } + } + ldout(cct, 30) << __func__ << " " << intervals << "/" << max_intervals + << dendl; + assert(intervals <= max_intervals); + if (!intervals || max_intervals <= 1) { + return 0.0; + } + intervals--; + max_intervals--; + res = (double)intervals / max_intervals; + return res; +} + void StupidAllocator::dump() { std::lock_guard l(lock); diff -Nru 
ceph-12.2.11/src/os/bluestore/StupidAllocator.h ceph-12.2.12/src/os/bluestore/StupidAllocator.h --- ceph-12.2.11/src/os/bluestore/StupidAllocator.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/bluestore/StupidAllocator.h 2019-04-11 12:33:50.000000000 +0000 @@ -17,7 +17,6 @@ std::mutex lock; int64_t num_free; ///< total bytes in freelist - int64_t num_reserved; ///< reserved bytes typedef mempool::bluestore_alloc::pool_allocator< pair> allocator_t; @@ -38,21 +37,19 @@ StupidAllocator(CephContext* cct); ~StupidAllocator() override; - int reserve(uint64_t need) override; - void unreserve(uint64_t unused) override; - int64_t allocate( uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size, - int64_t hint, mempool::bluestore_alloc::vector *extents) override; + int64_t hint, PExtentVector *extents) override; int64_t allocate_int( uint64_t want_size, uint64_t alloc_unit, int64_t hint, uint64_t *offset, uint32_t *length); void release( - uint64_t offset, uint64_t length) override; + const interval_set& release_set) override; uint64_t get_free() override; + double get_fragmentation(uint64_t alloc_unit) override; void dump() override; diff -Nru ceph-12.2.11/src/os/CMakeLists.txt ceph-12.2.12/src/os/CMakeLists.txt --- ceph-12.2.11/src/os/CMakeLists.txt 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/CMakeLists.txt 2019-04-11 12:33:50.000000000 +0000 @@ -28,11 +28,11 @@ bluestore/BlueRocksEnv.cc bluestore/BlueStore.cc bluestore/bluestore_types.cc + bluestore/fastbmap_allocator_impl.cc bluestore/FreelistManager.cc bluestore/KernelDevice.cc bluestore/StupidAllocator.cc - bluestore/BitMapAllocator.cc - bluestore/BitAllocator.cc + bluestore/BitmapAllocator.cc bluestore/aio.cc ) endif(HAVE_LIBAIO) diff -Nru ceph-12.2.11/src/os/filestore/FileStore.cc ceph-12.2.12/src/os/filestore/FileStore.cc --- ceph-12.2.11/src/os/filestore/FileStore.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/filestore/FileStore.cc 2019-04-11 
12:33:50.000000000 +0000 @@ -2387,7 +2387,11 @@ } // and make sure our xattr is durable. - ::fsync(fd); + r = ::fsync(fd); + if (r < 0) { + derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } _inject_failure(); @@ -2456,7 +2460,11 @@ _inject_failure(); // first make sure the previous operation commits - ::fsync(fd); + int r = ::fsync(fd); + if (r < 0) { + derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } if (!in_progress) { // sync object_map too. even if this object has a header or keys, @@ -2471,7 +2479,7 @@ bufferlist v(40); ::encode(spos, v); ::encode(in_progress, v); - int r = chain_fsetxattr( + r = chain_fsetxattr( fd, REPLAY_GUARD_XATTR, v.c_str(), v.length()); if (r < 0) { derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl; @@ -2479,7 +2487,11 @@ } // and make sure our xattr is durable. - ::fsync(fd); + r = ::fsync(fd); + if (r < 0) { + derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } _inject_failure(); @@ -2529,7 +2541,11 @@ } // and make sure our xattr is durable. 
- ::fsync(fd); + r = ::fsync(fd); + if (r < 0) { + derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } _inject_failure(); diff -Nru ceph-12.2.11/src/os/filestore/LFNIndex.cc ceph-12.2.12/src/os/filestore/LFNIndex.cc --- ceph-12.2.11/src/os/filestore/LFNIndex.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/filestore/LFNIndex.cc 2019-04-11 12:33:50.000000000 +0000 @@ -29,6 +29,7 @@ #include "common/debug.h" #include "include/buffer.h" #include "common/ceph_crypto.h" +#include "common/errno.h" #include "include/compat.h" #include "chain_xattr.h" @@ -176,10 +177,11 @@ maybe_inject_failure(); int r = ::fsync(fd); maybe_inject_failure(); - if (r < 0) - return -errno; - else - return 0; + if (r < 0) { + derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + return 0; } int LFNIndex::link_object(const vector &from, diff -Nru ceph-12.2.11/src/os/filestore/WBThrottle.cc ceph-12.2.12/src/os/filestore/WBThrottle.cc --- ceph-12.2.11/src/os/filestore/WBThrottle.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/os/filestore/WBThrottle.cc 2019-04-11 12:33:50.000000000 +0000 @@ -5,6 +5,7 @@ #include "os/filestore/WBThrottle.h" #include "common/perf_counters.h" +#include "common/errno.h" WBThrottle::WBThrottle(CephContext *cct) : cur_ios(0), cur_size(0), @@ -166,10 +167,14 @@ logger->inc(l_wbthrottle_inodes_wb); lock.Unlock(); #ifdef HAVE_FDATASYNC - ::fdatasync(**wb.get<1>()); + int r = ::fdatasync(**wb.get<1>()); #else - ::fsync(**wb.get<1>()); + int r = ::fsync(**wb.get<1>()); #endif + if (r < 0) { + lderr(cct) << "WBThrottle fsync failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } #ifdef HAVE_POSIX_FADVISE if (cct->_conf->filestore_fadvise && wb.get<2>().nocache) { int fa_r = posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED); diff -Nru ceph-12.2.11/src/osd/ECBackend.cc ceph-12.2.12/src/osd/ECBackend.cc --- ceph-12.2.11/src/osd/ECBackend.cc 2019-01-30 
15:51:26.000000000 +0000 +++ ceph-12.2.12/src/osd/ECBackend.cc 2019-04-11 12:33:50.000000000 +0000 @@ -1836,13 +1836,12 @@ return false; } + op->using_cache = pipeline_state.caching_enabled(); + if (op->invalidates_cache()) { dout(20) << __func__ << ": invalidating cache after this op" << dendl; pipeline_state.invalidate(); - op->using_cache = false; - } else { - op->using_cache = pipeline_state.caching_enabled(); } waiting_state.pop_front(); diff -Nru ceph-12.2.11/src/osd/OSD.cc ceph-12.2.12/src/osd/OSD.cc --- ceph-12.2.11/src/osd/OSD.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/osd/OSD.cc 2019-04-11 12:33:50.000000000 +0000 @@ -2987,11 +2987,20 @@ r = admin_socket->register_command( "trigger_scrub", "trigger_scrub " \ - "name=pgid,type=CephString ", + "name=pgid,type=CephString " \ + "name=time,type=CephInt,req=false", test_ops_hook, "Trigger a scheduled scrub "); assert(r == 0); r = admin_socket->register_command( + "trigger_deep_scrub", + "trigger_deep_scrub " \ + "name=pgid,type=CephString " \ + "name=time,type=CephInt,req=false", + test_ops_hook, + "Trigger a scheduled deep scrub "); + ceph_assert(r == 0); + r = admin_socket->register_command( "injectfull", "injectfull " \ "name=type,type=CephString,req=false " \ @@ -5613,8 +5622,9 @@ << "to " << service->cct->_conf->osd_recovery_delay_start; return; } - if (command == "trigger_scrub") { + if (command == "trigger_scrub" || command == "trigger_deep_scrub") { spg_t pgid; + bool deep = (command == "trigger_deep_scrub"); OSDMapRef curmap = service->get_osdmap(); string pgidstr; @@ -5625,6 +5635,9 @@ return; } + int64_t time; + cmd_getval(service->cct, cmdmap, "time", time, (int64_t)0); + PG *pg = service->osd->_lookup_lock_pg(pgid); if (pg == nullptr) { ss << "Can't find pg " << pgid; @@ -5635,16 +5648,31 @@ pg->unreg_next_scrub(); const pg_pool_t *p = curmap->get_pg_pool(pgid.pool()); double pool_scrub_max_interval = 0; - p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval); - 
double scrub_max_interval = pool_scrub_max_interval > 0 ? - pool_scrub_max_interval : g_conf->osd_scrub_max_interval; + double scrub_max_interval; + if (deep) { + p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval); + scrub_max_interval = pool_scrub_max_interval > 0 ? + pool_scrub_max_interval : g_conf->osd_deep_scrub_interval; + } else { + p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval); + scrub_max_interval = pool_scrub_max_interval > 0 ? + pool_scrub_max_interval : g_conf->osd_scrub_max_interval; + } // Instead of marking must_scrub force a schedule scrub utime_t stamp = ceph_clock_now(); - stamp -= scrub_max_interval; - stamp -= 100.0; // push back last scrub more for good measure - pg->info.history.last_scrub_stamp = stamp; + if (time == 0) + stamp -= scrub_max_interval; + else + stamp -= (float)time; + stamp -= 100.0; // push back last scrub more for good measure + if (deep) { + pg->set_last_deep_scrub_stamp(stamp); + } else { + pg->set_last_scrub_stamp(stamp); + } pg->reg_next_scrub(); - ss << "ok"; + pg->publish_stats_to_osd(); + ss << "ok - set" << (deep ? 
" deep" : "" ) << " stamp " << stamp; } else { ss << "Not primary"; } @@ -9596,46 +9624,31 @@ void OSDService::adjust_pg_priorities(const vector& pgs, int newflags) { - if (!pgs.size() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY))) + if (!pgs.size() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY))) { return; - int newstate = 0; - + } + set did; if (newflags & OFR_BACKFILL) { - newstate = PG_STATE_FORCED_BACKFILL; + for (auto& pg : pgs) { + if (pg->set_force_backfill(!(newflags & OFR_CANCEL))) { + did.insert(pg->pg_id); + } + } } else if (newflags & OFR_RECOVERY) { - newstate = PG_STATE_FORCED_RECOVERY; - } - - // debug output here may get large, don't generate it if debug level is below - // 10 and use abbreviated pg ids otherwise - if ((cct)->_conf->subsys.should_gather(ceph_subsys_osd, 10)) { - stringstream ss; - - for (auto& i : pgs) { - ss << i->get_pgid() << " "; + for (auto& pg : pgs) { + if (pg->set_force_recovery(!(newflags & OFR_CANCEL))) { + did.insert(pg->pg_id); + } } - - dout(10) << __func__ << " working on " << ss.str() << dendl; } - - if (newflags & OFR_CANCEL) { - for (auto& i : pgs) { - i->lock(); - i->_change_recovery_force_mode(newstate, true); - i->unlock(); - } + if (did.empty()) { + dout(10) << __func__ << " " << ((newflags & OFR_CANCEL) ? "cleared" : "set") + << " force_" << ((newflags & OFR_BACKFILL) ? "backfill" : "recovery") + << " on no pgs" << dendl; } else { - for (auto& i : pgs) { - // make sure the PG is in correct state before forcing backfill or recovery, or - // else we'll make PG keeping FORCE_* flag forever, requiring osds restart - // or forcing somehow recovery/backfill. 
- i->lock(); - int pgstate = i->get_state(); - if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) || - ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING))) ) - i->_change_recovery_force_mode(newstate, false); - i->unlock(); - } + dout(10) << __func__ << " " << ((newflags & OFR_CANCEL) ? "cleared" : "set") + << " force_" << ((newflags & OFR_BACKFILL) ? "backfill" : "recovery") + << " on " << did << dendl; } } diff -Nru ceph-12.2.11/src/osd/OSDMap.cc ceph-12.2.12/src/osd/OSDMap.cc --- ceph-12.2.11/src/osd/OSDMap.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/osd/OSDMap.cc 2019-04-11 12:33:50.000000000 +0000 @@ -1642,7 +1642,30 @@ to_cancel.insert(pg); continue; } + vector raw_up; + int primary; + tmpmap.pg_to_raw_up(pg, &raw_up, &primary); + vector up; + up.reserve(raw_up.size()); + for (auto osd : raw_up) { + // skip non-existent/down osd for erasure-coded PGs + if (osd == CRUSH_ITEM_NONE) + continue; + up.push_back(osd); + } auto crush_rule = tmpmap.get_pg_pool_crush_rule(pg); + auto r = tmpmap.crush->verify_upmap(cct, + crush_rule, + tmpmap.get_pg_pool_size(pg), + up); + if (r < 0) { + ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg + << " returning " << r + << dendl; + to_cancel.insert(pg); + continue; + } + // below we check against crush-topology changing.. 
map weight_map; auto it = rule_weight_map.find(crush_rule); if (it == rule_weight_map.end()) { @@ -1656,43 +1679,10 @@ } else { weight_map = it->second; } - auto type = tmpmap.crush->get_rule_failure_domain(crush_rule); - if (type < 0) { - lderr(cct) << __func__ << " unable to load failure-domain-type of pg " - << pg << dendl; - continue; - } ldout(cct, 10) << __func__ << " pg " << pg - << " crush-rule-id " << crush_rule << " weight_map " << weight_map - << " failure-domain-type " << type << dendl; - vector raw; - int primary; - tmpmap.pg_to_raw_up(pg, &raw, &primary); - set parents; - for (auto osd : raw) { - // skip non-existent/down osd for erasure-coded PGs - if (osd == CRUSH_ITEM_NONE) - continue; - if (type > 0) { - auto parent = tmpmap.crush->get_parent_of_type(osd, type, crush_rule); - if (parent < 0) { - auto r = parents.insert(parent); - if (!r.second) { - // two up-set osds come from same parent - to_cancel.insert(pg); - break; - } - } else { - lderr(cct) << __func__ << " unable to get parent of raw osd." - << osd << " of pg " << pg - << dendl; - // continue to do checks below - } - } - // the above check validates collision only - // below we continue to check against crush-topology changing.. 
+ for (auto osd : up) { auto it = weight_map.find(osd); if (it == weight_map.end()) { // osd is gone or has been moved out of the specific crush-tree @@ -2263,6 +2253,17 @@ *primary = _pick_primary(*raw); } +void OSDMap::pg_to_raw_upmap(pg_t pg, vector *raw_upmap) const +{ + auto pool = get_pg_pool(pg.pool()); + if (!pool) { + raw_upmap->clear(); + return; + } + _pg_to_raw_osds(*pool, pg, raw_upmap, NULL); + _apply_upmap(*pool, pg, raw_upmap); +} + void OSDMap::pg_to_raw_up(pg_t pg, vector *up, int *primary) const { const pg_pool_t *pool = get_pg_pool(pg.pool()); @@ -3975,9 +3976,6 @@ if (rule < 0) return false; - // get original mapping - _pg_to_raw_osds(*pool, pg, orig, NULL); - // make sure there is something there to remap bool any = false; for (auto osd : *orig) { @@ -4008,202 +4006,476 @@ CephContext *cct, float max_deviation_ratio, int max, - const set& only_pools_orig, + const set& only_pools, OSDMap::Incremental *pending_inc) { - set only_pools; - if (only_pools_orig.empty()) { - for (auto& i : pools) { - only_pools.insert(i.first); - } - } else { - only_pools = only_pools_orig; - } + ldout(cct, 10) << __func__ << " pools " << only_pools << dendl; OSDMap tmp; tmp.deepish_copy_from(*this); - float start_deviation = 0; - float end_deviation = 0; int num_changed = 0; - while (true) { - map> pgs_by_osd; - int total_pgs = 0; - float osd_weight_total = 0; - map osd_weight; - for (auto& i : pools) { - if (!only_pools.empty() && !only_pools.count(i.first)) - continue; - for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) { - pg_t pg(ps, i.first); - vector up; - tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr); - for (auto osd : up) { - if (osd != CRUSH_ITEM_NONE) - pgs_by_osd[osd].insert(pg); - } - } - total_pgs += i.second.get_size() * i.second.get_pg_num(); - - map pmap; - int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(), - i.second.get_type(), - i.second.get_size()); - tmp.crush->get_rule_weight_osd_map(ruleno, &pmap); - ldout(cct,30) 
<< __func__ << " pool " << i.first << " ruleno " << ruleno << dendl; - for (auto p : pmap) { - auto adjusted_weight = tmp.get_weightf(p.first) * p.second; - if (adjusted_weight == 0) { - continue; - } - osd_weight[p.first] += adjusted_weight; - osd_weight_total += adjusted_weight; + map> pgs_by_osd; + int total_pgs = 0; + float osd_weight_total = 0; + map osd_weight; + for (auto& i : pools) { + if (!only_pools.empty() && !only_pools.count(i.first)) + continue; + for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) { + pg_t pg(ps, i.first); + vector up; + tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr); + ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl; + for (auto osd : up) { + if (osd != CRUSH_ITEM_NONE) + pgs_by_osd[osd].insert(pg); + } + } + total_pgs += i.second.get_size() * i.second.get_pg_num(); + + map pmap; + int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(), + i.second.get_type(), + i.second.get_size()); + tmp.crush->get_rule_weight_osd_map(ruleno, &pmap); + ldout(cct,20) << __func__ << " pool " << i.first + << " ruleno " << ruleno + << " weight-map " << pmap + << dendl; + for (auto p : pmap) { + auto adjusted_weight = tmp.get_weightf(p.first) * p.second; + if (adjusted_weight == 0) { + continue; } + osd_weight[p.first] += adjusted_weight; + osd_weight_total += adjusted_weight; } - for (auto& i : osd_weight) { - int pgs = 0; - auto p = pgs_by_osd.find(i.first); - if (p != pgs_by_osd.end()) + } + for (auto& i : osd_weight) { + int pgs = 0; + auto p = pgs_by_osd.find(i.first); + if (p != pgs_by_osd.end()) pgs = p->second.size(); - else + else pgs_by_osd.emplace(i.first, set()); - ldout(cct, 20) << " osd." << i.first << " weight " << i.second + ldout(cct, 20) << " osd." 
<< i.first << " weight " << i.second << " pgs " << pgs << dendl; - } + } + if (osd_weight_total == 0) { + lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl; + return 0; + } + float pgs_per_weight = total_pgs / osd_weight_total; + ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl; + ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl; - if (osd_weight_total == 0) { - lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl; - break; - } - float pgs_per_weight = total_pgs / osd_weight_total; - ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl; - ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl; - - // osd deviation - float total_deviation = 0; - map osd_deviation; // osd, deviation(pgs) - multimap deviation_osd; // deviation(pgs), osd + if (max <= 0) { + lderr(cct) << __func__ << " abort due to max <= 0" << dendl; + return 0; + } + float decay_factor = 1.0 / float(max); + float stddev = 0; + map osd_deviation; // osd, deviation(pgs) + multimap deviation_osd; // deviation(pgs), osd + for (auto& i : pgs_by_osd) { + // make sure osd is still there (belongs to this crush-tree) + ceph_assert(osd_weight.count(i.first)); + float target = osd_weight[i.first] * pgs_per_weight; + float deviation = (float)i.second.size() - target; + ldout(cct, 20) << " osd." 
<< i.first + << "\tpgs " << i.second.size() + << "\ttarget " << target + << "\tdeviation " << deviation + << dendl; + osd_deviation[i.first] = deviation; + deviation_osd.insert(make_pair(deviation, i.first)); + stddev += deviation * deviation; + } + if (stddev <= cct->_conf->get_val("osd_calc_pg_upmaps_max_stddev")) { + ldout(cct, 10) << __func__ << " distribution is almost perfect" + << dendl; + return 0; + } + bool skip_overfull = false; + auto aggressive = + cct->_conf->get_val("osd_calc_pg_upmaps_aggressively"); + auto local_fallback_retries = + cct->_conf->get_val("osd_calc_pg_upmaps_local_fallback_retries"); + while (max--) { + // build overfull and underfull set overfull; - for (auto& i : pgs_by_osd) { - // make sure osd is still there (belongs to this crush-tree) - assert(osd_weight.count(i.first)); - float target = osd_weight[i.first] * pgs_per_weight; - float deviation = (float)i.second.size() - target; - ldout(cct, 20) << " osd." << i.first - << "\tpgs " << i.second.size() - << "\ttarget " << target - << "\tdeviation " << deviation - << dendl; - osd_deviation[i.first] = deviation; - deviation_osd.insert(make_pair(deviation, i.first)); - if (deviation >= 1.0) - overfull.insert(i.first); - total_deviation += abs(deviation); + vector underfull; + float decay = 0; + int decay_count = 0; + while (overfull.empty()) { + for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) { + if (i->first >= (1.0 - decay)) + overfull.insert(i->second); + } + if (!overfull.empty()) + break; + decay_count++; + decay = decay_factor * decay_count; + if (decay >= 1.0) + break; + ldout(cct, 30) << " decay_factor = " << decay_factor + << " decay_count = " << decay_count + << " decay (overfull) = " << decay + << dendl; } - if (num_changed == 0) { - start_deviation = total_deviation; + if (overfull.empty()) { + lderr(cct) << __func__ << " failed to build overfull" << dendl; + break; } - end_deviation = total_deviation; - // build underfull, sorted from least-full to 
most-average - vector underfull; - for (auto i = deviation_osd.begin(); - i != deviation_osd.end(); - ++i) { - if (i->first >= -.999) - break; - underfull.push_back(i->second); + decay = 0; + decay_count = 0; + while (underfull.empty()) { + for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) { + if (i->first >= (-.999 + decay)) + break; + underfull.push_back(i->second); + } + if (!underfull.empty()) + break; + decay_count++; + decay = decay_factor * decay_count; + if (decay >= .999) + break; + ldout(cct, 30) << " decay_factor = " << decay_factor + << " decay_count = " << decay_count + << " decay (underfull) = " << decay + << dendl; } - ldout(cct, 10) << " total_deviation " << total_deviation - << " overfull " << overfull - << " underfull " << underfull << dendl; - if (overfull.empty() || underfull.empty()) + if (underfull.empty()) { + lderr(cct) << __func__ << " failed to build underfull" << dendl; break; + } - // pick fullest - bool restart = false; + ldout(cct, 10) << " overfull " << overfull + << " underfull " << underfull + << dendl; + set to_skip; + uint64_t local_fallback_retried = 0; + + retry: + + set to_unmap; + map>> to_upmap; + auto temp_pgs_by_osd = pgs_by_osd; + // always start with fullest, break if we find any changes to make for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) { + if (skip_overfull) { + ldout(cct, 10) << " skipping overfull " << dendl; + break; // fall through to check underfull + } int osd = p->second; float deviation = p->first; float target = osd_weight[osd] * pgs_per_weight; - assert(target > 0); - if (deviation/target < max_deviation_ratio) { + ceph_assert(target > 0); + float deviation_ratio = deviation / target; + if (deviation_ratio < max_deviation_ratio) { ldout(cct, 10) << " osd." 
<< osd - << " target " << target - << " deviation " << deviation - << " -> ratio " << deviation/target - << " < max ratio " << max_deviation_ratio << dendl; + << " target " << target + << " deviation " << deviation + << " -> ratio " << deviation_ratio + << " < max ratio " << max_deviation_ratio + << dendl; break; } - int num_to_move = deviation; - ldout(cct, 10) << " osd." << osd << " move " << num_to_move << dendl; - if (num_to_move < 1) - break; - - set& pgs = pgs_by_osd[osd]; + vector pgs; + pgs.reserve(pgs_by_osd[osd].size()); + for (auto& pg : pgs_by_osd[osd]) { + if (to_skip.count(pg)) + continue; + pgs.push_back(pg); + } + if (aggressive) { + // shuffle PG list so they all get equal (in)attention + std::random_device rd; + std::default_random_engine rng{rd()}; + std::shuffle(pgs.begin(), pgs.end(), rng); + } // look for remaps we can un-remap for (auto pg : pgs) { auto p = tmp.pg_upmap_items.find(pg); - if (p != tmp.pg_upmap_items.end()) { - for (auto q : p->second) { - if (q.second == osd) { - ldout(cct, 10) << " dropping pg_upmap_items " << pg - << " " << p->second << dendl; - tmp.pg_upmap_items.erase(p); - pending_inc->old_pg_upmap_items.insert(pg); - ++num_changed; - restart = true; - } - } - } - if (restart) - break; - } // pg loop - if (restart) - break; + if (p == tmp.pg_upmap_items.end()) + continue; + mempool::osdmap::vector> new_upmap_items; + for (auto q : p->second) { + if (q.second == osd) { + ldout(cct, 10) << " will try dropping existing" + << " remapping pair " + << q.first << " -> " << q.second + << " which remapped " << pg + << " into overfull osd." << osd + << dendl; + temp_pgs_by_osd[q.second].erase(pg); + temp_pgs_by_osd[q.first].insert(pg); + } else { + new_upmap_items.push_back(q); + } + } + if (new_upmap_items.empty()) { + // drop whole item + ldout(cct, 10) << " existing pg_upmap_items " << p->second + << " remapped " << pg << " into overfull osd." 
<< osd + << ", will try cancelling it entirely" + << dendl; + to_unmap.insert(pg); + goto test_change; + } else if (new_upmap_items.size() != p->second.size()) { + // drop single remapping pair, updating + ceph_assert(new_upmap_items.size() < p->second.size()); + ldout(cct, 10) << " existing pg_upmap_items " << p->second + << " remapped " << pg << " into overfull osd." << osd + << ", new_pg_upmap_items now " << new_upmap_items + << dendl; + to_upmap[pg] = new_upmap_items; + goto test_change; + } + } + // try upmap for (auto pg : pgs) { - if (tmp.pg_upmap.count(pg) || - tmp.pg_upmap_items.count(pg)) { - ldout(cct, 20) << " already remapped " << pg << dendl; + auto temp_it = tmp.pg_upmap.find(pg); + if (temp_it != tmp.pg_upmap.end()) { + // leave pg_upmap alone + // it must be specified by admin since balancer does not + // support pg_upmap yet + ldout(cct, 10) << " " << pg << " already has pg_upmap " + << temp_it->second << ", skipping" + << dendl; continue; } - ldout(cct, 10) << " trying " << pg << dendl; + auto pg_pool_size = tmp.get_pg_pool_size(pg); + mempool::osdmap::vector> new_upmap_items; + set existing; + auto it = tmp.pg_upmap_items.find(pg); + if (it != tmp.pg_upmap_items.end() && + it->second.size() >= (size_t)pg_pool_size) { + ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items " + << it->second << ", skipping" + << dendl; + continue; + } else if (it != tmp.pg_upmap_items.end()) { + ldout(cct, 10) << " " << pg << " already has pg_upmap_items " + << it->second + << dendl; + new_upmap_items = it->second; + // build existing too (for dedup) + for (auto i : it->second) { + existing.insert(i.first); + existing.insert(i.second); + } + // fall through + // to see if we can append more remapping pairs + } + ldout(cct, 10) << " trying " << pg << dendl; vector orig, out; + tmp.pg_to_raw_upmap(pg, &orig); // including existing upmaps too if (!try_pg_upmap(cct, pg, overfull, underfull, &orig, &out)) { continue; } - ldout(cct, 10) << " " << pg << " 
" << orig << " -> " << out << dendl; + ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl; if (orig.size() != out.size()) { continue; } - assert(orig != out); - auto& rmi = tmp.pg_upmap_items[pg]; + ceph_assert(orig != out); for (unsigned i = 0; i < out.size(); ++i) { - if (orig[i] != out[i]) { - rmi.push_back(make_pair(orig[i], out[i])); - } + if (orig[i] == out[i]) + continue; // skip invalid remappings + if (existing.count(orig[i]) || existing.count(out[i])) + continue; // we want new remappings only! + ldout(cct, 10) << " will try adding new remapping pair " + << orig[i] << " -> " << out[i] << " for " << pg + << dendl; + existing.insert(orig[i]); + existing.insert(out[i]); + temp_pgs_by_osd[orig[i]].erase(pg); + temp_pgs_by_osd[out[i]].insert(pg); + ceph_assert(new_upmap_items.size() < (size_t)pg_pool_size); + new_upmap_items.push_back(make_pair(orig[i], out[i])); + // append new remapping pairs slowly + // This way we can make sure that each tiny change will + // definitely make distribution of PGs converging to + // the perfect status. 
+ to_upmap[pg] = new_upmap_items; + goto test_change; } - pending_inc->new_pg_upmap_items[pg] = rmi; - ldout(cct, 10) << " " << pg << " pg_upmap_items " << rmi << dendl; - restart = true; - ++num_changed; - break; - } // pg loop - if (restart) - break; - } // osd loop + } + } - if (!restart) { - ldout(cct, 10) << " failed to find any changes to make" << dendl; - break; + ceph_assert(!(to_unmap.size() || to_upmap.size())); + ldout(cct, 10) << " failed to find any changes for overfull osds" + << dendl; + for (auto& p : deviation_osd) { + if (std::find(underfull.begin(), underfull.end(), p.second) == + underfull.end()) + break; + int osd = p.second; + float deviation = p.first; + float target = osd_weight[osd] * pgs_per_weight; + ceph_assert(target > 0); + float deviation_ratio = abs(deviation / target); + if (deviation_ratio < max_deviation_ratio) { + // respect max_deviation_ratio too + ldout(cct, 10) << " osd." << osd + << " target " << target + << " deviation " << deviation + << " -> absolute ratio " << deviation_ratio + << " < max ratio " << max_deviation_ratio + << dendl; + break; + } + // look for remaps we can un-remap + vector>>> candidates; + candidates.reserve(tmp.pg_upmap_items.size()); + for (auto& i : tmp.pg_upmap_items) { + if (to_skip.count(i.first)) + continue; + if (!only_pools.empty() && !only_pools.count(i.first.pool())) + continue; + candidates.push_back(make_pair(i.first, i.second)); + } + if (aggressive) { + // shuffle candidates so they all get equal (in)attention + std::random_device rd; + std::default_random_engine rng{rd()}; + std::shuffle(candidates.begin(), candidates.end(), rng); + } + for (auto& i : candidates) { + auto pg = i.first; + mempool::osdmap::vector> new_upmap_items; + for (auto& j : i.second) { + if (j.first == osd) { + ldout(cct, 10) << " will try dropping existing" + << " remapping pair " + << j.first << " -> " << j.second + << " which remapped " << pg + << " out from underfull osd." 
<< osd + << dendl; + temp_pgs_by_osd[j.second].erase(pg); + temp_pgs_by_osd[j.first].insert(pg); + } else { + new_upmap_items.push_back(j); + } + } + if (new_upmap_items.empty()) { + // drop whole item + ldout(cct, 10) << " existing pg_upmap_items " << i.second + << " remapped " << pg + << " out from underfull osd." << osd + << ", will try cancelling it entirely" + << dendl; + to_unmap.insert(pg); + goto test_change; + } else if (new_upmap_items.size() != i.second.size()) { + // drop single remapping pair, updating + ceph_assert(new_upmap_items.size() < i.second.size()); + ldout(cct, 10) << " existing pg_upmap_items " << i.second + << " remapped " << pg + << " out from underfull osd." << osd + << ", new_pg_upmap_items now " << new_upmap_items + << dendl; + to_upmap[pg] = new_upmap_items; + goto test_change; + } + } } - if (--max == 0) { - ldout(cct, 10) << " hit max iterations, stopping" << dendl; + + ceph_assert(!(to_unmap.size() || to_upmap.size())); + ldout(cct, 10) << " failed to find any changes for underfull osds" + << dendl; + if (!aggressive) { + ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl; + break; + } else if (!skip_overfull) { + // safe to quit because below here we know + // we've done checking both overfull and underfull osds.. + ldout(cct, 10) << " break due to not being able to find any" + << " further optimizations" + << dendl; break; } + // restart with fullest and do exhaustive searching + skip_overfull = false; + continue; + + test_change: + + // test change, apply if change is good + ceph_assert(to_unmap.size() || to_upmap.size()); + float new_stddev = 0; + map temp_osd_deviation; + multimap temp_deviation_osd; + for (auto& i : temp_pgs_by_osd) { + // make sure osd is still there (belongs to this crush-tree) + ceph_assert(osd_weight.count(i.first)); + float target = osd_weight[i.first] * pgs_per_weight; + float deviation = (float)i.second.size() - target; + ldout(cct, 20) << " osd." 
<< i.first + << "\tpgs " << i.second.size() + << "\ttarget " << target + << "\tdeviation " << deviation + << dendl; + temp_osd_deviation[i.first] = deviation; + temp_deviation_osd.insert(make_pair(deviation, i.first)); + new_stddev += deviation * deviation; + } + ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl; + if (new_stddev >= stddev) { + if (!aggressive) { + ldout(cct, 10) << " break because stddev is not decreasing" + << " and aggressive mode is not enabled" + << dendl; + break; + } + local_fallback_retried++; + if (local_fallback_retried >= local_fallback_retries) { + // does not make progress + // flip *skip_overfull* so both overfull and underfull + // get equal (in)attention + skip_overfull = !skip_overfull; + ldout(cct, 10) << " hit local_fallback_retries " + << local_fallback_retries + << dendl; + continue; + } + for (auto& i : to_unmap) + to_skip.insert(i); + for (auto& i : to_upmap) + to_skip.insert(i.first); + ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried + << " to_skip " << to_skip + << dendl; + goto retry; + } + + // ready to go + ceph_assert(new_stddev < stddev); + stddev = new_stddev; + pgs_by_osd = temp_pgs_by_osd; + osd_deviation = temp_osd_deviation; + deviation_osd = temp_deviation_osd; + for (auto& i : to_unmap) { + ldout(cct, 10) << " unmap pg " << i << dendl; + ceph_assert(tmp.pg_upmap_items.count(i)); + tmp.pg_upmap_items.erase(i); + pending_inc->old_pg_upmap_items.insert(i); + ++num_changed; + } + for (auto& i : to_upmap) { + ldout(cct, 10) << " upmap pg " << i.first + << " new pg_upmap_items " << i.second + << dendl; + tmp.pg_upmap_items[i.first] = i.second; + pending_inc->new_pg_upmap_items[i.first] = i.second; + ++num_changed; + } } - ldout(cct, 10) << " start deviation " << start_deviation << dendl; - ldout(cct, 10) << " end deviation " << end_deviation << dendl; + ldout(cct, 10) << " num_changed = " << num_changed << dendl; return num_changed; } diff -Nru ceph-12.2.11/src/osd/OSDMap.h 
ceph-12.2.12/src/osd/OSDMap.h --- ceph-12.2.11/src/osd/OSDMap.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/osd/OSDMap.h 2019-04-11 12:33:50.000000000 +0000 @@ -1134,6 +1134,7 @@ * raw and primary must be non-NULL */ void pg_to_raw_osds(pg_t pg, vector *raw, int *primary) const; + void pg_to_raw_upmap(pg_t pg, vector *raw_upmap) const; /// map a pg to its acting set. @return acting set size void pg_to_acting_osds(const pg_t& pg, vector *acting, int *acting_primary) const { diff -Nru ceph-12.2.11/src/osd/osd_types.h ceph-12.2.12/src/osd/osd_types.h --- ceph-12.2.11/src/osd/osd_types.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/osd/osd_types.h 2019-04-11 12:33:50.000000000 +0000 @@ -82,9 +82,12 @@ #define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220 /// max manually/automatically set recovery priority for MBackfillReserve -#define OSD_RECOVERY_PRIORITY_MAX 254 +#define OSD_RECOVERY_PRIORITY_MAX 253 -/// max recovery priority for MBackfillReserve, only when forced manually +/// backfill priority for MBackfillReserve, when forced manually +#define OSD_BACKFILL_PRIORITY_FORCED 254 + +/// recovery priority for MRecoveryReserve, when forced manually #define OSD_RECOVERY_PRIORITY_FORCED 255 diff -Nru ceph-12.2.11/src/osd/PG.cc ceph-12.2.12/src/osd/PG.cc --- ceph-12.2.11/src/osd/PG.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/osd/PG.cc 2019-04-11 12:33:50.000000000 +0000 @@ -682,12 +682,10 @@ << ")" << dendl; continue; } - if (oinfo.last_complete < need) { - if (omissing.is_missing(soid)) { - ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need - << " also missing on osd." << fromosd << dendl; - continue; - } + if (omissing.is_missing(soid)) { + ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need + << " also missing on osd." 
<< fromosd << dendl; + continue; } ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need @@ -2134,17 +2132,64 @@ kick_snap_trim(); } -void PG::_change_recovery_force_mode(int new_mode, bool clear) +bool PG::set_force_recovery(bool b) { + bool did = false; + lock(); if (!deleting) { - // we can't and shouldn't do anything if the PG is being deleted locally - if (clear) { - state_clear(new_mode); - } else { - state_set(new_mode); + if (b) { + if (!(state & PG_STATE_FORCED_RECOVERY) && + (state & (PG_STATE_DEGRADED | + PG_STATE_RECOVERY_WAIT | + PG_STATE_RECOVERING))) { + dout(20) << __func__ << " set" << dendl; + state_set(PG_STATE_FORCED_RECOVERY); + publish_stats_to_osd(); + did = true; + } + } else if (state & PG_STATE_FORCED_RECOVERY) { + dout(20) << __func__ << " clear" << dendl; + state_clear(PG_STATE_FORCED_RECOVERY); + publish_stats_to_osd(); + did = true; } - publish_stats_to_osd(); } + unlock(); + if (did) { + dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl; + osd->local_reserver.update_priority(info.pgid, get_recovery_priority()); + } + return did; +} + +bool PG::set_force_backfill(bool b) +{ + bool did = false; + lock(); + if (!deleting) { + if (b) { + if (!(state & PG_STATE_FORCED_BACKFILL) && + (state & (PG_STATE_DEGRADED | + PG_STATE_BACKFILL_WAIT | + PG_STATE_BACKFILLING))) { + dout(10) << __func__ << " set" << dendl; + state_set(PG_STATE_FORCED_BACKFILL); + publish_stats_to_osd(); + did = true; + } + } else if (state & PG_STATE_FORCED_BACKFILL) { + dout(10) << __func__ << " clear" << dendl; + state_clear(PG_STATE_FORCED_BACKFILL); + publish_stats_to_osd(); + did = true; + } + } + unlock(); + if (did) { + dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl; + osd->local_reserver.update_priority(info.pgid, get_backfill_priority()); + } + return did; } inline int PG::clamp_recovery_priority(int priority) @@ -2182,7 +2227,7 @@ // a higher value -> a higher priority int ret = 
OSD_BACKFILL_PRIORITY_BASE; if (state & PG_STATE_FORCED_BACKFILL) { - ret = OSD_RECOVERY_PRIORITY_FORCED; + ret = OSD_BACKFILL_PRIORITY_FORCED; } else { if (acting.size() < pool.info.min_size) { // inactive: no. of replicas < min_size, highest priority since it blocks IO @@ -2774,14 +2819,14 @@ for (auto& ml: sml.second) { int missing_shards; if (sml.first == shard_id_t::NO_SHARD) { - dout(0) << __func__ << " ml " << ml.second << " upset size " << upset.size() << " up " << ml.first.up << dendl; + dout(20) << __func__ << " ml " << ml.second << " upset size " << upset.size() << " up " << ml.first.up << dendl; missing_shards = (int)upset.size() - ml.first.up; } else { // Handle shards not even in upset below if (!find_shard(upset, sml.first)) continue; missing_shards = std::max(0, 1 - ml.first.up); - dout(0) << __func__ << " shard " << sml.first << " ml " << ml.second << " missing shards " << missing_shards << dendl; + dout(20) << __func__ << " shard " << sml.first << " ml " << ml.second << " missing shards " << missing_shards << dendl; } int odegraded = ml.second * missing_shards; // Copies on other osds but limited to the possible degraded diff -Nru ceph-12.2.11/src/osd/PG.h ceph-12.2.12/src/osd/PG.h --- ceph-12.2.11/src/osd/PG.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/osd/PG.h 2019-04-11 12:33:50.000000000 +0000 @@ -122,6 +122,11 @@ void dump(Formatter* f) const; + string get_current_state() { + if (pi == nullptr) return "unknown"; + return std::get<1>(pi->embedded_states.top()); + } + private: bool pg_in_destructor = false; PG* thispg = nullptr; @@ -252,6 +257,10 @@ */ class PG : public DoutPrefixProvider { +public: + bool set_force_recovery(bool b); + bool set_force_backfill(bool b); + protected: OSDService *osd; CephContext *cct; @@ -1059,6 +1068,7 @@ void _update_calc_stats(); void _update_blocked_by(); + friend class TestOpsSocketHook; void publish_stats_to_osd(); void clear_publish_stats(); @@ -1093,7 +1103,6 @@ unsigned 
get_backfill_priority(); void mark_clean(); ///< mark an active pg clean - void _change_recovery_force_mode(int new_mode, bool clear); /// return [start,end) bounds for required past_intervals static pair get_required_past_interval_bounds( @@ -2481,12 +2490,12 @@ PG(OSDService *o, OSDMapRef curmap, const PGPool &pool, spg_t p); ~PG() override; + const spg_t pg_id; private: // Prevent copying explicit PG(const PG& rhs); PG& operator=(const PG& rhs); - const spg_t pg_id; uint64_t peer_features; uint64_t acting_features; uint64_t upacting_features; @@ -2496,6 +2505,16 @@ public: const spg_t& get_pgid() const { return pg_id; } + void set_last_scrub_stamp(utime_t t) { + info.stats.last_scrub_stamp = t; + info.history.last_scrub_stamp = t; + } + + void set_last_deep_scrub_stamp(utime_t t) { + info.stats.last_deep_scrub_stamp = t; + info.history.last_deep_scrub_stamp = t; + } + void reset_min_peer_features() { peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT; } diff -Nru ceph-12.2.11/src/osd/PrimaryLogPG.cc ceph-12.2.12/src/osd/PrimaryLogPG.cc --- ceph-12.2.11/src/osd/PrimaryLogPG.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/osd/PrimaryLogPG.cc 2019-04-11 12:33:50.000000000 +0000 @@ -4966,7 +4966,12 @@ } if (r >= 0) op.extent.length = r; - else { + else if (r == -EAGAIN) { + // EAGAIN should not change the length of extent or count the read op. + dout(10) << " read got " << r << " / " << op.extent.length + << " bytes from obj " << soid << ". try again." 
<< dendl; + return -EAGAIN; + } else { result = r; op.extent.length = 0; } @@ -9901,6 +9906,10 @@ dout(10) << "handle_watch_timeout not active, no-op" << dendl; return; } + if (!obc->obs.exists) { + dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl; + return; + } if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) { callbacks_for_degraded_object[obc->obs.oi.soid].push_back( watch->get_delayed_cb() diff -Nru ceph-12.2.11/src/osdc/Objecter.cc ceph-12.2.12/src/osdc/Objecter.cc --- ceph-12.2.11/src/osdc/Objecter.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/osdc/Objecter.cc 2019-04-11 12:33:50.000000000 +0000 @@ -3294,11 +3294,11 @@ if (i->op.op & CEPH_OSD_OP_MODE_WR) { op_budget += i->indata.length(); } else if (ceph_osd_op_mode_read(i->op.op)) { - if (ceph_osd_op_type_data(i->op.op)) { - if ((int64_t)i->op.extent.length > 0) - op_budget += (int64_t)i->op.extent.length; + if (ceph_osd_op_uses_extent(i->op.op)) { + if ((int64_t)i->op.extent.length > 0) + op_budget += (int64_t)i->op.extent.length; } else if (ceph_osd_op_type_attr(i->op.op)) { - op_budget += i->op.xattr.name_len + i->op.xattr.value_len; + op_budget += i->op.xattr.name_len + i->op.xattr.value_len; } } } @@ -4427,7 +4427,10 @@ if (session) { ldout(cct, 1) << "ms_handle_reset " << con << " session " << session << " osd." 
<< session->osd << dendl; - if (!initialized) { + // the session maybe had been closed if new osdmap just handled + // says the osd down + if (!(initialized && osdmap->is_up(session->osd))) { + ldout(cct, 1) << "ms_handle_reset aborted,initialized=" << initialized << dendl; wl.unlock(); return false; } diff -Nru ceph-12.2.11/src/pybind/mgr/balancer/module.py ceph-12.2.12/src/pybind/mgr/balancer/module.py --- ceph-12.2.11/src/pybind/mgr/balancer/module.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/pybind/mgr/balancer/module.py 2019-04-11 12:33:50.000000000 +0000 @@ -266,7 +266,7 @@ { "cmd": "balancer execute name=plan,type=CephString", "desc": "Execute an optimization plan", - "perm": "r", + "perm": "rw", }, ] active = False @@ -377,25 +377,46 @@ self.run = False self.event.set() - def time_in_interval(self, tod, begin, end): - if begin <= end: - return tod >= begin and tod < end + def time_permit(self): + local_time = time.localtime() + time_of_day = time.strftime('%H%M', local_time) + weekday = (local_time.tm_wday + 1) % 7 # be compatible with C + permit = False + + begin_time = self.get_config('begin_time') or '0000' + end_time = self.get_config('end_time') or '2400' + if begin_time <= end_time: + permit = begin_time <= time_of_day < end_time + else: + permit = time_of_day >= begin_time or time_of_day < end_time + if not permit: + self.log.debug("should run between %s - %s, now %s, skipping", + begin_time, end_time, time_of_day) + return False + + begin_weekday = int(self.get_config('begin_weekday', 0)) + end_weekday = int(self.get_config('end_weekday', 7)) + if begin_weekday <= end_weekday: + permit = begin_weekday <= weekday < end_weekday else: - return tod >= begin or tod < end + permit = weekday >= begin_weekday or weekday < end_weekday + if not permit: + self.log.debug("should run between weekday %d - %d, now %d, skipping", + begin_weekday, end_weekday, weekday) + return False + + return True def serve(self): self.log.info('Starting') while 
self.run: self.active = self.get_config('active', '') is not '' - begin_time = self.get_config('begin_time') or '0000' - end_time = self.get_config('end_time') or '2400' - timeofday = time.strftime('%H%M', time.localtime()) - self.log.debug('Waking up [%s, scheduled for %s-%s, now %s]', - "active" if self.active else "inactive", - begin_time, end_time, timeofday) sleep_interval = float(self.get_config('sleep_interval', default_sleep_interval)) - if self.active and self.time_in_interval(timeofday, begin_time, end_time): + self.log.debug('Waking up [%s, now %s]', + "active" if self.active else "inactive", + time.strftime(TIME_FORMAT, time.localtime())) + if self.active and self.time_permit(): self.log.debug('Running') name = 'auto_%s' % time.strftime(TIME_FORMAT, time.gmtime()) plan = self.plan_create(name, self.get_osdmap(), []) diff -Nru ceph-12.2.11/src/pybind/mgr/dashboard/module.py ceph-12.2.12/src/pybind/mgr/dashboard/module.py --- ceph-12.2.11/src/pybind/mgr/dashboard/module.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/pybind/mgr/dashboard/module.py 2019-04-11 12:33:50.000000000 +0000 @@ -27,6 +27,7 @@ import cherrypy import jinja2 import urlparse +from distutils.version import StrictVersion from mgr_module import MgrModule, MgrStandbyModule, CommandResult @@ -46,6 +47,20 @@ # python module for the convenience of the GUI? LOG_BUFFER_SIZE = 30 +# When the CherryPy server in 3.2.2 (and later) starts it attempts to verify +# that the ports its listening on are in fact bound. When using the any address +# "::" it tries both ipv4 and ipv6, and in some environments (e.g. kubernetes) +# ipv6 isn't yet configured / supported and CherryPy throws an uncaught +# exception. +if cherrypy is not None: + v = StrictVersion(cherrypy.__version__) + # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on + # centos:7) and back to at least 3.0.0. 
+ if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"): + # https://github.com/cherrypy/cherrypy/issues/1100 + from cherrypy.process import servers + servers.wait_for_occupied_port = lambda host, port: None + # cherrypy likes to sys.exit on error. don't let it take us down too! def os_exit_noop(*args, **kwargs): pass diff -Nru ceph-12.2.11/src/pybind/mgr/prometheus/module.py ceph-12.2.12/src/pybind/mgr/prometheus/module.py --- ceph-12.2.11/src/pybind/mgr/prometheus/module.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/pybind/mgr/prometheus/module.py 2019-04-11 12:33:50.000000000 +0000 @@ -1,4 +1,5 @@ import cherrypy +from distutils.version import StrictVersion import json import errno import math @@ -15,6 +16,19 @@ DEFAULT_ADDR = '::' DEFAULT_PORT = 9283 +# When the CherryPy server in 3.2.2 (and later) starts it attempts to verify +# that the ports its listening on are in fact bound. When using the any address +# "::" it tries both ipv4 and ipv6, and in some environments (e.g. kubernetes) +# ipv6 isn't yet configured / supported and CherryPy throws an uncaught +# exception. +if cherrypy is not None: + v = StrictVersion(cherrypy.__version__) + # the issue was fixed in 3.2.3. it's present in 3.2.2 (current version on + # centos:7) and back to at least 3.0.0. + if StrictVersion("3.1.2") <= v < StrictVersion("3.2.3"): + # https://github.com/cherrypy/cherrypy/issues/1100 + from cherrypy.process import servers + servers.wait_for_occupied_port = lambda host, port: None # cherrypy likes to sys.exit on error. don't let it take us down too! 
def os_exit_noop(*args, **kwargs): @@ -91,8 +105,9 @@ MON_METADATA = ('ceph_daemon', 'hostname', 'public_addr', 'rank', 'ceph_version') -OSD_METADATA = ('ceph_daemon', 'cluster_addr', 'device_class', 'hostname', - 'public_addr', 'ceph_version') +OSD_METADATA = ('back_iface', 'ceph_daemon', 'cluster_addr', 'device_class', + 'front_iface', 'hostname', 'objectstore', 'public_addr', + 'ceph_version') OSD_STATUS = ['weight', 'up', 'in'] @@ -464,12 +479,25 @@ host_version = servers.get((str(id_), 'osd'), ('','')) + # collect disk occupation metadata + osd_metadata = self.get_metadata("osd", str(id_)) + if osd_metadata is None: + continue + + obj_store = osd_metadata.get('osd_objectstore', '') + f_iface = osd_metadata.get('front_iface', '') + b_iface = osd_metadata.get('back_iface', '') + self.metrics['osd_metadata'].set(1, ( + b_iface, 'osd.{}'.format(id_), c_addr, dev_class, + f_iface, host_version[0], - p_addr, host_version[1] + obj_store, + p_addr, + host_version[1] )) # collect osd status @@ -479,19 +507,13 @@ 'osd.{}'.format(id_), )) - # collect disk occupation metadata - osd_metadata = self.get_metadata("osd", str(id_)) - if osd_metadata is None: - continue - - osd_objectstore = osd_metadata.get('osd_objectstore', None) - if osd_objectstore == "filestore": + if obj_store == "filestore": # collect filestore backend device osd_dev_node = osd_metadata.get('backend_filestore_dev_node', None) # collect filestore journal device osd_wal_dev_node = osd_metadata.get('osd_journal', '') osd_db_dev_node = '' - elif osd_objectstore == "bluestore": + elif obj_store == "bluestore": # collect bluestore backend device osd_dev_node = osd_metadata.get('bluestore_bdev_dev_node', None) # collect bluestore wal backend diff -Nru ceph-12.2.11/src/pybind/mgr/restful/api/crush.py ceph-12.2.12/src/pybind/mgr/restful/api/crush.py --- ceph-12.2.11/src/pybind/mgr/restful/api/crush.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/pybind/mgr/restful/api/crush.py 2019-04-11 
12:33:50.000000000 +0000 @@ -14,11 +14,11 @@ """ Show crush rules """ - rules = module.instance.get('osd_map_crush')['rules'] - nodes = module.instance.get('osd_map_tree')['nodes'] + crush = module.instance.get('osd_map_crush') + rules = crush['rules'] for rule in rules: - rule['osd_count'] = len(common.crush_rule_osds(nodes, rule)) + rule['osd_count'] = len(common.crush_rule_osds(crush['buckets'], rule)) return rules diff -Nru ceph-12.2.11/src/pybind/mgr/restful/common.py ceph-12.2.12/src/pybind/mgr/restful/common.py --- ceph-12.2.11/src/pybind/mgr/restful/common.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/pybind/mgr/restful/common.py 2019-04-11 12:33:50.000000000 +0000 @@ -88,31 +88,34 @@ return commands - -def crush_rule_osds(nodes, rule): - nodes_by_id = dict((n['id'], n) for n in nodes) +def crush_rule_osds(node_buckets, rule): + nodes_by_id = dict((b['id'], b) for b in node_buckets) def _gather_leaf_ids(node): if node['id'] >= 0: return set([node['id']]) result = set() - for child_id in node['children']: - if child_id >= 0: - result.add(child_id) + for item in node['items']: + if item['id'] >= 0: + result.add(item['id']) else: - result |= _gather_leaf_ids(nodes_by_id[child_id]) + result |= _gather_leaf_ids(nodes_by_id[item['id']]) return result def _gather_descendent_ids(node, typ): result = set() - for child_id in node['children']: - child_node = nodes_by_id[child_id] - if child_node['type'] == typ: - result.add(child_node['id']) - elif 'children' in child_node: - result |= _gather_descendent_ids(child_node, typ) + for item in node['items']: + if item['id'] >= 0: + if typ == "osd": + result.add(item['id']) + else: + child_node = nodes_by_id[item['id']] + if child_node['type_name'] == typ: + result.add(child_node['id']) + elif 'items' in child_node: + result |= _gather_descendent_ids(child_node, typ) return result @@ -124,17 +127,26 @@ step = steps[0] if step['op'] == 'choose_firstn': # Choose all descendents of the current node of type 'type' 
- d = _gather_descendent_ids(root, step['type']) - for desc_node in [nodes_by_id[i] for i in d]: - osds |= _gather_osds(desc_node, steps[1:]) + descendent_ids = _gather_descendent_ids(root, step['type']) + for node_id in descendent_ids: + if node_id >= 0: + osds.add(node_id) + else: + for desc_node in nodes_by_id[node_id]: + osds |= _gather_osds(desc_node, steps[1:]) elif step['op'] == 'chooseleaf_firstn': # Choose all descendents of the current node of type 'type', # and select all leaves beneath those - for desc_node in [nodes_by_id[i] for i in _gather_descendent_ids(root, step['type'])]: - # Short circuit another iteration to find the emit - # and assume anything we've done a chooseleaf on - # is going to be part of the selected set of osds - osds |= _gather_leaf_ids(desc_node) + descendent_ids = _gather_descendent_ids(root, step['type']) + for node_id in descendent_ids: + if node_id >= 0: + osds.add(node_id) + else: + for desc_node in nodes_by_id[node_id]['items']: + # Short circuit another iteration to find the emit + # and assume anything we've done a chooseleaf on + # is going to be part of the selected set of osds + osds |= _gather_leaf_ids(desc_node) elif step['op'] == 'emit': if root['id'] >= 0: osds |= root['id'] diff -Nru ceph-12.2.11/src/pybind/mgr/restful/module.py ceph-12.2.12/src/pybind/mgr/restful/module.py --- ceph-12.2.11/src/pybind/mgr/restful/module.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/pybind/mgr/restful/module.py 2019-04-11 12:33:50.000000000 +0000 @@ -214,7 +214,7 @@ { "cmd": "restful list-keys", "desc": "List all API keys", - "perm": "rw" + "perm": "r" }, { "cmd": "restful create-self-signed-cert", @@ -503,14 +503,15 @@ def get_osd_pools(self): osds = dict(map(lambda x: (x['osd'], []), self.get('osd_map')['osds'])) pools = dict(map(lambda x: (x['pool'], x), self.get('osd_map')['pools'])) - crush_rules = self.get('osd_map_crush')['rules'] + crush = self.get('osd_map_crush') + crush_rules = crush['rules'] osds_by_pool = 
{} for pool_id, pool in pools.items(): pool_osds = None for rule in [r for r in crush_rules if r['rule_id'] == pool['crush_rule']]: if rule['min_size'] <= pool['size'] <= rule['max_size']: - pool_osds = common.crush_rule_osds(self.get('osd_map_tree')['nodes'], rule) + pool_osds = common.crush_rule_osds(crush['buckets'], rule) osds_by_pool[pool_id] = pool_osds diff -Nru ceph-12.2.11/src/rbdmap ceph-12.2.12/src/rbdmap --- ceph-12.2.11/src/rbdmap 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rbdmap 2019-04-11 12:33:50.000000000 +0000 @@ -1,5 +1,52 @@ #!/bin/bash +create_cmd_params() { + local PARAMS="$1" + local CMDPARAMS="" + local STATE="START" + for (( i=0; i<${#PARAMS}; i++ )); do + CHAR="${PARAMS:$i:1}" + case $CHAR in + "#") + break + ;; + "'") + if [ "$STATE" == "INQUOTE" ];then + STATE="NORMAL" + else + STATE="INQUOTE" + fi + ;; + "=") + if [ "$STATE" == "INQUOTE" ]; then + CMDPARAMS="${CMDPARAMS}${CHAR}" + else + CMDPARAMS="${CMDPARAMS} " + fi + ;; + ",") + if [ "$STATE" == "INQUOTE" ]; then + CMDPARAMS="${CMDPARAMS}${CHAR}" + elif [ "$STATE" == "START" ]; then + STATE="NORMAL" + CMDPARAMS="${CMDPARAMS} --" + else + CMDPARAMS="${CMDPARAMS} --" + fi + ;; + *) + if [ "$STATE" == "START" ];then + STATE="NORMAL" + CMDPARAMS="${CMDPARAMS}--${CHAR}" + else + CMDPARAMS="${CMDPARAMS}${CHAR}" + fi + ;; + esac + done + echo -n "$CMDPARAMS" +} + do_map() { # Read /etc/rbdtab to create non-existant mapping RET=0 @@ -14,16 +61,12 @@ DEV=rbd/$DEV ;; esac + + CMDPARAMS="$(create_cmd_params "${PARAMS}")" logger -p "daemon.debug" -t rbdmap "Mapping '${DEV}'" newrbd="" MAP_RV="" - OIFS=$IFS - IFS=',' - CMDPARAMS="" - for PARAM in ${PARAMS[@]}; do - CMDPARAMS="$CMDPARAMS --$(echo $PARAM | tr '=' ' ')" - done - IFS=$OIFS + if [ -b /dev/rbd/$DEV ]; then MAP_RV="$(readlink -f /dev/rbd/$DEV)" else diff -Nru ceph-12.2.11/src/rgw/CMakeLists.txt ceph-12.2.12/src/rgw/CMakeLists.txt --- ceph-12.2.11/src/rgw/CMakeLists.txt 2019-01-30 15:51:26.000000000 +0000 +++ 
ceph-12.2.12/src/rgw/CMakeLists.txt 2019-04-11 12:33:50.000000000 +0000 @@ -177,9 +177,7 @@ add_library(radosgw_a STATIC ${radosgw_srcs} $) -if (WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL) - target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES}) -endif() +target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES}) add_executable(radosgw rgw_main.cc) target_link_libraries(radosgw radosgw_a librados @@ -195,10 +193,6 @@ cls_version cls_replica_log cls_user) install(TARGETS radosgw DESTINATION bin) -if (WITH_RADOSGW_BEAST_FRONTEND) - target_link_libraries(radosgw_a ${OPENSSL_LIBRARIES}) -endif() - set(radosgw_admin_srcs rgw_admin.cc rgw_orphan.cc) diff -Nru ceph-12.2.11/src/rgw/rgw_admin.cc ceph-12.2.12/src/rgw/rgw_admin.cc --- ceph-12.2.11/src/rgw/rgw_admin.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_admin.cc 2019-04-11 12:33:50.000000000 +0000 @@ -453,6 +453,7 @@ OPT_DATA_SYNC_RUN, OPT_DATALOG_LIST, OPT_DATALOG_STATUS, + OPT_DATALOG_AUTOTRIM, OPT_DATALOG_TRIM, OPT_OPSTATE_LIST, OPT_OPSTATE_SET, @@ -892,6 +893,8 @@ } else if (strcmp(prev_cmd, "datalog") == 0) { if (strcmp(cmd, "list") == 0) return OPT_DATALOG_LIST; + if (strcmp(cmd, "autotrim") == 0) + return OPT_DATALOG_AUTOTRIM; if (strcmp(cmd, "trim") == 0) return OPT_DATALOG_TRIM; if (strcmp(cmd, "status") == 0) @@ -5621,7 +5624,7 @@ rgw_cls_bi_entry entry; - ret = store->bi_get(bucket, obj, bi_index_type, &entry); + ret = store->bi_get(bucket_info, obj, bi_index_type, &entry); if (ret < 0) { cerr << "ERROR: bi_get(): " << cpp_strerror(-ret) << std::endl; return -ret; @@ -7239,6 +7242,24 @@ formatter->flush(cout); } + if (opt_cmd == OPT_DATALOG_AUTOTRIM) { + RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry()); + RGWHTTPManager http(store->ctx(), crs.get_completion_mgr()); + int ret = http.set_threaded(); + if (ret < 0) { + cerr << "failed to initialize http client with " << cpp_strerror(ret) << std::endl; + return -ret; + } + + auto num_shards = 
g_conf->rgw_data_log_num_shards; + std::vector markers(num_shards); + ret = crs.run(create_admin_data_log_trim_cr(store, &http, num_shards, markers)); + if (ret < 0) { + cerr << "automated datalog trim failed with " << cpp_strerror(ret) << std::endl; + return -ret; + } + } + if (opt_cmd == OPT_DATALOG_TRIM) { utime_t start_time, end_time; diff -Nru ceph-12.2.11/src/rgw/rgw_auth_s3.cc ceph-12.2.12/src/rgw/rgw_auth_s3.cc --- ceph-12.2.11/src/rgw/rgw_auth_s3.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_auth_s3.cc 2019-04-11 12:33:50.000000000 +0000 @@ -116,6 +116,7 @@ const char* const content_type, const char* const date, const std::map& meta_map, + const std::map& qs_map, const char* const request_uri, const std::map& sub_resources, std::string& dest_str) @@ -143,6 +144,7 @@ dest.append("\n"); dest.append(get_canon_amz_hdr(meta_map)); + dest.append(get_canon_amz_hdr(qs_map)); dest.append(get_canon_resource(request_uri, sub_resources)); dest_str = dest; @@ -152,6 +154,17 @@ return (isalnum(c) || isspace(c) || (c == '+') || (c == '/') || (c == '=')); } +static inline void get_v2_qs_map(const req_info& info, + std::map& qs_map) { + const auto& params = const_cast(info.args).get_params(); + for (const auto& elt : params) { + std::string k = boost::algorithm::to_lower_copy(elt.first); + if (k.find("x-amz-meta-") == /* offset */ 0) { + add_amz_meta_header(qs_map, k, elt.second); + } + } +} + /* * get the header authentication information required to * compute a request's signature @@ -175,7 +188,10 @@ const char *content_type = info.env->get("CONTENT_TYPE"); std::string date; + std::map qs_map; + if (qsr) { + get_v2_qs_map(info, qs_map); // handle qs metadata date = info.args.get("Expires"); } else { const char *str = info.env->get("HTTP_X_AMZ_DATE"); @@ -214,8 +230,8 @@ } rgw_create_s3_canonical_header(info.method, content_md5, content_type, - date.c_str(), meta_map, request_uri.c_str(), - sub_resources, dest); + date.c_str(), meta_map, qs_map, + 
request_uri.c_str(), sub_resources, dest); return true; } @@ -412,13 +428,13 @@ return 0; } -int parse_credentials(const req_info& info, /* in */ - boost::string_view& access_key_id, /* out */ - boost::string_view& credential_scope, /* out */ - boost::string_view& signedheaders, /* out */ - boost::string_view& signature, /* out */ - boost::string_view& date, /* out */ - bool& using_qs) /* out */ +int parse_v4_credentials(const req_info& info, /* in */ + boost::string_view& access_key_id, /* out */ + boost::string_view& credential_scope, /* out */ + boost::string_view& signedheaders, /* out */ + boost::string_view& signature, /* out */ + boost::string_view& date, /* out */ + bool& using_qs) /* out */ { const char* const http_auth = info.env->get("HTTP_AUTHORIZATION"); using_qs = http_auth == nullptr || http_auth[0] == '\0'; diff -Nru ceph-12.2.11/src/rgw/rgw_auth_s3.h ceph-12.2.12/src/rgw/rgw_auth_s3.h --- ceph-12.2.11/src/rgw/rgw_auth_s3.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_auth_s3.h 2019-04-11 12:33:50.000000000 +0000 @@ -79,8 +79,7 @@ } - if (cct->_conf->rgw_s3_auth_use_ldap && - ! 
cct->_conf->rgw_ldap_uri.empty()) { + if (ldap_engine.valid()) { add_engine(Control::SUFFICIENT, ldap_engine); } } @@ -350,6 +349,7 @@ const char *content_type, const char *date, const std::map& meta_map, + const std::map& qs_map, const char *request_uri, const std::map& sub_resources, std::string& dest_str); @@ -381,13 +381,13 @@ static constexpr char AWS4_STREAMING_PAYLOAD_HASH[] = \ "STREAMING-AWS4-HMAC-SHA256-PAYLOAD"; -int parse_credentials(const req_info& info, /* in */ - boost::string_view& access_key_id, /* out */ - boost::string_view& credential_scope, /* out */ - boost::string_view& signedheaders, /* out */ - boost::string_view& signature, /* out */ - boost::string_view& date, /* out */ - bool& using_qs); /* out */ +int parse_v4_credentials(const req_info& info, /* in */ + boost::string_view& access_key_id, /* out */ + boost::string_view& credential_scope, /* out */ + boost::string_view& signedheaders, /* out */ + boost::string_view& signature, /* out */ + boost::string_view& date, /* out */ + bool& using_qs); /* out */ static inline std::string get_v4_canonical_uri(const req_info& info) { /* The code should normalize according to RFC 3986 but S3 does NOT do path diff -Nru ceph-12.2.11/src/rgw/rgw_bucket.cc ceph-12.2.12/src/rgw/rgw_bucket.cc --- ceph-12.2.11/src/rgw/rgw_bucket.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_bucket.cc 2019-04-11 12:33:50.000000000 +0000 @@ -540,11 +540,13 @@ int max = 1000; list_op.params.list_versions = true; + list_op.params.allow_unordered = true; + bool is_truncated = false; do { objs.clear(); - ret = list_op.list_objects(max, &objs, &common_prefixes, NULL); + ret = list_op.list_objects(max, &objs, &common_prefixes, &is_truncated); if (ret < 0) return ret; @@ -556,11 +558,11 @@ for (const auto& obj : objs) { rgw_obj_key key(obj.key); ret = rgw_remove_object(store, info, bucket, key); - if (ret < 0) + if (ret < 0 && ret != -ENOENT) { return ret; + } } - - } while (!objs.empty()); + } 
while(is_truncated); string prefix, delimiter; @@ -576,9 +578,12 @@ RGWObjVersionTracker objv_tracker; - ret = store->delete_bucket(info, objv_tracker); + // if we deleted children above we will force delete, as any that + // remain is detrius from a prior bug + ret = store->delete_bucket(info, objv_tracker, !delete_children); if (ret < 0) { - lderr(store->ctx()) << "ERROR: could not remove bucket " << bucket.name << dendl; + lderr(store->ctx()) << "ERROR: could not remove bucket " << + bucket.name << dendl; return ret; } @@ -645,16 +650,20 @@ RGWRados::Bucket::List list_op(&target); list_op.params.list_versions = true; + list_op.params.allow_unordered = true; std::list handles; int max = 1000; int max_aio = concurrent_max; - ret = list_op.list_objects(max, &objs, &common_prefixes, NULL); - if (ret < 0) - return ret; + bool is_truncated = true; + + while (is_truncated) { + objs.clear(); + ret = list_op.list_objects(max, &objs, &common_prefixes, &is_truncated); + if (ret < 0) + return ret; - while (!objs.empty()) { std::vector::iterator it = objs.begin(); for (; it != objs.end(); ++it) { RGWObjState *astate = NULL; @@ -717,11 +726,6 @@ max_aio = concurrent_max; } } // for all RGW objects - objs.clear(); - - ret = list_op.list_objects(max, &objs, &common_prefixes, NULL); - if (ret < 0) - return ret; } ret = drain_handles(handles); @@ -737,7 +741,10 @@ RGWObjVersionTracker objv_tracker; - ret = store->delete_bucket(info, objv_tracker); + // this function can only be run if caller wanted children to be + // deleted, so we can ignore the check for children as any that + // remain are detritus from a prior bug + ret = store->delete_bucket(info, objv_tracker, false); if (ret < 0) { lderr(store->ctx()) << "ERROR: could not remove bucket " << bucket.name << dendl; return ret; @@ -1470,14 +1477,15 @@ formatter->open_array_section("users"); for (const auto& user_id : user_ids) { + formatter->open_object_section("user"); formatter->dump_string("user_id", user_id); - bool done; 
formatter->open_array_section("buckets"); + + string marker; + bool is_truncated{false}; do { RGWUserBuckets buckets; - string marker; - bool is_truncated; ret = rgw_read_user_buckets(store, user_id, buckets, marker, string(), max_entries, false, @@ -1552,9 +1560,8 @@ } } } - - done = (m_buckets.size() < max_entries); - } while (!done); /* foreach: bucket */ + formatter->flush(cout); + } while (is_truncated); /* foreach: bucket */ formatter->close_section(); formatter->close_section(); diff -Nru ceph-12.2.11/src/rgw/rgw_common.cc ceph-12.2.12/src/rgw/rgw_common.cc --- ceph-12.2.11/src/rgw/rgw_common.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_common.cc 2019-04-11 12:33:50.000000000 +0000 @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include "json_spirit/json_spirit.h" #include "common/ceph_json.h" @@ -404,7 +402,6 @@ STR_LEN_ENTRY("HTTP_X_ACCOUNT"), {NULL, 0} }; - void req_info::init_meta_info(bool *found_bad_meta) { x_meta_map.clear(); diff -Nru ceph-12.2.11/src/rgw/rgw_common.h ceph-12.2.12/src/rgw/rgw_common.h --- ceph-12.2.11/src/rgw/rgw_common.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_common.h 2019-04-11 12:33:50.000000000 +0000 @@ -18,6 +18,7 @@ #include +#include #include #include "common/ceph_crypto.h" @@ -645,11 +646,15 @@ type(TYPE_NONE) { } - RGWAccessKey* get_key0() { + RGWAccessKey* get_key(const string& access_key) { if (access_keys.empty()) return nullptr; + + auto k = access_keys.find(access_key); + if (k == access_keys.end()) + return nullptr; else - return &(access_keys.begin()->second); + return &(k->second); } void encode(bufferlist& bl) const { @@ -2207,6 +2212,25 @@ return ((bytes + 4095) & ~4095) / 1024; } +/* implement combining step, S3 header canonicalization; k is a + * valid header and in lc form */ +static inline void add_amz_meta_header( + std::map& x_meta_map, + const std::string& k, + const std::string& v) +{ + auto it = x_meta_map.find(k); + if (it != 
x_meta_map.end()) { + std::string old = it->second; + boost::algorithm::trim_right(old); + old.append(","); + old.append(v); + x_meta_map[k] = old; + } else { + x_meta_map[k] = v; + } +} /* add_amz_meta_header */ + extern string rgw_string_unquote(const string& s); extern void parse_csv_string(const string& ival, vector& ovals); extern int parse_key_value(string& in_str, string& key, string& val); diff -Nru ceph-12.2.11/src/rgw/rgw_cr_rados.cc ceph-12.2.12/src/rgw/rgw_cr_rados.cc --- ceph-12.2.11/src/rgw/rgw_cr_rados.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_cr_rados.cc 2019-04-11 12:33:50.000000000 +0000 @@ -1,3 +1,4 @@ +#include "include/compat.h" #include "rgw_rados.h" #include "rgw_coroutine.h" #include "rgw_cr_rados.h" @@ -797,9 +798,10 @@ int RGWSyncLogTrimCR::request_complete() { int r = RGWRadosTimelogTrimCR::request_complete(); - if (r < 0 && r != -ENODATA) { + if (r != -ENODATA) { return r; } + // nothing left to trim, update last_trim_marker if (*last_trim_marker < to_marker) { *last_trim_marker = to_marker; } diff -Nru ceph-12.2.11/src/rgw/rgw_cr_rest.h ceph-12.2.12/src/rgw/rgw_cr_rest.h --- ceph-12.2.11/src/rgw/rgw_cr_rest.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_cr_rest.h 2019-04-11 12:33:50.000000000 +0000 @@ -15,6 +15,8 @@ param_vec_t params; T *result; + param_vec_t extra_headers; +public: boost::intrusive_ptr http_op; public: @@ -25,13 +27,24 @@ path(_path), params(make_param_list(params)), result(_result) {} + RGWReadRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, const string& _path, + rgw_http_param_pair *params, + std::map *hdrs, + T *_result) + : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), + path(_path), params(make_param_list(params)), + result(_result), extra_headers(make_param_list(hdrs)) + {} + + ~RGWReadRESTResourceCR() override { request_cleanup(); } int send_request() override { auto op = boost::intrusive_ptr( - new 
RGWRESTReadResource(conn, path, params, NULL, http_manager)); + new RGWRESTReadResource(conn, path, params, &extra_headers, http_manager)); op->set_user_info((void *)stack); @@ -67,15 +80,17 @@ } }; -template +template class RGWSendRESTResourceCR : public RGWSimpleCoroutine { RGWRESTConn *conn; RGWHTTPManager *http_manager; string method; string path; param_vec_t params; + param_vec_t headers; T *result; - S input; + E *err_result; + bufferlist input_bl; boost::intrusive_ptr http_op; @@ -83,11 +98,18 @@ RGWSendRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, RGWHTTPManager *_http_manager, const string& _method, const string& _path, - rgw_http_param_pair *_params, S& _input, T *_result) + rgw_http_param_pair *_params, map *_attrs, + S& _input, T *_result, E *_err_result = nullptr) : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager), - method(_method), path(_path), params(make_param_list(_params)), result(_result), - input(_input) - {} + method(_method), path(_path), params(make_param_list(_params)), headers(make_param_list(_attrs)), + result(_result), err_result(_err_result) { + JSONFormatter jf; + encode_json("data", _input, &jf); + std::stringstream ss; + jf.flush(ss); + //bufferlist bl; + this->input_bl.append(ss.str()); + } ~RGWSendRESTResourceCR() override { request_cleanup(); @@ -95,18 +117,11 @@ int send_request() override { auto op = boost::intrusive_ptr( - new RGWRESTSendResource(conn, method, path, params, NULL, http_manager)); + new RGWRESTSendResource(conn, method, path, params, &headers, http_manager)); op->set_user_info((void *)stack); - JSONFormatter jf; - encode_json("data", input, &jf); - std::stringstream ss; - jf.flush(ss); - bufferlist bl; - bl.append(ss.str()); - - int ret = op->aio_send(bl); + int ret = op->aio_send(input_bl); if (ret < 0) { lsubdout(cct, rgw, 0) << "ERROR: failed to send request" << dendl; op->put(); @@ -118,8 +133,8 @@ int request_complete() override { int ret; - if (result) { - ret = 
http_op->wait(result); + if (result || err_result) { + ret = http_op->wait(result, err_result); } else { bufferlist bl; ret = http_op->wait_bl(&bl); @@ -145,28 +160,42 @@ } }; -template -class RGWPostRESTResourceCR : public RGWSendRESTResourceCR { +template +class RGWPostRESTResourceCR : public RGWSendRESTResourceCR { public: RGWPostRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, RGWHTTPManager *_http_manager, const string& _path, - rgw_http_param_pair *_params, S& _input, T *_result) - : RGWSendRESTResourceCR(_cct, _conn, _http_manager, + rgw_http_param_pair *_params, S& _input, + T *_result, E *_err_result = nullptr) + : RGWSendRESTResourceCR(_cct, _conn, _http_manager, "POST", _path, - _params, _input, _result) {} + _params, nullptr, _input, _result, _err_result) {} }; -template -class RGWPutRESTResourceCR : public RGWSendRESTResourceCR { +template +class RGWPutRESTResourceCR : public RGWSendRESTResourceCR { public: RGWPutRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, RGWHTTPManager *_http_manager, const string& _path, - rgw_http_param_pair *_params, S& _input, T *_result) - : RGWSendRESTResourceCR(_cct, _conn, _http_manager, - "PUT", _path, - _params, _input, _result) {} + rgw_http_param_pair *_params, S& _input, + T *_result, E *_err_result = nullptr) + : RGWSendRESTResourceCR(_cct, _conn, _http_manager, + "PUT", _path, + _params, nullptr, _input, + _result, _err_result) {} + + RGWPutRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn, + RGWHTTPManager *_http_manager, + const string& _path, + rgw_http_param_pair *_params, + map *_attrs, + S& _input, T *_result, E *_err_result = nullptr) + : RGWSendRESTResourceCR(_cct, _conn, _http_manager, + "PUT", _path, + _params, _attrs, _input, + _result, _err_result) {} }; class RGWDeleteRESTResourceCR : public RGWSimpleCoroutine { diff -Nru ceph-12.2.11/src/rgw/rgw_crypt.cc ceph-12.2.12/src/rgw/rgw_crypt.cc --- ceph-12.2.11/src/rgw/rgw_crypt.cc 2019-01-30 15:51:26.000000000 +0000 +++ 
ceph-12.2.12/src/rgw/rgw_crypt.cc 2019-04-11 12:33:50.000000000 +0000 @@ -31,6 +31,7 @@ #define dout_subsys ceph_subsys_rgw using namespace rgw; +using ceph::crypto::PK11_ImportSymKey_FIPS; /** * Encryption in CTR mode. offset is used as IV for each block. @@ -129,7 +130,7 @@ keyItem.data = key; keyItem.len = AES_256_KEYSIZE; - symkey = PK11_ImportSymKey(slot, CKM_AES_CTR, PK11_OriginUnwrap, CKA_UNWRAP, &keyItem, NULL); + symkey = PK11_ImportSymKey_FIPS(slot, CKM_AES_CTR, PK11_OriginUnwrap, CKA_UNWRAP, &keyItem, NULL); if (symkey) { static_assert(sizeof(ctr_params.cb) >= AES_256_IVSIZE, "Must fit counter"); ctr_params.ulCounterBits = 128; @@ -317,7 +318,7 @@ keyItem.type = siBuffer; keyItem.data = const_cast(&key[0]); keyItem.len = AES_256_KEYSIZE; - symkey = PK11_ImportSymKey(slot, CKM_AES_CBC, PK11_OriginUnwrap, CKA_UNWRAP, &keyItem, NULL); + symkey = PK11_ImportSymKey_FIPS(slot, CKM_AES_CBC, PK11_OriginUnwrap, CKA_UNWRAP, &keyItem, NULL); if (symkey) { memcpy(ctr_params.iv, iv, AES_256_IVSIZE); ivItem.type = siBuffer; @@ -577,7 +578,7 @@ param = PK11_ParamFromIV(CKM_AES_ECB, NULL); if (param) { - symkey = PK11_ImportSymKey(slot, CKM_AES_ECB, PK11_OriginUnwrap, CKA_UNWRAP, &keyItem, NULL); + symkey = PK11_ImportSymKey_FIPS(slot, CKM_AES_ECB, PK11_OriginUnwrap, CKA_UNWRAP, &keyItem, NULL); if (symkey) { ectx = PK11_CreateContextBySymKey(CKM_AES_ECB, CKA_ENCRYPT, symkey, param); if (ectx) { @@ -666,29 +667,28 @@ off_t in_end = bl_end; size_t i = 0; - while (i (off_t)parts_len[i])) { + while (i= (off_t)parts_len[i])) { in_ofs -= parts_len[i]; i++; } //in_ofs is inside block i size_t j = 0; - while (j (off_t)parts_len[j])) { + while (j<(parts_len.size() - 1) && (in_end >= (off_t)parts_len[j])) { in_end -= parts_len[j]; j++; } - //in_end is inside block j + //in_end is inside part j, OR j is the last part - size_t rounded_end; - rounded_end = ( in_end & ~(block_size - 1) ) + (block_size - 1); - if (rounded_end + 1 >= parts_len[j]) { + size_t rounded_end = ( in_end & 
~(block_size - 1) ) + (block_size - 1); + if (rounded_end > parts_len[j]) { rounded_end = parts_len[j] - 1; } enc_begin_skip = in_ofs & (block_size - 1); ofs = bl_ofs - enc_begin_skip; end = bl_end; - bl_ofs = bl_ofs - enc_begin_skip; bl_end += rounded_end - in_end; + bl_ofs = std::min(bl_ofs - enc_begin_skip, bl_end); } else { @@ -703,31 +703,47 @@ return 0; } +int RGWGetObj_BlockDecrypt::process(bufferlist& in, size_t part_ofs, size_t size) +{ + bufferlist data; + if (!crypt->decrypt(in, 0, size, data, part_ofs)) { + return -ERR_INTERNAL_ERROR; + } + off_t send_size = size - enc_begin_skip; + if (ofs + enc_begin_skip + send_size > end + 1) { + send_size = end + 1 - ofs - enc_begin_skip; + } + int res = next->handle_data(data, enc_begin_skip, send_size); + enc_begin_skip = 0; + ofs += size; + in.splice(0, size); + return res; +} int RGWGetObj_BlockDecrypt::handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) { - int res = 0; ldout(cct, 25) << "Decrypt " << bl_len << " bytes" << dendl; + bl.copy(bl_ofs, bl_len, cache); + + int res = 0; size_t part_ofs = ofs; - size_t i = 0; - while (i= parts_len[i])) { - part_ofs -= parts_len[i]; - i++; + for (size_t part : parts_len) { + if (part_ofs >= part) { + part_ofs -= part; + } else if (part_ofs + cache.length() >= part) { + // flush data up to part boundaries, aligned or not + res = process(cache, part_ofs, part - part_ofs); + if (res < 0) { + return res; + } + part_ofs = 0; + } else { + break; + } } - bl.copy(bl_ofs, bl_len, cache); + // write up to block boundaries, aligned only off_t aligned_size = cache.length() & ~(block_size - 1); if (aligned_size > 0) { - bufferlist data; - if (! 
crypt->decrypt(cache, 0, aligned_size, data, part_ofs) ) { - return -ERR_INTERNAL_ERROR; - } - off_t send_size = aligned_size - enc_begin_skip; - if (ofs + enc_begin_skip + send_size > end + 1) { - send_size = end + 1 - ofs - enc_begin_skip; - } - res = next->handle_data(data, enc_begin_skip, send_size); - enc_begin_skip = 0; - ofs += aligned_size; - cache.splice(0, aligned_size); + res = process(cache, part_ofs, aligned_size); } return res; } @@ -736,25 +752,26 @@ * flush remainder of data to output */ int RGWGetObj_BlockDecrypt::flush() { + ldout(cct, 25) << "Decrypt flushing " << cache.length() << " bytes" << dendl; int res = 0; size_t part_ofs = ofs; - size_t i = 0; - while (i parts_len[i])) { - part_ofs -= parts_len[i]; - i++; + for (size_t part : parts_len) { + if (part_ofs >= part) { + part_ofs -= part; + } else if (part_ofs + cache.length() >= part) { + // flush data up to part boundaries, aligned or not + res = process(cache, part_ofs, part - part_ofs); + if (res < 0) { + return res; + } + part_ofs = 0; + } else { + break; + } } + // flush up to block boundaries, aligned or not if (cache.length() > 0) { - bufferlist data; - if (! 
crypt->decrypt(cache, 0, cache.length(), data, part_ofs) ) { - return -ERR_INTERNAL_ERROR; - } - off_t send_size = cache.length() - enc_begin_skip; - if (ofs + enc_begin_skip + send_size > end + 1) { - send_size = end + 1 - ofs - enc_begin_skip; - } - res = next->handle_data(data, enc_begin_skip, send_size); - enc_begin_skip = 0; - ofs += send_size; + res = process(cache, part_ofs, cache.length()); } return res; } diff -Nru ceph-12.2.11/src/rgw/rgw_crypt.h ceph-12.2.12/src/rgw/rgw_crypt.h --- ceph-12.2.11/src/rgw/rgw_crypt.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_crypt.h 2019-04-11 12:33:50.000000000 +0000 @@ -93,6 +93,10 @@ off_t end; /**< stream offset of last byte that is requested */ bufferlist cache; /**< stores extra data that could not (yet) be processed by BlockCrypt */ size_t block_size; /**< snapshot of \ref BlockCrypt.get_block_size() */ + + int process(bufferlist& cipher, size_t part_ofs, size_t size); + +protected: std::vector parts_len; /**< size of parts of multipart object, parsed from manifest */ public: RGWGetObj_BlockDecrypt(CephContext* cct, diff -Nru ceph-12.2.11/src/rgw/rgw_data_sync.cc ceph-12.2.12/src/rgw/rgw_data_sync.cc --- ceph-12.2.11/src/rgw/rgw_data_sync.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_data_sync.cc 2019-04-11 12:33:50.000000000 +0000 @@ -308,24 +308,24 @@ class RGWReadRemoteDataLogShardCR : public RGWCoroutine { RGWDataSyncEnv *sync_env; - RGWRESTReadResource *http_op; + RGWRESTReadResource *http_op = nullptr; int shard_id; - string *pmarker; + const std::string& marker; + string *pnext_marker; list *entries; bool *truncated; read_remote_data_log_response response; public: - RGWReadRemoteDataLogShardCR(RGWDataSyncEnv *_sync_env, - int _shard_id, string *_pmarker, list *_entries, bool *_truncated) : RGWCoroutine(_sync_env->cct), - sync_env(_sync_env), - http_op(NULL), - shard_id(_shard_id), - pmarker(_pmarker), - entries(_entries), - truncated(_truncated) { + 
RGWReadRemoteDataLogShardCR(RGWDataSyncEnv *_sync_env, int _shard_id, + const std::string& marker, string *pnext_marker, + list *_entries, + bool *_truncated) + : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), + shard_id(_shard_id), marker(marker), pnext_marker(pnext_marker), + entries(_entries), truncated(_truncated) { } ~RGWReadRemoteDataLogShardCR() override { if (http_op) { @@ -340,7 +340,7 @@ snprintf(buf, sizeof(buf), "%d", shard_id); rgw_http_param_pair pairs[] = { { "type" , "data" }, { "id", buf }, - { "marker", pmarker->c_str() }, + { "marker", marker.c_str() }, { "extra-info", "true" }, { NULL, NULL } }; @@ -366,7 +366,7 @@ } entries->clear(); entries->swap(response.entries); - *pmarker = response.marker; + *pnext_marker = response.marker; *truncated = response.truncated; return set_cr_done(); } @@ -1112,6 +1112,7 @@ RGWDataSyncShardMarkerTrack *marker_tracker; + std::string next_marker; list log_entries; list::iterator log_iter; bool truncated; @@ -1158,7 +1159,7 @@ public: RGWDataSyncShardCR(RGWDataSyncEnv *_sync_env, rgw_pool& _pool, - uint32_t _shard_id, rgw_data_sync_marker& _marker, bool *_reset_backoff) : RGWCoroutine(_sync_env->cct), + uint32_t _shard_id, const rgw_data_sync_marker& _marker, bool *_reset_backoff) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), pool(_pool), shard_id(_shard_id), @@ -1242,6 +1243,7 @@ if (lease_cr->is_done()) { ldout(cct, 5) << "lease cr failed, done early " << dendl; set_status("lease lock failed, early abort"); + drain_all(); return set_cr_error(lease_cr->get_ret_status()); } set_sleeping(true); @@ -1323,6 +1325,7 @@ if (lease_cr->is_done()) { ldout(cct, 5) << "lease cr failed, done early " << dendl; set_status("lease lock failed, early abort"); + drain_all(); return set_cr_error(lease_cr->get_ret_status()); } set_sleeping(true); @@ -1387,7 +1390,8 @@ #define INCREMENTAL_MAX_ENTRIES 100 ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " sync_marker=" << 
sync_marker.marker << dendl; spawned_keys.clear(); - yield call(new RGWReadRemoteDataLogShardCR(sync_env, shard_id, &sync_marker.marker, &log_entries, &truncated)); + yield call(new RGWReadRemoteDataLogShardCR(sync_env, shard_id, sync_marker.marker, + &next_marker, &log_entries, &truncated)); if (retcode < 0) { ldout(sync_env->cct, 0) << "ERROR: failed to read remote data log info: ret=" << retcode << dendl; stop_spawned_services(); @@ -1430,11 +1434,17 @@ } /* not waiting for child here */ } - } - ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " sync_marker=" << sync_marker.marker << " truncated=" << truncated << dendl; - if (!truncated) { - yield wait(get_idle_interval()); - } + } + ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " sync_marker=" << sync_marker.marker + << " next_marker=" << next_marker << " truncated=" << truncated << dendl; + if (!truncated) { + yield wait(get_idle_interval()); + } + if (!next_marker.empty()) { + sync_marker.marker = next_marker; + } else if (!log_entries.empty()) { + sync_marker.marker = log_entries.back().log_id; + } } while (true); } return 0; @@ -2086,6 +2096,7 @@ rgw_data_sync_marker* sync_marker; int count; + std::string next_marker; list log_entries; bool truncated; @@ -2121,7 +2132,8 @@ marker = sync_marker->marker; count = 0; do{ - yield call(new RGWReadRemoteDataLogShardCR(sync_env, shard_id, &marker, &log_entries, &truncated)); + yield call(new RGWReadRemoteDataLogShardCR(sync_env, shard_id, marker, + &next_marker, &log_entries, &truncated)); if (retcode == -ENOENT) { break; @@ -2229,6 +2241,16 @@ JSONDecoder::decode_json("VersionedEpoch", versioned_epoch, obj); JSONDecoder::decode_json("RgwxTag", rgw_tag, obj); } + + RGWModifyOp get_modify_op() const { + if (delete_marker) { + return CLS_RGW_OP_LINK_OLH_DM; + } else if (!key.instance.empty() && key.instance != "null") { + return CLS_RGW_OP_LINK_OLH; + } else { + return CLS_RGW_OP_ADD; + 
} + } }; struct bucket_list_result { @@ -2607,7 +2629,6 @@ RGWBucketFullSyncShardMarkerTrack marker_tracker; rgw_obj_key list_marker; bucket_list_entry *entry{nullptr}; - RGWModifyOp op{CLS_RGW_OP_ADD}; int total_entries{0}; @@ -2669,12 +2690,11 @@ if (!marker_tracker.start(entry->key, total_entries, real_time())) { ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << entry->key << ". Duplicate entry?" << dendl; } else { - op = (entry->key.instance.empty() || entry->key.instance == "null" ? CLS_RGW_OP_ADD : CLS_RGW_OP_LINK_OLH); using SyncCR = RGWBucketSyncSingleEntryCR; yield spawn(new SyncCR(sync_env, bucket_info, bs, entry->key, false, /* versioned, only matters for object removal */ entry->versioned_epoch, entry->mtime, - entry->owner, op, CLS_RGW_STATE_COMPLETE, + entry->owner, entry->get_modify_op(), CLS_RGW_STATE_COMPLETE, entry->key, &marker_tracker, zones_trace), false); } @@ -3025,6 +3045,7 @@ if (lease_cr->is_done()) { ldout(cct, 5) << "lease cr failed, done early" << dendl; set_status("lease lock failed, early abort"); + drain_all(); return set_cr_error(lease_cr->get_ret_status()); } set_sleeping(true); @@ -3077,6 +3098,14 @@ do { if (sync_status.state == rgw_bucket_shard_sync_info::StateInit) { yield call(new RGWInitBucketShardSyncStatusCoroutine(sync_env, bs, sync_status)); + if (retcode == -ENOENT) { + ldout(sync_env->cct, 0) << "bucket sync disabled" << dendl; + lease_cr->abort(); // deleted lease object, abort/wakeup instead of unlock + lease_cr->wakeup(); + lease_cr.reset(); + drain_all(); + return set_cr_done(); + } if (retcode < 0) { ldout(sync_env->cct, 0) << "ERROR: init sync on " << bucket_shard_str{bs} << " failed, retcode=" << retcode << dendl; @@ -3431,6 +3460,14 @@ return 0; } +RGWCoroutine* create_admin_data_log_trim_cr(RGWRados *store, + RGWHTTPManager *http, + int num_shards, + std::vector& markers) +{ + return new DataLogTrimCR(store, http, num_shards, markers); +} + class DataLogTrimPollCR : public RGWCoroutine { RGWRados 
*store; RGWHTTPManager *http; diff -Nru ceph-12.2.11/src/rgw/rgw_data_sync.h ceph-12.2.12/src/rgw/rgw_data_sync.h --- ceph-12.2.11/src/rgw/rgw_data_sync.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_data_sync.h 2019-04-11 12:33:50.000000000 +0000 @@ -79,7 +79,7 @@ state = StateInit; } JSONDecoder::decode_json("num_shards", num_shards, obj); - JSONDecoder::decode_json("instance_id", num_shards, obj); + JSONDecoder::decode_json("instance_id", instance_id, obj); } static void generate_test_instances(std::list& o); @@ -566,4 +566,10 @@ RGWHTTPManager *http, int num_shards, utime_t interval); +// factory function for datalog trim via radosgw-admin +RGWCoroutine* create_admin_data_log_trim_cr(RGWRados *store, + RGWHTTPManager *http, + int num_shards, + std::vector& markers); + #endif diff -Nru ceph-12.2.11/src/rgw/rgw_es_query.cc ceph-12.2.12/src/rgw/rgw_es_query.cc --- ceph-12.2.11/src/rgw/rgw_es_query.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_es_query.cc 2019-04-11 12:33:50.000000000 +0000 @@ -29,6 +29,7 @@ { "<", 3 }, { "<=", 3 }, { "==", 3 }, + { "!=", 3 }, { ">=", 3 }, { ">", 3 }, }; @@ -301,6 +302,33 @@ } }; +class ESQueryNode_Op_NotEqual : public ESQueryNode_Op { +public: + explicit ESQueryNode_Op_NotEqual(ESQueryCompiler *compiler) : ESQueryNode_Op(compiler) {} + ESQueryNode_Op_NotEqual(ESQueryCompiler *compiler, const string& f, const string& v) : ESQueryNode_Op(compiler) { + op = "!="; + field = f; + str_val = v; + } + + bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) override { + if (op.empty()) { + return ESQueryNode_Op::init(s, pnode, perr); + } + return do_init(pnode, perr); + } + + virtual void dump(Formatter *f) const override { + f->open_object_section("bool"); + f->open_object_section("must_not"); + f->open_object_section("term"); + val->encode_json(field, f); + f->close_section(); + f->close_section(); + f->close_section(); + } +}; + class ESQueryNode_Op_Range : public ESQueryNode_Op { string 
range_str; public: @@ -438,6 +466,8 @@ node = new ESQueryNode_Bool(compiler); } else if (op == "==") { node = new ESQueryNode_Op_Equal(compiler); + } else if (op == "!=") { + node = new ESQueryNode_Op_NotEqual(compiler); } else { static map range_op_map = { { "<", "lt"}, @@ -470,6 +500,7 @@ case ')': case '<': case '>': + case '!': case '@': case ',': case ';': @@ -493,6 +524,7 @@ static bool is_op_char(char c) { switch (c) { + case '!': case '<': case '=': case '>': @@ -534,7 +566,7 @@ * condition: * * whereas key: needs to conform to http header field restrictions - * operator: one of the following: < <= == >= > + * operator: one of the following: < <= == != >= > * val: ascii, terminated by either space or ')' (or end of string) */ diff -Nru ceph-12.2.11/src/rgw/rgw_file.h ceph-12.2.12/src/rgw/rgw_file.h --- ceph-12.2.11/src/rgw/rgw_file.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_file.h 2019-04-11 12:33:50.000000000 +0000 @@ -918,9 +918,8 @@ int authorize(RGWRados* store) { int ret = rgw_get_user_info_by_access_key(store, key.id, user); if (ret == 0) { - RGWAccessKey* key0 = user.get_key0(); - if (!key0 || - (key0->key != key.key)) + RGWAccessKey* k = user.get_key(key.id); + if (!k || (k->key != key.key)) return -EINVAL; if (user.suspended) return -ERR_USER_SUSPENDED; @@ -1291,6 +1290,7 @@ // woo s->user = user; + s->bucket_tenant = user->user_id.tenant; return 0; } @@ -1427,6 +1427,7 @@ // woo s->user = user; + s->bucket_tenant = user->user_id.tenant; prefix = rgw_fh->relative_object_name(); if (prefix.length() > 0) @@ -1439,7 +1440,7 @@ int operator()(const boost::string_ref name, const rgw_obj_key& marker, uint8_t type) { - assert(name.length() > 0); // XXX + assert(name.length() > 0); // all cases handled in callers /* hash offset of name in parent (short name) for NFS readdir cookie */ uint64_t off = XXH64(name.data(), name.length(), fh_key::seed); @@ -1525,6 +1526,12 @@ << " cpref=" << sref << dendl; + if (sref.empty()) { + /* null 
path segment--could be created in S3 but has no NFS + * interpretation */ + return; + } + this->operator()(sref, next_marker, RGW_FS_TYPE_DIRECTORY); ++ix; } @@ -1596,6 +1603,7 @@ s->info.domain = ""; /* XXX ? */ s->user = user; + s->bucket_tenant = user->user_id.tenant; prefix = rgw_fh->relative_object_name(); if (prefix.length() > 0) @@ -1683,6 +1691,7 @@ // woo s->user = user; + s->bucket_tenant = user->user_id.tenant; return 0; } @@ -1746,6 +1755,7 @@ // woo s->user = user; + s->bucket_tenant = user->user_id.tenant; return 0; } @@ -1811,6 +1821,7 @@ // woo s->user = user; + s->bucket_tenant = user->user_id.tenant; return 0; } @@ -1900,6 +1911,7 @@ // woo s->user = user; + s->bucket_tenant = user->user_id.tenant; return 0; } @@ -1985,6 +1997,7 @@ // woo s->user = user; + s->bucket_tenant = user->user_id.tenant; return 0; } @@ -2065,6 +2078,7 @@ // woo s->user = user; + s->bucket_tenant = user->user_id.tenant; return 0; } @@ -2145,6 +2159,7 @@ // woo s->user = user; + s->bucket_tenant = user->user_id.tenant; return 0; } @@ -2215,6 +2230,7 @@ // woo s->user = user; + s->bucket_tenant = user->user_id.tenant; prefix = rgw_fh->relative_object_name(); if (prefix.length() > 0) @@ -2337,6 +2353,7 @@ // woo s->user = user; + s->bucket_tenant = user->user_id.tenant; return 0; } @@ -2466,6 +2483,7 @@ // woo s->user = user; + s->bucket_tenant = user->user_id.tenant; return 0; } @@ -2526,6 +2544,7 @@ // woo s->user = user; + s->bucket_tenant = user->user_id.tenant; return 0; } diff -Nru ceph-12.2.11/src/rgw/rgw_gc.cc ceph-12.2.12/src/rgw/rgw_gc.cc --- ceph-12.2.11/src/rgw/rgw_gc.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_gc.cc 2019-04-11 12:33:50.000000000 +0000 @@ -189,6 +189,7 @@ ctx = new IoCtx; ret = rgw_init_ioctx(store->get_rados_handle(), obj.pool, *ctx); if (ret < 0) { + last_pool = ""; dout(0) << "ERROR: failed to create ioctx pool=" << obj.pool << dendl; continue; } diff -Nru ceph-12.2.11/src/rgw/rgw_iam_policy.h 
ceph-12.2.12/src/rgw/rgw_iam_policy.h --- ceph-12.2.11/src/rgw/rgw_iam_policy.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_iam_policy.h 2019-04-11 12:33:50.000000000 +0000 @@ -99,6 +99,8 @@ static constexpr std::uint64_t s3All = (1ULL << s3Count) - 1; namespace { +// Please update the table in doc/radosgw/s3/authentication.rst if you +// modify this function. inline int op_to_perm(std::uint64_t op) { switch (op) { case s3GetObject: diff -Nru ceph-12.2.11/src/rgw/rgw_ldap.cc ceph-12.2.12/src/rgw/rgw_ldap.cc --- ceph-12.2.11/src/rgw/rgw_ldap.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_ldap.cc 2019-04-11 12:33:50.000000000 +0000 @@ -27,7 +27,7 @@ memset(bindpw, 0, 1024); int pwlen = safe_read_file("" /* base */, ldap_secret.c_str(), bindpw, 1023); - if (pwlen) { + if (pwlen > 0) { ldap_bindpw = bindpw; boost::algorithm::trim(ldap_bindpw); if (ldap_bindpw.back() == '\n') diff -Nru ceph-12.2.11/src/rgw/rgw_loadgen.cc ceph-12.2.12/src/rgw/rgw_loadgen.cc --- ceph-12.2.11/src/rgw/rgw_loadgen.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_loadgen.cc 2019-04-11 12:33:50.000000000 +0000 @@ -29,6 +29,7 @@ content_type.c_str(), date_str.c_str(), meta_map, + map{}, uri.c_str(), sub_resources, canonical_header); diff -Nru ceph-12.2.11/src/rgw/rgw_op.cc ceph-12.2.12/src/rgw/rgw_op.cc --- ceph-12.2.11/src/rgw/rgw_op.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_op.cc 2019-04-11 12:33:50.000000000 +0000 @@ -4238,9 +4238,9 @@ rgw::IAM::s3DeleteObjectVersion, ARN(s->bucket, s->object.name)); if (r == Effect::Allow) - return true; + return 0; else if (r == Effect::Deny) - return false; + return -EACCES; } if (!verify_bucket_permission_no_policy(s, RGW_PERM_WRITE)) { @@ -4625,6 +4625,9 @@ rgw::IAM::s3GetObjectAcl : rgw::IAM::s3GetObjectVersionAcl); } else { + if (!s->bucket_exists) { + return -ERR_NO_SUCH_BUCKET; + } perm = verify_bucket_permission(s, rgw::IAM::s3GetBucketAcl); } if (!perm) diff 
-Nru ceph-12.2.11/src/rgw/rgw_rados.cc ceph-12.2.12/src/rgw/rgw_rados.cc --- ceph-12.2.11/src/rgw/rgw_rados.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_rados.cc 2019-04-11 12:33:50.000000000 +0000 @@ -6796,6 +6796,23 @@ return 0; } +int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, + const rgw_obj& obj) +{ + bucket = bucket_info.bucket; + + int ret = store->open_bucket_index_shard(bucket_info, index_ctx, + obj.get_hash_object(), &bucket_obj, + &shard_id); + if (ret < 0) { + ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl; + return ret; + } + ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl; + + return 0; +} + int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid) { bucket = bucket_info.bucket; @@ -7563,6 +7580,15 @@ src_attrs.erase(RGW_ATTR_COMPRESSION); src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout + + // filter out olh attributes + auto iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX); + while (iter != src_attrs.end()) { + if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) { + break; + } + iter = src_attrs.erase(iter); + } } if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) { @@ -11412,6 +11438,61 @@ return 0; } +// a multisite sync bug resulted in the OLH head attributes being overwritten by +// the attributes from another zone, causing link_olh() to fail endlessly due to +// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH +// attributes from the bucket index. 
see http://tracker.ceph.com/issues/37792 +int RGWRados::repair_olh(RGWObjState* state, const RGWBucketInfo& bucket_info, + const rgw_obj& obj) +{ + // fetch the current olh entry from the bucket index + rgw_bucket_olh_entry olh; + int r = bi_get_olh(bucket_info, obj, &olh); + if (r < 0) { + ldout(cct, 0) << "repair_olh failed to read olh entry for " << obj << dendl; + return r; + } + if (olh.tag == state->olh_tag.to_str()) { // mismatch already resolved? + return 0; + } + + ldout(cct, 4) << "repair_olh setting olh_tag=" << olh.tag + << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl; + + // rewrite OLH_ID_TAG and OLH_INFO from current olh + ObjectWriteOperation op; + // assert this is the same olh tag we think we're fixing + bucket_index_guard_olh_op(*state, op); + // preserve existing mtime + struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime); + op.mtime2(&mtime_ts); + { + bufferlist bl; + bl.append(olh.tag.c_str(), olh.tag.size()); + op.setxattr(RGW_ATTR_OLH_ID_TAG, bl); + } + { + RGWOLHInfo info; + info.target = rgw_obj(bucket_info.bucket, olh.key); + info.removed = olh.delete_marker; + bufferlist bl; + encode(info, bl); + op.setxattr(RGW_ATTR_OLH_INFO, bl); + } + rgw_rados_ref ref; + r = get_obj_head_ref(bucket_info, obj, &ref); + if (r < 0) { + return r; + } + r = ref.ioctx.operate(ref.oid, &op); + if (r < 0) { + ldout(cct, 0) << "repair_olh failed to write olh attributes with " + << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver) { rgw_rados_ref ref; @@ -11493,6 +11574,11 @@ op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag); op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver); + bufferlist ver_bl; + string last_ver_s = to_string(last_ver); + ver_bl.append(last_ver_s.c_str(), last_ver_s.size()); + op.setxattr(RGW_ATTR_OLH_VER, ver_bl); + struct 
timespec mtime_ts = real_clock::to_timespec(state.mtime); op.mtime2(&mtime_ts); @@ -11584,7 +11670,7 @@ ObjectWriteOperation rm_op; rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag); - rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver); + rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver); cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */ rm_op.remove(); @@ -11668,6 +11754,12 @@ if (ret < 0) { ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl; if (ret == -ECANCELED) { + // the bucket index rejected the link_olh() due to olh tag mismatch; + // attempt to reconstruct olh head attributes based on the bucket index + int r2 = repair_olh(state, bucket_info, olh_obj); + if (r2 < 0 && r2 != -ECANCELED) { + return r2; + } continue; } return ret; @@ -11966,7 +12058,7 @@ int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver, map& stats, string *max_marker, bool *syncstopped) { - map headers; + vector headers; map bucket_instance_ids; int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids); if (r < 0) { @@ -11975,25 +12067,25 @@ assert(headers.size() == bucket_instance_ids.size()); - map::iterator iter = headers.begin(); + auto iter = headers.begin(); map::iterator viter = bucket_instance_ids.begin(); BucketIndexShardsManager ver_mgr; BucketIndexShardsManager master_ver_mgr; BucketIndexShardsManager marker_mgr; char buf[64]; for(; iter != headers.end(); ++iter, ++viter) { - accumulate_raw_stats(iter->second, stats); - snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.ver); + accumulate_raw_stats(*iter, stats); + snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver); ver_mgr.add(viter->first, string(buf)); - snprintf(buf, sizeof(buf), "%lu", (unsigned 
long)iter->second.master_ver); + snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver); master_ver_mgr.add(viter->first, string(buf)); if (shard_id >= 0) { - *max_marker = iter->second.max_marker; + *max_marker = iter->max_marker; } else { - marker_mgr.add(viter->first, iter->second.max_marker); + marker_mgr.add(viter->first, iter->max_marker); } if (syncstopped != NULL) - *syncstopped = iter->second.syncstopped; + *syncstopped = iter->syncstopped; } ver_mgr.to_string(bucket_ver); master_ver_mgr.to_string(master_ver); @@ -12006,7 +12098,7 @@ int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id, map& markers) { - map headers; + vector headers; map bucket_instance_ids; int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids); if (r < 0) @@ -12014,14 +12106,14 @@ assert(headers.size() == bucket_instance_ids.size()); - map::iterator iter = headers.begin(); + auto iter = headers.begin(); map::iterator viter = bucket_instance_ids.begin(); for(; iter != headers.end(); ++iter, ++viter) { if (shard_id >= 0) { - markers[shard_id] = iter->second.max_marker; + markers[shard_id] = iter->max_marker; } else { - markers[viter->first] = iter->second.max_marker; + markers[viter->first] = iter->max_marker; } } return 0; @@ -12573,7 +12665,7 @@ ent.size = 0; ent.size_rounded = 0; - map headers; + vector headers; RGWBucketInfo bucket_info; int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL); @@ -12585,11 +12677,11 @@ if (r < 0) return r; - map::iterator hiter = headers.begin(); + auto hiter = headers.begin(); for (; hiter != headers.end(); ++hiter) { RGWObjCategory category = main_category; - map::iterator iter = (hiter->second.stats).find((uint8_t)category); - if (iter != hiter->second.stats.end()) { + map::iterator iter = (hiter->stats).find((uint8_t)category); + if (iter != hiter->stats.end()) { struct rgw_bucket_category_stats& stats = iter->second; ent.count += stats.num_entries; ent.size += 
stats.total_size; @@ -12950,25 +13042,42 @@ return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); } -int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent) +int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj, + rgw_bucket_dir_entry *dirent) { - rgw_rados_ref ref; - int r = get_obj_head_ref(bucket_info, obj, &ref); + rgw_cls_bi_entry bi_entry; + int r = bi_get(bucket_info, obj, InstanceIdx, &bi_entry); + if (r < 0 && r != -ENOENT) { + ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl; + } if (r < 0) { return r; } + bufferlist::iterator iter = bi_entry.data.begin(); + try { + ::decode(*dirent, iter); + } catch (buffer::error& err) { + ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl; + return -EIO; + } + + return 0; +} +int RGWRados::bi_get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, + rgw_bucket_olh_entry *olh) +{ rgw_cls_bi_entry bi_entry; - r = bi_get(obj.bucket, obj, InstanceIdx, &bi_entry); + int r = bi_get(bucket_info, obj, OLHIdx, &bi_entry); if (r < 0 && r != -ENOENT) { ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl; } if (r < 0) { return r; } - bufferlist::iterator iter = bi_entry.data.begin(); + auto iter = bi_entry.data.begin(); try { - ::decode(*dirent, iter); + decode(*olh, iter); } catch (buffer::error& err) { ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl; return -EIO; @@ -12977,10 +13086,11 @@ return 0; } -int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry) +int RGWRados::bi_get(const RGWBucketInfo& bucket_info, const rgw_obj& obj, + BIIndexType index_type, rgw_cls_bi_entry *entry) { BucketShard bs(this); - int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */); + int ret = bs.init(bucket_info, obj); if (ret < 0) { ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl; return ret; @@ 
-12988,11 +13098,7 @@ cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance); - ret = cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry); - if (ret < 0) - return ret; - - return 0; + return cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry); } void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry) @@ -13641,7 +13747,7 @@ return 0; } -int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map& headers, map *bucket_instance_ids) +int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, vector& headers, map *bucket_instance_ids) { librados::IoCtx index_ctx; map oids; @@ -13656,7 +13762,7 @@ map::iterator iter = list_results.begin(); for(; iter != list_results.end(); ++iter) { - headers[oids[iter->first]] = iter->second.dir.header; + headers.push_back(std::move(iter->second.dir.header)); } return 0; } @@ -13745,7 +13851,7 @@ int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info) { - map headers; + vector headers; int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers); if (r < 0) { ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl; @@ -13757,7 +13863,7 @@ bucket_info.bucket.convert(&entry.bucket); for (const auto& hiter : headers) { - for (const auto& iter : hiter.second.stats) { + for (const auto& iter : hiter.stats) { const struct rgw_bucket_category_stats& header_stats = iter.second; entry.size += header_stats.total_size; entry.size_rounded += header_stats.total_size_rounded; @@ -13779,7 +13885,7 @@ int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry) { - map headers; + vector headers; RGWBucketInfo bucket_info; RGWObjectCtx obj_ctx(this); int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL); @@ -13796,7 +13902,7 @@ bucket.convert(&entry.bucket); for (const auto& hiter : headers) { - for (const auto& iter : 
hiter.second.stats) { + for (const auto& iter : hiter.stats) { const struct rgw_bucket_category_stats& header_stats = iter.second; entry.size += header_stats.total_size; entry.size_rounded += header_stats.total_size_rounded; diff -Nru ceph-12.2.11/src/rgw/rgw_rados.h ceph-12.2.12/src/rgw/rgw_rados.h --- ceph-12.2.11/src/rgw/rgw_rados.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_rados.h 2019-04-11 12:33:50.000000000 +0000 @@ -2710,6 +2710,7 @@ explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {} int init(const rgw_bucket& _bucket, const rgw_obj& obj, RGWBucketInfo* out); int init(const rgw_bucket& _bucket, int sid, RGWBucketInfo* out); + int init(const RGWBucketInfo& bucket_info, const rgw_obj& obj); int init(const RGWBucketInfo& bucket_info, int sid); }; @@ -3384,6 +3385,8 @@ int set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta, uint64_t olh_epoch, ceph::real_time unmod_since, bool high_precision_time, rgw_zone_set *zones_trace = nullptr, bool log_data_change = false); + int repair_olh(RGWObjState* state, const RGWBucketInfo& bucket_info, + const rgw_obj& obj); int unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, uint64_t olh_epoch, rgw_zone_set *zones_trace = nullptr); @@ -3542,7 +3545,7 @@ vector& ent_list, bool *is_truncated, rgw_obj_index_key *last_entry, bool (*force_check_filter)(const string& name) = nullptr); - int cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map& headers, map *bucket_instance_ids = NULL); + int cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, vector& headers, map *bucket_instance_ids = NULL); int cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio); int list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max, std::list& result, bool 
*truncated); int trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, string& end_marker); @@ -3550,8 +3553,9 @@ int stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id); int get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id, map& max_marker); - int bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent); - int bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry); + int bi_get_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_dir_entry *dirent); + int bi_get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_olh_entry *olh); + int bi_get(const RGWBucketInfo& bucket_info, const rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry); void bi_put(librados::ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry); int bi_put(BucketShard& bs, rgw_cls_bi_entry& entry); int bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry); diff -Nru ceph-12.2.11/src/rgw/rgw_reshard.cc ceph-12.2.12/src/rgw/rgw_reshard.cc --- ceph-12.2.11/src/rgw/rgw_reshard.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_reshard.cc 2019-04-11 12:33:50.000000000 +0000 @@ -727,6 +727,11 @@ "\"created after successful resharding with error " << ret << dendl; } + ldout(store->ctx(), 1) << __func__ << + " INFO: reshard of bucket \"" << bucket_info.bucket.name << "\" from \"" << + bucket_info.bucket.get_key() << "\" to \"" << + new_bucket_info.bucket.get_key() << "\" completed successfully" << dendl; + return 0; error_out: diff -Nru ceph-12.2.11/src/rgw/rgw_rest_client.cc ceph-12.2.12/src/rgw/rgw_rest_client.cc --- ceph-12.2.11/src/rgw/rgw_rest_client.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_rest_client.cc 2019-04-11 12:33:50.000000000 +0000 @@ -123,7 +123,7 @@ map meta_map; map sub_resources; rgw_create_s3_canonical_header(method, NULL, NULL, date_str.c_str(), - meta_map, 
new_url.c_str(), sub_resources, + meta_map, meta_map, new_url.c_str(), sub_resources, canonical_header); string digest; diff -Nru ceph-12.2.11/src/rgw/rgw_rest_conn.h ceph-12.2.12/src/rgw/rgw_rest_conn.h --- ceph-12.2.11/src/rgw/rgw_rest_conn.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_rest_conn.h 2019-04-11 12:33:50.000000000 +0000 @@ -49,6 +49,18 @@ return params; } +inline param_vec_t make_param_list(const map *pp) +{ + param_vec_t params; + if (!pp) { + return params; + } + for (auto iter : *pp) { + params.emplace_back(make_pair(iter.first, iter.second)); + } + return params; +} + class RGWRESTConn { CephContext *cct; @@ -304,8 +316,8 @@ return req.get_user_info(); } - template - int decode_resource(T *dest); + template + int decode_resource(T *dest, E *err_result); int send(bufferlist& bl); @@ -332,17 +344,25 @@ return 0; } - template - int wait(T *dest); + template + int wait(T *dest, E *err_result = nullptr); }; -template -int RGWRESTSendResource::decode_resource(T *dest) +template +int RGWRESTSendResource::decode_resource(T *dest, E *err_result) { int ret = req.get_status(); if (ret < 0) { + if (err_result) { + parse_decode_json(cct, *err_result, bl); + } return ret; } + + if (!dest) { + return 0; + } + ret = parse_decode_json(cct, *dest, bl); if (ret < 0) { return ret; @@ -350,15 +370,17 @@ return 0; } -template -int RGWRESTSendResource::wait(T *dest) -{ +template +int RGWRESTSendResource::wait(T *dest, E *err_result){ int ret = req.wait(); if (ret < 0) { + if (err_result) { + parse_decode_json(cct, *err_result, bl); + } return ret; } - ret = decode_resource(dest); + ret = decode_resource(dest, err_result); if (ret < 0) { return ret; } diff -Nru ceph-12.2.11/src/rgw/rgw_rest_s3.cc ceph-12.2.12/src/rgw/rgw_rest_s3.cc --- ceph-12.2.11/src/rgw/rgw_rest_s3.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_rest_s3.cc 2019-04-11 12:33:50.000000000 +0000 @@ -352,14 +352,11 @@ res = rgw_s3_prepare_decrypt(s, attrs, 
&block_crypt, crypt_http_responses); if (res == 0) { if (block_crypt != nullptr) { - auto f = std::unique_ptr(new RGWGetObj_BlockDecrypt(s->cct, cb, std::move(block_crypt))); - //RGWGetObj_BlockDecrypt* f = new RGWGetObj_BlockDecrypt(s->cct, cb, std::move(block_crypt)); - if (f != nullptr) { - if (manifest_bl != nullptr) { - res = f->read_manifest(*manifest_bl); - if (res == 0) { - *filter = std::move(f); - } + auto f = ceph::make_unique(s->cct, cb, std::move(block_crypt)); + if (manifest_bl != nullptr) { + res = f->read_manifest(*manifest_bl); + if (res == 0) { + *filter = std::move(f); } } } @@ -1243,6 +1240,19 @@ } } +static inline void map_qs_metadata(struct req_state* s) +{ + /* merge S3 valid user metadata from the query-string into + * x_meta_map, which maps them to attributes */ + const auto& params = const_cast(s->info.args).get_params(); + for (const auto& elt : params) { + std::string k = boost::algorithm::to_lower_copy(elt.first); + if (k.find("x-amz-meta-") == /* offset */ 0) { + add_amz_meta_header(s->info.x_meta_map, k, elt.second); + } + } +} + int RGWPutObj_ObjStore_S3::get_params() { if (!s->length) @@ -1253,6 +1263,8 @@ size_t pos; int ret; + map_qs_metadata(s); + RGWAccessControlPolicy_S3 s3policy(s->cct); ret = create_s3_policy(s, store, s3policy, s->owner); if (ret < 0) @@ -1547,6 +1559,8 @@ return op_ret; } + map_qs_metadata(s); + ldout(s->cct, 20) << "adding bucket to policy env: " << s->bucket.name << dendl; env.add_var("bucket", s->bucket.name); @@ -2539,6 +2553,8 @@ return ret; } + map_qs_metadata(s); + return do_aws4_auth_completion(); } @@ -3736,13 +3752,13 @@ boost::string_view credential_scope; boost::string_view client_signature; - int ret = rgw::auth::s3::parse_credentials(s->info, - access_key_id, - credential_scope, - signed_hdrs, - client_signature, - date, - using_qs); + int ret = rgw::auth::s3::parse_v4_credentials(s->info, + access_key_id, + credential_scope, + signed_hdrs, + client_signature, + date, + using_qs); if (ret < 0) 
{ throw ret; } @@ -4059,6 +4075,11 @@ void rgw::auth::s3::LDAPEngine::init(CephContext* const cct) { + if (! cct->_conf->rgw_s3_auth_use_ldap || + cct->_conf->rgw_ldap_uri.empty()) { + return; + } + if (! ldh) { std::lock_guard lck(mtx); if (! ldh) { @@ -4078,6 +4099,11 @@ } } +bool rgw::auth::s3::LDAPEngine::valid() { + std::lock_guard lck(mtx); + return (!!ldh); +} + rgw::auth::RemoteApplier::acl_strategy_t rgw::auth::s3::LDAPEngine::get_acl_strategy() const { diff -Nru ceph-12.2.11/src/rgw/rgw_rest_s3.h ceph-12.2.12/src/rgw/rgw_rest_s3.h --- ceph-12.2.11/src/rgw/rgw_rest_s3.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_rest_s3.h 2019-04-11 12:33:50.000000000 +0000 @@ -847,6 +847,8 @@ const char* get_name() const noexcept override { return "rgw::auth::s3::LDAPEngine"; } + + static bool valid(); }; diff -Nru ceph-12.2.11/src/rgw/rgw_sync_module.cc ceph-12.2.12/src/rgw/rgw_sync_module.cc --- ceph-12.2.11/src/rgw/rgw_sync_module.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_sync_module.cc 2019-04-11 12:33:50.000000000 +0000 @@ -35,8 +35,8 @@ return set_cr_error(retcode); } ldout(sync_env->cct, 20) << "stat of remote obj: z=" << sync_env->source_zone - << " b=" << bucket_info.bucket << " k=" << key << " size=" << size << " mtime=" << mtime - << " attrs=" << attrs << dendl; + << " b=" << bucket_info.bucket << " k=" << key + << " size=" << size << " mtime=" << mtime << dendl; yield { RGWStatRemoteObjCBCR *cb = allocate_callback(); if (cb) { diff -Nru ceph-12.2.11/src/rgw/rgw_sync_module_es.cc ceph-12.2.12/src/rgw/rgw_sync_module_es.cc --- ceph-12.2.11/src/rgw/rgw_sync_module_es.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_sync_module_es.cc 2019-04-11 12:33:50.000000000 +0000 @@ -1,3 +1,7 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "rgw_b64.h" #include "rgw_common.h" #include "rgw_coroutine.h" #include "rgw_sync_module.h" @@ -103,6 
+107,55 @@ #define ES_NUM_SHARDS_DEFAULT 16 #define ES_NUM_REPLICAS_DEFAULT 1 +using ESVersion = std::pair; +static constexpr ESVersion ES_V5{5,0}; + +struct ESInfo { + std::string name; + std::string cluster_name; + std::string cluster_uuid; + ESVersion version; + + void decode_json(JSONObj *obj); + + std::string get_version_str(){ + return std::to_string(version.first) + "." + std::to_string(version.second); + } +}; + +// simple wrapper structure to wrap the es version nested type +struct es_version_decoder { + ESVersion version; + + int parse_version(const std::string& s) { + int major, minor; + int ret = sscanf(s.c_str(), "%d.%d", &major, &minor); + if (ret < 0) { + return ret; + } + version = std::make_pair(major,minor); + return 0; + } + + void decode_json(JSONObj *obj) { + std::string s; + JSONDecoder::decode_json("number",s,obj); + if (parse_version(s) < 0) + throw JSONDecoder::err("Failed to parse ElasticVersion"); + } +}; + + +void ESInfo::decode_json(JSONObj *obj) +{ + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("cluster_name", cluster_name, obj); + JSONDecoder::decode_json("cluster_uuid", cluster_uuid, obj); + es_version_decoder esv; + JSONDecoder::decode_json("version", esv, obj); + version = std::move(esv.version); +} + struct ElasticConfig { uint64_t sync_instance{0}; string id; @@ -114,6 +167,7 @@ ItemList allow_owners; uint32_t num_shards{0}; uint32_t num_replicas{0}; + std::map default_headers = {{ "Content-Type", "application/json" }}; void init(CephContext *cct, const map& config) { string elastic_endpoint = rgw_conf_get(config, "endpoint", ""); @@ -128,6 +182,12 @@ num_shards = ES_NUM_SHARDS_MIN; } num_replicas = rgw_conf_get_int(config, "num_replicas", ES_NUM_REPLICAS_DEFAULT); + string user = rgw_conf_get(config, "username", ""); + string pw = rgw_conf_get(config, "password", ""); + if (!user.empty() && !pw.empty()) { + auto auth_string = user + ":" + pw; + default_headers.emplace("AUTHORIZATION", "Basic " + 
rgw::to_base64(auth_string)); + } } void init_instance(RGWRealm& realm, uint64_t instance_id) { @@ -148,6 +208,10 @@ return index_path; } + map& get_request_headers() { + return default_headers; + } + string get_obj_path(const RGWBucketInfo& bucket_info, const rgw_obj_key& key) { return index_path + "/object/" + url_encode(bucket_info.bucket.bucket_id + ":" + key.name + ":" + (key.instance.empty() ? "null" : key.instance)); } @@ -160,58 +224,146 @@ using ElasticConfigRef = std::shared_ptr; -struct es_dump_type { - const char *type; - const char *format; - bool analyzed; +static const char *es_type_to_str(const ESType& t) { + switch (t) { + case ESType::String: return "string"; + case ESType::Text: return "text"; + case ESType::Keyword: return "keyword"; + case ESType::Long: return "long"; + case ESType::Integer: return "integer"; + case ESType::Short: return "short"; + case ESType::Byte: return "byte"; + case ESType::Double: return "double"; + case ESType::Float: return "float"; + case ESType::Half_Float: return "half_float"; + case ESType::Scaled_Float: return "scaled_float"; + case ESType::Date: return "date"; + case ESType::Boolean: return "boolean"; + case ESType::Integer_Range: return "integer_range"; + case ESType::Float_Range: return "float_range"; + case ESType::Double_Range: return "date_range"; + case ESType::Date_Range: return "date_range"; + case ESType::Geo_Point: return "geo_point"; + case ESType::Ip: return "ip"; + default: + return ""; + } +} - es_dump_type(const char *t, const char *f = nullptr, bool a = false) : type(t), format(f), analyzed(a) {} +struct es_type_v2 { + ESType estype; + const char *format{nullptr}; + boost::optional analyzed; + + es_type_v2(ESType et) : estype(et) {} void dump(Formatter *f) const { - encode_json("type", type, f); + const char *type_str = es_type_to_str(estype); + encode_json("type", type_str, f); if (format) { encode_json("format", format, f); } - if (!analyzed && strcmp(type, "string") == 0) { - 
encode_json("index", "not_analyzed", f); + + auto is_analyzed = analyzed; + + if (estype == ESType::String && + !is_analyzed) { + is_analyzed = false; + } + + if (is_analyzed) { + encode_json("index", (is_analyzed.value() ? "analyzed" : "not_analyzed"), f); } } }; +struct es_type_v5 { + ESType estype; + const char *format{nullptr}; + boost::optional analyzed; + boost::optional index; + + es_type_v5(ESType et) : estype(et) {} + + void dump(Formatter *f) const { + ESType new_estype; + if (estype != ESType::String) { + new_estype = estype; + } else { + bool is_analyzed = analyzed.value_or(false); + new_estype = (is_analyzed ? ESType::Text : ESType::Keyword); + /* index = true; ... Not setting index=true, because that's the default, + * and dumping a boolean value *might* be a problem when backporting this + * because value might get quoted + */ + } + + const char *type_str = es_type_to_str(new_estype); + encode_json("type", type_str, f); + if (format) { + encode_json("format", format, f); + } + if (index) { + encode_json("index", index.value(), f); + } + } +}; + +template +struct es_type : public T { + es_type(T t) : T(t) {} + es_type& set_format(const char *f) { + T::format = f; + return *this; + } + + es_type& set_analyzed(bool a) { + T::analyzed = a; + return *this; + } +}; + +template struct es_index_mappings { - void dump_custom(Formatter *f, const char *section, const char *type, const char *format) const { + ESType string_type {ESType::String}; + + es_type est(ESType t) const { + return es_type(t); + } + + void dump_custom(const char *section, ESType type, const char *format, Formatter *f) const { f->open_object_section(section); ::encode_json("type", "nested", f); f->open_object_section("properties"); - encode_json("name", es_dump_type("string"), f); - encode_json("value", es_dump_type(type, format), f); + encode_json("name", est(string_type), f); + encode_json("value", est(type).set_format(format), f); f->close_section(); // entry f->close_section(); // 
custom-string } + void dump(Formatter *f) const { f->open_object_section("object"); f->open_object_section("properties"); - encode_json("bucket", es_dump_type("string"), f); - encode_json("name", es_dump_type("string"), f); - encode_json("instance", es_dump_type("string"), f); - encode_json("versioned_epoch", es_dump_type("long"), f); + encode_json("bucket", est(string_type), f); + encode_json("name", est(string_type), f); + encode_json("instance", est(string_type), f); + encode_json("versioned_epoch", est(ESType::Long), f); f->open_object_section("meta"); f->open_object_section("properties"); - encode_json("cache_control", es_dump_type("string"), f); - encode_json("content_disposition", es_dump_type("string"), f); - encode_json("content_encoding", es_dump_type("string"), f); - encode_json("content_language", es_dump_type("string"), f); - encode_json("content_type", es_dump_type("string"), f); - encode_json("etag", es_dump_type("string"), f); - encode_json("expires", es_dump_type("string"), f); - f->open_object_section("mtime"); - ::encode_json("type", "date", f); - ::encode_json("format", "strict_date_optional_time||epoch_millis", f); - f->close_section(); // mtime - encode_json("size", es_dump_type("long"), f); - dump_custom(f, "custom-string", "string", nullptr); - dump_custom(f, "custom-int", "long", nullptr); - dump_custom(f, "custom-date", "date", "strict_date_optional_time||epoch_millis"); + encode_json("cache_control", est(string_type), f); + encode_json("content_disposition", est(string_type), f); + encode_json("content_encoding", est(string_type), f); + encode_json("content_language", est(string_type), f); + encode_json("content_type", est(string_type), f); + encode_json("storage_class", est(string_type), f); + encode_json("etag", est(string_type), f); + encode_json("expires", est(string_type), f); + encode_json("mtime", est(ESType::Date) + .set_format("strict_date_optional_time||epoch_millis"), f); + encode_json("size", est(ESType::Long), f); + 
dump_custom("custom-string", string_type, nullptr, f); + dump_custom("custom-int", ESType::Long, nullptr, f); + dump_custom("custom-date", ESType::Date, "strict_date_optional_time||epoch_millis", f); f->close_section(); // properties f->close_section(); // meta f->close_section(); // properties @@ -231,11 +383,17 @@ } }; -struct es_index_config { +struct es_index_config_base { + virtual ~es_index_config_base() {} + virtual void dump(Formatter *f) const = 0; +}; + +template +struct es_index_config : public es_index_config_base { es_index_settings settings; - es_index_mappings mappings; + es_index_mappings mappings; - es_index_config(es_index_settings& _s, es_index_mappings& _m) : settings(_s), mappings(_m) {} + es_index_config(es_index_settings& _s) : settings(_s) {} void dump(Formatter *f) const { encode_json("settings", settings, f); @@ -256,6 +414,16 @@ return std::find(rgw_sys_attrs.begin(), rgw_sys_attrs.end(), attr_name) != rgw_sys_attrs.end(); } +static size_t attr_len(const bufferlist& val) +{ + size_t len = val.length(); + if (len && val[len - 1] == '\0') { + --len; + } + + return len; +} + struct es_obj_metadata { CephContext *cct; ElasticConfigRef es_conf; @@ -282,17 +450,22 @@ const string& attr_name = i.first; bufferlist& val = i.second; - if (attr_name.compare(0, sizeof(RGW_ATTR_PREFIX) - 1, RGW_ATTR_PREFIX) != 0) { + if (!boost::algorithm::starts_with(attr_name, RGW_ATTR_PREFIX)) { continue; } - if (attr_name.compare(0, sizeof(RGW_ATTR_META_PREFIX) - 1, RGW_ATTR_META_PREFIX) == 0) { + if (boost::algorithm::starts_with(attr_name, RGW_ATTR_META_PREFIX)) { custom_meta.emplace(attr_name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1), - string(val.c_str(), (val.length() > 0 ? 
val.length() - 1 : 0))); + string(val.c_str(), attr_len(val))); + continue; + } + + if (boost::algorithm::starts_with(attr_name, RGW_ATTR_CRYPT_PREFIX)) { continue; } - if (attr_name.compare(0, sizeof(RGW_ATTR_CRYPT_PREFIX) -1, RGW_ATTR_CRYPT_PREFIX) == 0) { + if (boost::algorithm::starts_with(attr_name, RGW_ATTR_OLH_PREFIX)) { + // skip versioned object olh info continue; } @@ -341,13 +514,16 @@ } else { if (!is_sys_attr(attr_name)) { out_attrs.emplace(attr_name.substr(sizeof(RGW_ATTR_PREFIX) - 1), - std::string(val.c_str(), (val.length() > 0 ? val.length() - 1 : 0))); + std::string(val.c_str(), attr_len(val))); } } } ::encode_json("bucket", bucket_info.bucket.name, f); ::encode_json("name", key.name, f); - ::encode_json("instance", key.instance, f); + string instance = key.instance; + if (instance.empty()) + instance = "null"; + ::encode_json("instance", instance, f); ::encode_json("versioned_epoch", versioned_epoch, f); ::encode_json("owner", policy.get_owner(), f); ::encode_json("permissions", permissions, f); @@ -449,6 +625,28 @@ class RGWElasticInitConfigCBCR : public RGWCoroutine { RGWDataSyncEnv *sync_env; ElasticConfigRef conf; + ESInfo es_info; + + struct _err_response { + struct err_reason { + vector root_cause; + string type; + string reason; + string index; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("root_cause", root_cause, obj); + JSONDecoder::decode_json("type", type, obj); + JSONDecoder::decode_json("reason", reason, obj); + JSONDecoder::decode_json("index", index, obj); + } + } error; + + void decode_json(JSONObj *obj) { + JSONDecoder::decode_json("error", error, obj); + } + } err_response; + public: RGWElasticInitConfigCBCR(RGWDataSyncEnv *_sync_env, ElasticConfigRef _conf) : RGWCoroutine(_sync_env->cct), @@ -457,21 +655,46 @@ int operate() override { reenter(this) { ldout(sync_env->cct, 0) << ": init elasticsearch config zone=" << sync_env->source_zone << dendl; + yield call(new RGWReadRESTResourceCR (sync_env->cct, + 
conf->conn.get(), + sync_env->http_manager, + "/", nullptr /*params*/, + &(conf->default_headers), + &es_info)); + if (retcode < 0) { + return set_cr_error(retcode); + } + yield { string path = conf->get_index_path(); + ldout(sync_env->cct, 5) << "got elastic version=" << es_info.get_version_str() << dendl; es_index_settings settings(conf->num_replicas, conf->num_shards); - es_index_mappings mappings; - es_index_config index_conf(settings, mappings); + std::unique_ptr index_conf; - call(new RGWPutRESTResourceCR(sync_env->cct, conf->conn.get(), - sync_env->http_manager, - path, nullptr /* params */, - index_conf, nullptr /* result */)); + if (es_info.version >= ES_V5) { + ldout(sync_env->cct, 0) << "elasticsearch: index mapping: version >= 5" << dendl; + index_conf.reset(new es_index_config(settings)); + } else { + ldout(sync_env->cct, 0) << "elasticsearch: index mapping: version < 5" << dendl; + index_conf.reset(new es_index_config(settings)); + } + call(new RGWPutRESTResourceCR (sync_env->cct, + conf->conn.get(), + sync_env->http_manager, + path, nullptr /*params*/, + &(conf->default_headers), + *index_conf, nullptr, &err_response)); } if (retcode < 0) { - return set_cr_error(retcode); + ldout(sync_env->cct, 0) << "elasticsearch: failed to initialize index: response.type=" << err_response.error.type << " response.reason=" << err_response.error.reason << dendl; + + if (err_response.error.type != "index_already_exists_exception") { + return set_cr_error(retcode); + } + + ldout(sync_env->cct, 0) << "elasticsearch: index already exists, assuming external initialization" << dendl; } return set_cr_done(); } @@ -491,8 +714,9 @@ int operate() override { reenter(this) { ldout(sync_env->cct, 10) << ": stat of remote obj: z=" << sync_env->source_zone - << " b=" << bucket_info.bucket << " k=" << key << " size=" << size << " mtime=" << mtime - << " attrs=" << attrs << dendl; + << " b=" << bucket_info.bucket << " k=" << key + << " size=" << size << " mtime=" << mtime << dendl; 
+ yield { string path = conf->get_obj_path(bucket_info, key); es_obj_metadata doc(sync_env->cct, conf, bucket_info, key, mtime, size, attrs, versioned_epoch); @@ -500,6 +724,7 @@ call(new RGWPutRESTResourceCR(sync_env->cct, conf->conn.get(), sync_env->http_manager, path, nullptr /* params */, + &(conf->default_headers), doc, nullptr /* result */)); } @@ -609,6 +834,10 @@ string get_index_path() { return conf->get_index_path(); } + + map& get_request_headers() { + return conf->get_request_headers(); + } }; RGWElasticSyncModuleInstance::RGWElasticSyncModuleInstance(CephContext *cct, const map& config) @@ -630,6 +859,10 @@ return data_handler->get_index_path(); } +map& RGWElasticSyncModuleInstance::get_request_headers() { + return data_handler->get_request_headers(); +} + RGWRESTMgr *RGWElasticSyncModuleInstance::get_rest_filter(int dialect, RGWRESTMgr *orig) { if (dialect != RGW_REST_S3) { return orig; diff -Nru ceph-12.2.11/src/rgw/rgw_sync_module_es.h ceph-12.2.12/src/rgw/rgw_sync_module_es.h --- ceph-12.2.11/src/rgw/rgw_sync_module_es.h 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_sync_module_es.h 2019-04-11 12:33:50.000000000 +0000 @@ -3,6 +3,33 @@ #include "rgw_sync_module.h" +enum class ESType { + /* string datatypes */ + String, /* Deprecated Since 5.X+ */ + Text, + Keyword, + + /* Numeric Types */ + Long, Integer, Short, Byte, Double, Float, Half_Float, Scaled_Float, + + /* Date Type */ + Date, + + /* Boolean */ + Boolean, + + /* Binary; Must Be Base64 Encoded */ + Binary, + + /* Range Types */ + Integer_Range, Float_Range, Long_Range, Double_Range, Date_Range, + + /* A Few Specialized Types */ + Geo_Point, + Ip +}; + + class RGWElasticSyncModule : public RGWSyncModule { public: RGWElasticSyncModule() {} @@ -23,6 +50,7 @@ RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) override; RGWRESTConn *get_rest_conn(); std::string get_index_path(); + map& get_request_headers(); }; #endif diff -Nru 
ceph-12.2.11/src/rgw/rgw_sync_module_es_rest.cc ceph-12.2.12/src/rgw/rgw_sync_module_es_rest.cc --- ceph-12.2.11/src/rgw/rgw_sync_module_es_rest.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/rgw/rgw_sync_module_es_rest.cc 2019-04-11 12:33:50.000000000 +0000 @@ -20,6 +20,7 @@ ceph::real_time mtime; string etag; string content_type; + string storage_class; map custom_str; map custom_int; map custom_date; @@ -41,6 +42,7 @@ parse_time(mtime_str.c_str(), &mtime); JSONDecoder::decode_json("etag", etag, obj); JSONDecoder::decode_json("content_type", content_type, obj); + JSONDecoder::decode_json("storage_class", storage_class, obj); list<_custom_entry > str_entries; JSONDecoder::decode_json("custom-string", str_entries, obj); for (auto& e : str_entries) { @@ -180,7 +182,11 @@ { "size", "meta.size" }, { "mtime", "meta.mtime" }, { "lastmodified", "meta.mtime" }, - { "contenttype", "meta.contenttype" }, + { "last_modified", "meta.mtime" }, + { "contenttype", "meta.content_type" }, + { "content_type", "meta.content_type" }, + { "storageclass", "meta.storage_class" }, + { "storage_class", "meta.storage_class" }, }; es_query.set_field_aliases(&aliases); @@ -189,9 +195,10 @@ {"instance", ESEntityTypeMap::ES_ENTITY_STR}, {"permissions", ESEntityTypeMap::ES_ENTITY_STR}, {"meta.etag", ESEntityTypeMap::ES_ENTITY_STR}, - {"meta.contenttype", ESEntityTypeMap::ES_ENTITY_STR}, + {"meta.content_type", ESEntityTypeMap::ES_ENTITY_STR}, {"meta.mtime", ESEntityTypeMap::ES_ENTITY_DATE}, - {"meta.size", ESEntityTypeMap::ES_ENTITY_INT} }; + {"meta.size", ESEntityTypeMap::ES_ENTITY_INT}, + {"meta.storage_class", ESEntityTypeMap::ES_ENTITY_STR} }; ESEntityTypeMap gm(generic_map); es_query.set_generic_type_map(&gm); @@ -236,7 +243,8 @@ params.push_back(param_pair_t("from", marker_str.c_str())); } ldout(s->cct, 20) << "sending request to elasticsearch, payload=" << string(in.c_str(), in.length()) << dendl; - op_ret = conn->get_resource(resource, ¶ms, nullptr, out, &in); + auto& 
extra_headers = es_module->get_request_headers(); + op_ret = conn->get_resource(resource, ¶ms, &extra_headers, out, &in); if (op_ret < 0) { ldout(s->cct, 0) << "ERROR: failed to fetch resource (r=" << resource << ", ret=" << op_ret << ")" << dendl; return; @@ -332,6 +340,7 @@ s->formatter->dump_int("Size", e.meta.size); s->formatter->dump_format("ETag", "\"%s\"", e.meta.etag.c_str()); s->formatter->dump_string("ContentType", e.meta.content_type.c_str()); + s->formatter->dump_string("StorageClass", e.meta.storage_class.c_str()); dump_owner(s, e.owner.get_id(), e.owner.get_display_name()); s->formatter->open_array_section("CustomMetadata"); for (auto& m : e.meta.custom_str) { diff -Nru ceph-12.2.11/src/test/centos-6/ceph.spec.in ceph-12.2.12/src/test/centos-6/ceph.spec.in --- ceph-12.2.11/src/test/centos-6/ceph.spec.in 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/centos-6/ceph.spec.in 2019-04-11 12:33:50.000000000 +0000 @@ -382,7 +382,7 @@ %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} Requires: librados2 = %{_epoch_prefix}%{version}-%{release} %description -n rbd-mirror Daemon for mirroring RBD images between Ceph clusters, streaming @@ -403,7 +403,7 @@ %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} %if 0%{with selinux} Requires: ceph-selinux = %{_epoch_prefix}%{version}-%{release} %endif diff -Nru ceph-12.2.11/src/test/centos-7/ceph.spec.in ceph-12.2.12/src/test/centos-7/ceph.spec.in --- ceph-12.2.11/src/test/centos-7/ceph.spec.in 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/centos-7/ceph.spec.in 2019-04-11 12:33:50.000000000 +0000 @@ -382,7 +382,7 @@ %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} 
+Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} Requires: librados2 = %{_epoch_prefix}%{version}-%{release} %description -n rbd-mirror Daemon for mirroring RBD images between Ceph clusters, streaming @@ -403,7 +403,7 @@ %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} %if 0%{with selinux} Requires: ceph-selinux = %{_epoch_prefix}%{version}-%{release} %endif diff -Nru ceph-12.2.11/src/test/cli/osdmaptool/upmap-out.t ceph-12.2.12/src/test/cli/osdmaptool/upmap-out.t --- ceph-12.2.11/src/test/cli/osdmaptool/upmap-out.t 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/cli/osdmaptool/upmap-out.t 2019-04-11 12:33:50.000000000 +0000 @@ -1,7 +1,7 @@ $ osdmaptool --create-from-conf om -c $TESTDIR/ceph.conf.withracks --with-default-pool osdmaptool: osdmap file 'om' osdmaptool: writing epoch 1 to om - $ osdmaptool om --mark-up-in --mark-out 147 --upmap-max 11 --upmap c + $ osdmaptool --osd_calc_pg_upmaps_aggressively=false om --mark-up-in --mark-out 147 --upmap-max 11 --upmap c osdmaptool: osdmap file 'om' marking all OSDs up and in marking OSD@147 as out @@ -10,14 +10,11 @@ upmap, max-count 11, max deviation 0.01 $ cat c ceph osd pg-upmap-items 1.7 142 145 - ceph osd pg-upmap-items 1.8 219 223 99 103 + ceph osd pg-upmap-items 1.8 219 223 ceph osd pg-upmap-items 1.17 171 173 201 202 - ceph osd pg-upmap-items 1.1a 201 202 115 114 - ceph osd pg-upmap-items 1.1c 171 173 201 202 127 130 + ceph osd pg-upmap-items 1.1a 201 202 + ceph osd pg-upmap-items 1.1c 171 173 201 202 ceph osd pg-upmap-items 1.20 88 87 201 202 - ceph osd pg-upmap-items 1.21 207 206 142 145 - ceph osd pg-upmap-items 1.51 201 202 65 64 186 189 ceph osd pg-upmap-items 1.62 219 223 - ceph osd pg-upmap-items 1.6f 219 223 108 111 - ceph osd pg-upmap-items 1.82 219 223 157 158 6 3 + ceph osd pg-upmap-items 1.6f 219 223 $ rm -f om c diff -Nru 
ceph-12.2.11/src/test/cli/osdmaptool/upmap.t ceph-12.2.12/src/test/cli/osdmaptool/upmap.t --- ceph-12.2.11/src/test/cli/osdmaptool/upmap.t 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/cli/osdmaptool/upmap.t 2019-04-11 12:33:50.000000000 +0000 @@ -1,7 +1,7 @@ $ osdmaptool --create-from-conf om -c $TESTDIR/ceph.conf.withracks --with-default-pool osdmaptool: osdmap file 'om' osdmaptool: writing epoch 1 to om - $ osdmaptool om --mark-up-in --upmap-max 11 --upmap c + $ osdmaptool --osd_calc_pg_upmaps_aggressively=false om --mark-up-in --upmap-max 11 --upmap c osdmaptool: osdmap file 'om' marking all OSDs up and in writing upmap command output to: c @@ -11,12 +11,9 @@ ceph osd pg-upmap-items 1.7 142 147 ceph osd pg-upmap-items 1.8 219 223 ceph osd pg-upmap-items 1.17 171 173 201 202 - ceph osd pg-upmap-items 1.1a 201 202 115 114 - ceph osd pg-upmap-items 1.1c 171 173 201 202 127 130 + ceph osd pg-upmap-items 1.1a 201 202 + ceph osd pg-upmap-items 1.1c 171 173 201 202 ceph osd pg-upmap-items 1.20 88 87 201 202 - ceph osd pg-upmap-items 1.24 32 35 232 233 - ceph osd pg-upmap-items 1.51 201 202 65 64 186 189 + ceph osd pg-upmap-items 1.51 201 202 ceph osd pg-upmap-items 1.62 219 223 - ceph osd pg-upmap-items 1.6f 219 223 108 111 - ceph osd pg-upmap-items 1.f8 201 202 $ rm -f om c diff -Nru ceph-12.2.11/src/test/cli/rbd/help.t ceph-12.2.12/src/test/cli/rbd/help.t --- ceph-12.2.11/src/test/cli/rbd/help.t 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/cli/rbd/help.t 2019-04-11 12:33:50.000000000 +0000 @@ -182,7 +182,7 @@ --stripe-count arg stripe count --data-pool arg data pool --journal-splay-width arg number of active journal objects - --journal-object-size arg size of journal objects + --journal-object-size arg size of journal objects [4K <= size <= 64M] --journal-pool arg pool for journal objects Image Features: @@ -229,7 +229,7 @@ --stripe-count arg stripe count --data-pool arg data pool --journal-splay-width arg number of active journal 
objects - --journal-object-size arg size of journal objects + --journal-object-size arg size of journal objects [4K <= size <= 64M] --journal-pool arg pool for journal objects --sparse-size arg sparse size in B/K/M [default: 4K] --no-progress disable progress output @@ -274,7 +274,7 @@ --stripe-count arg stripe count --data-pool arg data pool --journal-splay-width arg number of active journal objects - --journal-object-size arg size of journal objects + --journal-object-size arg size of journal objects [4K <= size <= 64M] --journal-pool arg pool for journal objects -s [ --size ] arg image size (in M/G/T) [default: M] @@ -406,7 +406,7 @@ -p [ --pool ] arg pool name --image arg image name --journal-splay-width arg number of active journal objects - --journal-object-size arg size of journal objects + --journal-object-size arg size of journal objects [4K <= size <= 64M] --journal-pool arg pool for journal objects rbd help flatten @@ -527,7 +527,7 @@ --stripe-count arg stripe count --data-pool arg data pool --journal-splay-width arg number of active journal objects - --journal-object-size arg size of journal objects + --journal-object-size arg size of journal objects [4K <= size <= 64M] --journal-pool arg pool for journal objects --sparse-size arg sparse size in B/K/M [default: 4K] --no-progress disable progress output diff -Nru ceph-12.2.11/src/test/common/test_str_map.cc ceph-12.2.12/src/test/common/test_str_map.cc --- ceph-12.2.11/src/test/common/test_str_map.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/common/test_str_map.cc 2019-04-11 12:33:50.000000000 +0000 @@ -66,6 +66,18 @@ } } +TEST(str_map, empty_values) { + { + map str_map; + ASSERT_EQ(0, get_str_map("M= P= L=", + &str_map)); + ASSERT_EQ(3u, str_map.size()); + ASSERT_EQ("", str_map["M"]); + ASSERT_EQ("", str_map["P"]); + ASSERT_EQ("", str_map["L"]); + } +} + /* * Local Variables: * compile-command: "cd ../.. 
; make -j4 && diff -Nru ceph-12.2.11/src/test/debian-jessie/debian/changelog ceph-12.2.12/src/test/debian-jessie/debian/changelog --- ceph-12.2.11/src/test/debian-jessie/debian/changelog 2019-04-26 12:17:04.000000000 +0000 +++ ceph-12.2.12/src/test/debian-jessie/debian/changelog 2019-06-12 10:05:49.000000000 +0000 @@ -1,3 +1,11 @@ +ceph (12.2.12-0ubuntu0.18.04.1) bionic; urgency=medium + + * d/copyright: Exclude cruft from upstream tarballs. + * New upstream point release (LP: #1829716). + * d/p/s390x-link.patch: Drop, included upstream. + + -- James Page Wed, 12 Jun 2019 11:05:49 +0100 + ceph (12.2.11-0ubuntu0.18.04.2) bionic; urgency=medium * d/control: Use openssl1.0 at build and runtime as diff -Nru ceph-12.2.11/src/test/debian-jessie/debian/copyright ceph-12.2.12/src/test/debian-jessie/debian/copyright --- ceph-12.2.11/src/test/debian-jessie/debian/copyright 2019-02-11 11:06:34.000000000 +0000 +++ ceph-12.2.12/src/test/debian-jessie/debian/copyright 2019-06-12 10:04:50.000000000 +0000 @@ -2,7 +2,14 @@ Upstream-Name: ceph Upstream-Contact: Sage Weil Source: http://ceph.com/ -Files-Excluded: debian +Files-Excluded: + debian + src/civetweb/examples/websocket_client/ssl/server.key.orig + src/civetweb/resources/cert/client.key.orig + src/civetweb/resources/cert/server.key.orig + src/erasure-code/jerasure/jerasure/Examples/makefile.orig + src/erasure-code/jerasure/jerasure/include/config.h.in~ + src/erasure-code/jerasure/jerasure/makefile.orig Files: * Copyright: 2004-2014 Sage Weil diff -Nru ceph-12.2.11/src/test/debian-jessie/debian/patches/s390x-link.patch ceph-12.2.12/src/test/debian-jessie/debian/patches/s390x-link.patch --- ceph-12.2.11/src/test/debian-jessie/debian/patches/s390x-link.patch 2019-02-13 17:11:15.000000000 +0000 +++ ceph-12.2.12/src/test/debian-jessie/debian/patches/s390x-link.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ -Descrption: Fix linking issues on s390x -Origin: https://github.com/ceph/ceph/pull/21380 - ---- 
a/src/rgw/CMakeLists.txt -+++ b/src/rgw/CMakeLists.txt -@@ -177,9 +177,7 @@ endif (WITH_RADOSGW_BEAST_FRONTEND) - - add_library(radosgw_a STATIC ${radosgw_srcs} - $) --if (WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL) -- target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES}) --endif() -+target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES}) - - add_executable(radosgw rgw_main.cc) - target_link_libraries(radosgw radosgw_a librados diff -Nru ceph-12.2.11/src/test/debian-jessie/debian/patches/series ceph-12.2.12/src/test/debian-jessie/debian/patches/series --- ceph-12.2.11/src/test/debian-jessie/debian/patches/series 2019-02-13 17:11:15.000000000 +0000 +++ ceph-12.2.12/src/test/debian-jessie/debian/patches/series 2019-06-12 10:04:50.000000000 +0000 @@ -7,4 +7,3 @@ # Ubuntu: FTBFS on armhf armhf-ftbfs.patch misc-32-bit-fixes.patch -s390x-link.patch diff -Nru ceph-12.2.11/src/test/fedora-21/ceph.spec.in ceph-12.2.12/src/test/fedora-21/ceph.spec.in --- ceph-12.2.11/src/test/fedora-21/ceph.spec.in 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/fedora-21/ceph.spec.in 2019-04-11 12:33:50.000000000 +0000 @@ -382,7 +382,7 @@ %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} Requires: librados2 = %{_epoch_prefix}%{version}-%{release} %description -n rbd-mirror Daemon for mirroring RBD images between Ceph clusters, streaming @@ -403,7 +403,7 @@ %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} %if 0%{with selinux} Requires: ceph-selinux = %{_epoch_prefix}%{version}-%{release} %endif diff -Nru ceph-12.2.11/src/test/librbd/fsx.cc ceph-12.2.12/src/test/librbd/fsx.cc --- ceph-12.2.11/src/test/librbd/fsx.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/librbd/fsx.cc 2019-04-11 
12:33:50.000000000 +0000 @@ -963,7 +963,7 @@ int ret; /* - * BLKDISCARD goes straight to disk and doesn't do anything + * BLKZEROOUT goes straight to disk and doesn't do anything * about dirty buffers. This means we need to flush so that * * write 0..3M @@ -977,19 +977,22 @@ * * returns "data 0000 data" rather than "data data data" in * case 1..2M was cached. + * + * Note: These cache coherency issues are supposed to be fixed + * in recent kernels. */ ret = __krbd_flush(ctx, true); if (ret < 0) return ret; /* - * off and len must be 512-byte aligned, otherwise BLKDISCARD + * off and len must be 512-byte aligned, otherwise BLKZEROOUT * will fail with -EINVAL. This means that -K (enable krbd * mode) requires -h 512 or similar. */ - if (ioctl(ctx->krbd_fd, BLKDISCARD, &range) < 0) { + if (ioctl(ctx->krbd_fd, BLKZEROOUT, &range) < 0) { ret = -errno; - prt("BLKDISCARD(%llu, %llu) failed\n", off, len); + prt("BLKZEROOUT(%llu, %llu) failed\n", off, len); return ret; } diff -Nru ceph-12.2.11/src/test/mds/TestSessionFilter.cc ceph-12.2.12/src/test/mds/TestSessionFilter.cc --- ceph-12.2.11/src/test/mds/TestSessionFilter.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/mds/TestSessionFilter.cc 2019-04-11 12:33:50.000000000 +0000 @@ -74,8 +74,8 @@ SessionFilter filter; std::stringstream ss; filter.parse({"id=123"}, &ss); - Session *a = new Session();; - Session *b = new Session();; + Session *a = new Session(nullptr);; + Session *b = new Session(nullptr);; a->info.inst.name.parse("client.123"); b->info.inst.name.parse("client.456"); @@ -90,9 +90,9 @@ SessionFilter filter; std::stringstream ss; filter.parse({"state=closing"}, &ss); - Session *a = new Session(); + Session *a = new Session(nullptr); a->set_state(Session::STATE_CLOSING); - Session *b = new Session(); + Session *b = new Session(nullptr); b->set_state(Session::STATE_OPENING); ASSERT_TRUE(filter.match(*a, [](client_t c) -> bool {return false;})); @@ -106,9 +106,9 @@ SessionFilter filter; 
std::stringstream ss; filter.parse({"auth_name=rhubarb"}, &ss); - Session *a = new Session(); + Session *a = new Session(nullptr); a->info.auth_name.set_id("rhubarb"); - Session *b = new Session(); + Session *b = new Session(nullptr); b->info.auth_name.set_id("custard"); ASSERT_TRUE(filter.match(*a, [](client_t c) -> bool {return false;})); @@ -123,10 +123,10 @@ std::stringstream ss; int r = filter.parse({"client_metadata.root=/rhubarb"}, &ss); ASSERT_EQ(r, 0); - Session *a = new Session(); - a->set_client_metadata({{"root", "/rhubarb"}}); - Session *b = new Session(); - b->set_client_metadata({{"root", "/custard"}}); + Session *a = new Session(nullptr); + a->set_client_metadata(std::map({{"root", "/rhubarb"}})); + Session *b = new Session(nullptr); + b->set_client_metadata(std::map({{"root", "/custard"}})); ASSERT_TRUE(filter.match(*a, [](client_t c) -> bool {return false;})); ASSERT_FALSE(filter.match(*b, [](client_t c) -> bool {return false;})); @@ -140,7 +140,7 @@ std::stringstream ss; int r = filter.parse({"reconnecting=true"}, &ss); ASSERT_EQ(r, 0); - Session *a = new Session(); + Session *a = new Session(nullptr); ASSERT_TRUE(filter.match(*a, [](client_t c) -> bool {return true;})); ASSERT_FALSE(filter.match(*a, [](client_t c) -> bool {return false;})); diff -Nru ceph-12.2.11/src/test/objectstore/Allocator_bench.cc ceph-12.2.12/src/test/objectstore/Allocator_bench.cc --- ceph-12.2.11/src/test/objectstore/Allocator_bench.cc 1970-01-01 00:00:00.000000000 +0000 +++ ceph-12.2.12/src/test/objectstore/Allocator_bench.cc 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,340 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * In memory space allocator benchmarks. 
+ * Author: Igor Fedotov, ifedotov@suse.com + */ +#include +#include +#include + +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/errno.h" +#include "include/stringify.h" +#include "include/Context.h" +#include "os/bluestore/Allocator.h" + +#include +typedef boost::mt11213b gen_type; + +#include "common/debug.h" +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_ + +#if GTEST_HAS_PARAM_TEST + +class AllocTest : public ::testing::TestWithParam { + +public: + boost::scoped_ptr alloc; + AllocTest(): alloc(0) { } + void init_alloc(int64_t size, uint64_t min_alloc_size) { + std::cout << "Creating alloc type " << string(GetParam()) << " \n"; + alloc.reset(Allocator::create(g_ceph_context, string(GetParam()), size, + min_alloc_size)); + } + + void init_close() { + alloc.reset(0); + } + void doOverwriteTest(uint64_t capacity, uint64_t prefill, + uint64_t overwrite); +}; + +const uint64_t _1m = 1024 * 1024; +const uint64_t _2m = 2 * 1024 * 1024; + +void dump_mempools() +{ + ostringstream ostr; + Formatter* f = Formatter::create("json-pretty", "json-pretty", "json-pretty"); + ostr << "Mempools: "; + f->open_object_section("mempools"); + mempool::dump(f); + f->close_section(); + f->flush(ostr); + delete f; + ldout(g_ceph_context, 0) << ostr.str() << dendl; +} + +class AllocTracker +{ + std::vector allocations; + uint64_t head = 0; + uint64_t tail = 0; + uint64_t size = 0; + boost::uniform_int<> u1; + +public: + AllocTracker(uint64_t capacity, uint64_t alloc_unit) + : u1(capacity, alloc_unit) + { + assert(alloc_unit >= 0x100); + assert(capacity <= (uint64_t(1) << 48)); // we use 5 octets (bytes 1 - 5) to store + // offset to save the required space. 
+ // This supports capacity up to 281 TB + + allocations.resize(capacity / alloc_unit); + } + inline uint64_t get_head() const + { + return head; + } + + inline uint64_t get_tail() const + { + return tail; + } + + bool push(uint64_t offs, uint32_t len) + { + assert((len & 0xff) == 0); + assert((offs & 0xff) == 0); + assert((offs & 0xffff000000000000) == 0); + + if (head + 1 == tail) + return false; + uint64_t val = (offs << 16) | (len >> 8); + allocations[head++] = val; + head %= allocations.size(); + ++size; + return true; + } + bool pop(uint64_t* offs, uint32_t* len) + { + if (size == 0) + return false; + uint64_t val = allocations[tail++]; + *len = uint64_t((val & 0xffffff) << 8); + *offs = (val >> 16) & ~uint64_t(0xff); + tail %= allocations.size(); + --size; + return true; + } + bool pop_random(gen_type& rng, uint64_t* offs, uint32_t* len, + uint32_t max_len = 0) + { + if (size == 0) + return false; + + uint64_t pos = (u1(rng) % size) + tail; + pos %= allocations.size(); + uint64_t val = allocations[pos]; + *len = uint64_t((val & 0xffffff) << 8); + *offs = (val >> 16) & ~uint64_t(0xff); + if (max_len && *len > max_len) { + val = ((*offs + max_len) << 16) | ((*len - max_len) >> 8); + allocations[pos] = val; + *len = max_len; + } else { + allocations[pos] = allocations[tail++]; + tail %= allocations.size(); + --size; + } + return true; + } +}; + +TEST_P(AllocTest, test_alloc_bench_seq) +{ + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024; + uint64_t alloc_unit = 4096; + uint64_t want_size = alloc_unit; + PExtentVector allocated, tmp; + + init_alloc(capacity, alloc_unit); + alloc->init_add_free(0, capacity); + + utime_t start = ceph_clock_now(); + for (uint64_t i = 0; i < capacity; i += want_size) + { + tmp.clear(); + EXPECT_EQ(want_size, alloc->allocate(want_size, alloc_unit, 0, 0, &tmp)); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + + std::cout << 
"releasing..." << std::endl; + for (size_t i = 0; i < capacity; i += want_size) + { + interval_set release_set; + release_set.insert(i, want_size); + alloc->release(release_set); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "release " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + std::cout<<"Executed in "<< ceph_clock_now() - start << std::endl; + dump_mempools(); +} + +TEST_P(AllocTest, test_alloc_bench) +{ + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024; + uint64_t alloc_unit = 4096; + PExtentVector allocated, tmp; + AllocTracker at(capacity, alloc_unit); + + init_alloc(capacity, alloc_unit); + alloc->init_add_free(0, capacity); + + gen_type rng(time(NULL)); + boost::uniform_int<> u1(0, 9); // 4K-2M + boost::uniform_int<> u2(0, 7); // 4K-512K + + utime_t start = ceph_clock_now(); + for (uint64_t i = 0; i < capacity * 2; ) + { + uint32_t want = alloc_unit << u1(rng); + + tmp.clear(); + auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + if (r < want) { + break; + } + i += r; + + for(auto a : tmp) { + bool full = !at.push(a.offset, a.length); + EXPECT_EQ(full, false); + } + uint64_t want_release = alloc_unit << u2(rng); + uint64_t released = 0; + do { + uint64_t o = 0; + uint32_t l = 0; + interval_set release_set; + if (!at.pop_random(rng, &o, &l, want_release - released)) { + break; + } + release_set.insert(o, l); + alloc->release(release_set); + released += l; + } while (released < want_release); + + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + std::cout<<"Executed in "<< ceph_clock_now() - start << std::endl; + std::cout<<"Avail "<< alloc->get_free() / _1m << " MB" << std::endl; + dump_mempools(); +} + +void AllocTest::doOverwriteTest(uint64_t capacity, uint64_t prefill, + uint64_t overwrite) +{ + uint64_t alloc_unit = 4096; + PExtentVector allocated, tmp; + AllocTracker at(capacity, alloc_unit); + + 
init_alloc(capacity, alloc_unit); + alloc->init_add_free(0, capacity); + + gen_type rng(time(NULL)); + boost::uniform_int<> u1(0, 9); // 4K-2M + boost::uniform_int<> u2(0, 9); // 4K-512K + + utime_t start = ceph_clock_now(); + // allocate 90% of the capacity + auto cap = prefill; + for (uint64_t i = 0; i < cap; ) + { + uint32_t want = alloc_unit << u1(rng); + tmp.clear(); + auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + if (r < want) { + break; + } + i += r; + + for(auto a : tmp) { + bool full = !at.push(a.offset, a.length); + EXPECT_EQ(full, false); + } + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc " << i / 1024 / 1024 << " mb of " + << cap / 1024 / 1024 << std::endl; + } + } + + cap = overwrite; + for (uint64_t i = 0; i < cap; ) + { + uint64_t want_release = alloc_unit << u2(rng); + uint64_t released = 0; + do { + uint64_t o = 0; + uint32_t l = 0; + interval_set release_set; + if (!at.pop_random(rng, &o, &l, want_release - released)) { + break; + } + release_set.insert(o, l); + alloc->release(release_set); + released += l; + } while (released < want_release); + + uint32_t want = alloc_unit << u1(rng); + tmp.clear(); + auto r = alloc->allocate(want, alloc_unit, 0, 0, &tmp); + if (r != want) { + std::cout<<"Can't allocate more space, stopping."<< std::endl; + break; + } + i += r; + + for(auto a : tmp) { + bool full = !at.push(a.offset, a.length); + EXPECT_EQ(full, false); + } + + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "reuse " << i / 1024 / 1024 << " mb of " + << cap / 1024 / 1024 << std::endl; + } + } + std::cout<<"Executed in "<< ceph_clock_now() - start << std::endl; + std::cout<<"Avail "<< alloc->get_free() / _1m << " MB" << std::endl; + + dump_mempools(); +} + +TEST_P(AllocTest, test_alloc_bench_90_300) +{ + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024; + auto prefill = capacity - capacity / 10; + auto overwrite = capacity * 3; + doOverwriteTest(capacity, prefill, overwrite); +} + +TEST_P(AllocTest, 
test_alloc_bench_50_300) +{ + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024; + auto prefill = capacity / 2; + auto overwrite = capacity * 3; + doOverwriteTest(capacity, prefill, overwrite); +} + +TEST_P(AllocTest, test_alloc_bench_10_300) +{ + uint64_t capacity = uint64_t(1024) * 1024 * 1024 * 1024; + auto prefill = capacity / 10; + auto overwrite = capacity * 3; + doOverwriteTest(capacity, prefill, overwrite); +} + +INSTANTIATE_TEST_CASE_P( + Allocator, + AllocTest, + ::testing::Values("stupid", "bitmap")); + +#else + +TEST(DummyTest, ValueParameterizedTestsAreNotSupportedOnThisPlatform) {} +#endif diff -Nru ceph-12.2.11/src/test/objectstore/Allocator_test.cc ceph-12.2.12/src/test/objectstore/Allocator_test.cc --- ceph-12.2.11/src/test/objectstore/Allocator_test.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/objectstore/Allocator_test.cc 2019-04-11 12:33:50.000000000 +0000 @@ -14,37 +14,39 @@ #include "include/stringify.h" #include "include/Context.h" #include "os/bluestore/Allocator.h" -#include "os/bluestore/BitAllocator.h" +#include +typedef boost::mt11213b gen_type; #if GTEST_HAS_PARAM_TEST class AllocTest : public ::testing::TestWithParam { + public: - boost::scoped_ptr alloc; - AllocTest(): alloc(0) { } - void init_alloc(int64_t size, uint64_t min_alloc_size) { - std::cout << "Creating alloc type " << string(GetParam()) << " \n"; - alloc.reset(Allocator::create(g_ceph_context, string(GetParam()), size, - min_alloc_size)); - } + boost::scoped_ptr alloc; + AllocTest(): alloc(0) { } + void init_alloc(int64_t size, uint64_t min_alloc_size) { + std::cout << "Creating alloc type " << string(GetParam()) << " \n"; + alloc.reset(Allocator::create(g_ceph_context, string(GetParam()), size, + min_alloc_size)); + } - void init_close() { - alloc.reset(0); - } + void init_close() { + alloc.reset(0); + } }; TEST_P(AllocTest, test_alloc_init) { - int64_t blocks = BmapEntry::size(); + int64_t blocks = 64; init_alloc(blocks, 1); ASSERT_EQ(0U, 
alloc->get_free()); alloc->shutdown(); - blocks = BitMapZone::get_total_blocks() * 2 + 16; + blocks = 1024 * 2 + 16; init_alloc(blocks, 1); ASSERT_EQ(0U, alloc->get_free()); alloc->shutdown(); - blocks = BitMapZone::get_total_blocks() * 2; + blocks = 1024 * 2; init_alloc(blocks, 1); ASSERT_EQ(alloc->get_free(), (uint64_t) 0); } @@ -52,13 +54,13 @@ TEST_P(AllocTest, test_alloc_min_alloc) { int64_t block_size = 1024; - int64_t blocks = BitMapZone::get_total_blocks() * 2 * block_size; + int64_t capacity = 4 * 1024 * block_size; { - init_alloc(blocks, block_size); + init_alloc(capacity, block_size); + alloc->init_add_free(block_size, block_size); - EXPECT_EQ(alloc->reserve(block_size), 0); - AllocExtentVector extents; + PExtentVector extents; EXPECT_EQ(block_size, alloc->allocate(block_size, block_size, 0, (int64_t) 0, &extents)); } @@ -68,8 +70,7 @@ */ { alloc->init_add_free(0, block_size * 4); - EXPECT_EQ(alloc->reserve(block_size * 4), 0); - AllocExtentVector extents; + PExtentVector extents; EXPECT_EQ(4*block_size, alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size, 0, (int64_t) 0, &extents)); @@ -83,8 +84,7 @@ { alloc->init_add_free(0, block_size * 2); alloc->init_add_free(3 * block_size, block_size * 2); - EXPECT_EQ(alloc->reserve(block_size * 4), 0); - AllocExtentVector extents; + PExtentVector extents; EXPECT_EQ(4*block_size, alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size, @@ -99,9 +99,9 @@ TEST_P(AllocTest, test_alloc_min_max_alloc) { int64_t block_size = 1024; - int64_t blocks = BitMapZone::get_total_blocks() * 2 * block_size; - init_alloc(blocks, block_size); + int64_t capacity = 4 * 1024 * block_size; + init_alloc(capacity, block_size); /* * Make sure we get all extents different when @@ -109,8 +109,7 @@ */ { alloc->init_add_free(0, block_size * 4); - EXPECT_EQ(alloc->reserve(block_size * 4), 0); - AllocExtentVector extents; + PExtentVector extents; EXPECT_EQ(4*block_size, alloc->allocate(4 * (uint64_t)block_size, (uint64_t) 
block_size, block_size, (int64_t) 0, &extents)); @@ -127,8 +126,7 @@ */ { alloc->init_add_free(0, block_size * 4); - EXPECT_EQ(alloc->reserve(block_size * 4), 0); - AllocExtentVector extents; + PExtentVector extents; EXPECT_EQ(4*block_size, alloc->allocate(4 * (uint64_t)block_size, (uint64_t) block_size, 2 * block_size, (int64_t) 0, &extents)); @@ -143,8 +141,7 @@ */ { alloc->init_add_free(0, block_size * 1024); - EXPECT_EQ(alloc->reserve(block_size * 1024), 0); - AllocExtentVector extents; + PExtentVector extents; EXPECT_EQ(1024 * block_size, alloc->allocate(1024 * (uint64_t)block_size, (uint64_t) block_size * 4, @@ -160,8 +157,7 @@ */ { alloc->init_add_free(0, block_size * 16); - EXPECT_EQ(alloc->reserve(block_size * 16), 0); - AllocExtentVector extents; + PExtentVector extents; EXPECT_EQ(16 * block_size, alloc->allocate(16 * (uint64_t)block_size, (uint64_t) block_size, 2 * block_size, (int64_t) 0, &extents)); @@ -176,15 +172,14 @@ TEST_P(AllocTest, test_alloc_failure) { int64_t block_size = 1024; - int64_t blocks = BitMapZone::get_total_blocks() * block_size; + int64_t capacity = 4 * 1024 * block_size; - init_alloc(blocks, block_size); + init_alloc(capacity, block_size); { alloc->init_add_free(0, block_size * 256); alloc->init_add_free(block_size * 512, block_size * 256); - EXPECT_EQ(alloc->reserve(block_size * 512), 0); - AllocExtentVector extents; + PExtentVector extents; EXPECT_EQ(512 * block_size, alloc->allocate(512 * (uint64_t)block_size, (uint64_t) block_size * 256, @@ -192,7 +187,6 @@ alloc->init_add_free(0, block_size * 256); alloc->init_add_free(block_size * 512, block_size * 256); extents.clear(); - EXPECT_EQ(alloc->reserve(block_size * 512), 0); EXPECT_EQ(-ENOSPC, alloc->allocate(512 * (uint64_t)block_size, (uint64_t) block_size * 512, @@ -209,70 +203,12 @@ alloc->init_add_free(2*block_size, (blocks-2)*block_size); for (int64_t big = mas; big < 1048576*128; big*=2) { cout << big << std::endl; - EXPECT_EQ(alloc->reserve(big), 0); - AllocExtentVector 
extents; + PExtentVector extents; EXPECT_EQ(big, alloc->allocate(big, mas, 0, &extents)); } } -TEST_P(AllocTest, test_alloc_hint_bmap) -{ - if (GetParam() == std::string("stupid")) { - return; - } - int64_t blocks = BitMapArea::get_level_factor(g_ceph_context, 2) * 4; - int64_t allocated = 0; - int64_t zone_size = 1024; - g_conf->set_val("bluestore_bitmapallocator_blocks_per_zone", - std::to_string(zone_size)); - - init_alloc(blocks, 1); - alloc->init_add_free(0, blocks); - - AllocExtentVector extents; - alloc->reserve(blocks); - - allocated = alloc->allocate(1, 1, 1, zone_size, &extents); - ASSERT_EQ(1, allocated); - ASSERT_EQ(1u, extents.size()); - ASSERT_EQ(extents[0].offset, (uint64_t) zone_size); - - extents.clear(); - allocated = alloc->allocate(1, 1, 1, zone_size * 2 - 1, &extents); - EXPECT_EQ(1, allocated); - ASSERT_EQ(1u, extents.size()); - EXPECT_EQ((int64_t) extents[0].offset, zone_size * 2 - 1); - - /* - * Wrap around with hint - */ - extents.clear(); - allocated = alloc->allocate(zone_size * 2, 1, 1, blocks - zone_size * 2, - &extents); - ASSERT_EQ(zone_size * 2, allocated); - EXPECT_EQ(zone_size * 2, (int)extents.size()); - EXPECT_EQ((int64_t)extents[0].offset, blocks - zone_size * 2); - - extents.clear(); - allocated = alloc->allocate(zone_size, 1, 1, blocks - zone_size, &extents); - ASSERT_EQ(zone_size, allocated); - EXPECT_EQ(zone_size, (int)extents.size()); - EXPECT_EQ(extents[0].offset, (uint64_t) 0); - /* - * Verify out-of-bound hint - */ - extents.clear(); - allocated = alloc->allocate(1, 1, 1, blocks, &extents); - ASSERT_EQ(1, allocated); - EXPECT_EQ(1, (int)extents.size()); - - extents.clear(); - allocated = alloc->allocate(1, 1, 1, blocks * 3 + 1 , &extents); - ASSERT_EQ(1, allocated); - EXPECT_EQ(1, (int)extents.size()); -} - TEST_P(AllocTest, test_alloc_non_aligned_len) { int64_t block_size = 1 << 12; @@ -285,11 +221,100 @@ alloc->init_add_free(2097152, 1064960); alloc->init_add_free(3670016, 2097152); - EXPECT_EQ(0, 
alloc->reserve(want_size)); - AllocExtentVector extents; + PExtentVector extents; EXPECT_EQ(want_size, alloc->allocate(want_size, alloc_unit, 0, &extents)); } +TEST_P(AllocTest, test_alloc_fragmentation) +{ + uint64_t capacity = 4 * 1024 * 1024; + uint64_t alloc_unit = 4096; + uint64_t want_size = alloc_unit; + PExtentVector allocated, tmp; + + init_alloc(capacity, alloc_unit); + alloc->init_add_free(0, capacity); + bool bitmap_alloc = GetParam() == std::string("bitmap"); + + EXPECT_EQ(0.0, alloc->get_fragmentation(alloc_unit)); + + for (size_t i = 0; i < capacity / alloc_unit; ++i) + { + tmp.clear(); + EXPECT_EQ(want_size, alloc->allocate(want_size, alloc_unit, 0, 0, &tmp)); + allocated.insert(allocated.end(), tmp.begin(), tmp.end()); + + // bitmap fragmentation calculation doesn't provide such constant + // estimate + if (!bitmap_alloc) { + EXPECT_EQ(0.0, alloc->get_fragmentation(alloc_unit)); + } + } + EXPECT_EQ(-ENOSPC, alloc->allocate(want_size, alloc_unit, 0, 0, &tmp)); + + for (size_t i = 0; i < allocated.size(); i += 2) + { + interval_set release_set; + release_set.insert(allocated[i].offset, allocated[i].length); + alloc->release(release_set); + } + EXPECT_EQ(1.0, alloc->get_fragmentation(alloc_unit)); + for (size_t i = 1; i < allocated.size() / 2; i += 2) + { + interval_set release_set; + release_set.insert(allocated[i].offset, allocated[i].length); + alloc->release(release_set); + } + if (bitmap_alloc) { + // fragmentation = one l1 slot is free + one l1 slot is partial + EXPECT_EQ(50, uint64_t(alloc->get_fragmentation(alloc_unit) * 100)); + } else { + // fragmentation approx = 257 intervals / 768 max intervals + EXPECT_EQ(33, uint64_t(alloc->get_fragmentation(alloc_unit) * 100)); + } + + for (size_t i = allocated.size() / 2 + 1; i < allocated.size(); i += 2) + { + interval_set release_set; + release_set.insert(allocated[i].offset, allocated[i].length); + alloc->release(release_set); + } + // doing some rounding trick as stupid allocator doesn't merge all 
the + // extents that causes some minor fragmentation (minor bug or by-design behavior?). + // Hence leaving just two + // digits after decimal point due to this. + EXPECT_EQ(0, uint64_t(alloc->get_fragmentation(alloc_unit) * 100)); +} + +TEST_P(AllocTest, test_alloc_bug_24598) +{ + if (string(GetParam()) != "bitmap") + return; + + uint64_t capacity = 0x2625a0000ull; + uint64_t alloc_unit = 0x4000; + uint64_t want_size = 0x200000; + PExtentVector allocated, tmp; + + init_alloc(capacity, alloc_unit); + + alloc->init_add_free(0x4800000, 0x100000); + alloc->init_add_free(0x4a00000, 0x100000); + + alloc->init_rm_free(0x4800000, 0x100000); + alloc->init_rm_free(0x4a00000, 0x100000); + + alloc->init_add_free(0x3f00000, 0x500000); + alloc->init_add_free(0x4500000, 0x100000); + alloc->init_add_free(0x4700000, 0x100000); + alloc->init_add_free(0x4900000, 0x100000); + alloc->init_add_free(0x4b00000, 0x200000); + + EXPECT_EQ(want_size, alloc->allocate(want_size, 0x100000, 0, 0, &tmp)); + EXPECT_EQ(tmp[0].offset, 0x4b00000); + EXPECT_EQ(tmp[0].length, 0x200000); + EXPECT_EQ(tmp.size(), 1); +} INSTANTIATE_TEST_CASE_P( Allocator, diff -Nru ceph-12.2.11/src/test/objectstore/BitAllocator_test.cc ceph-12.2.12/src/test/objectstore/BitAllocator_test.cc --- ceph-12.2.11/src/test/objectstore/BitAllocator_test.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/objectstore/BitAllocator_test.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,593 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Bitmap based in-memory allocator unit test cases. 
- * Author: Ramesh Chander, Ramesh.Chander@sandisk.com - */ - -#include "include/Context.h" -#include "os/bluestore/BitAllocator.h" -#include -#include -#include -#include -#include - - -//#define bmap_test_assert(x) ASSERT_EQ(true, (x)) -#define bmap_test_assert(x) assert((x)) -#define NUM_THREADS 16 -#define MAX_BLOCKS (1024 * 1024 * 1) - -TEST(BitAllocator, test_bmap_iter) -{ - int num_items = 5; - int off = 2; - - class BmapEntityTmp { - int64_t m_num; - int64_t m_len; - public: - void init(int index) { - m_num = index; - } - BmapEntityTmp() { - - } - BmapEntityTmp(int num) { - m_num = num; - m_len = num; - } - - int64_t get_index() { - return m_num; - } - bool is_allocated(int64_t s, int64_t num) - { - return true; - } - }; - BmapEntityTmp *obj = NULL; - int i = 0; - mempool::bluestore_alloc::vector *arr = new mempool::bluestore_alloc::vector(num_items); - for (i = 0; i < num_items; i++) { - (*arr)[i].init(i); - } - BitMapEntityIter iter = BitMapEntityIter(arr, off, false); - - i = off; - int count = 0; - int64_t last_idx = off; - while ((obj = iter.next())) { - bmap_test_assert(obj->get_index() == last_idx); - bmap_test_assert(obj->get_index() == i); - bmap_test_assert(obj == &(*arr)[i]); - last_idx = iter.index(); - i++; - count++; - } - bmap_test_assert(i == num_items); - bmap_test_assert(count == num_items - off); - - iter = BitMapEntityIter(arr, off, true); - - i = off; - last_idx = off; - count = 0; - while ((obj = iter.next())) { - bmap_test_assert(obj->get_index() == last_idx); - bmap_test_assert(obj->get_index() == i); - bmap_test_assert(obj == &(*arr)[i]); - last_idx = iter.index(); - - i = (i + 1) % num_items; - count++; - } - bmap_test_assert(i == off + 1); - bmap_test_assert(count == num_items + 1); - - delete arr; - - num_items = 4; - off = num_items - 1; - - arr = new mempool::bluestore_alloc::vector(num_items); - for (i = 0; i < num_items; i++) { - (*arr)[i].init(i); - } - iter = BitMapEntityIter(arr, off, true); - i = off; - last_idx = off; - 
count = 0; - while ((obj = static_cast(iter.next()))) { - bmap_test_assert(obj->get_index() == last_idx); - bmap_test_assert(obj->get_index() == i); - bmap_test_assert(obj == &(*arr)[i]); - last_idx = iter.index(); - i = (i + 1) % num_items; - count++; - } - bmap_test_assert(i == (off + 1)%num_items); - bmap_test_assert(count == num_items + 1); - - delete arr; - - /* - * BitMapArea Iter tests. - */ - BitMapArea *area = nullptr; - std::vector children; - children.reserve(num_items); - for (i = 0; i < num_items; i++) { - children.emplace_back(new BitMapAreaLeaf( - g_ceph_context, - BitMapArea::get_span_size(g_ceph_context), i, false)); - } - - off = 0; - BitMapAreaList *area_list = \ - new BitMapAreaList(std::vector(children)); - BmapEntityListIter area_iter = BmapEntityListIter( - area_list, (int64_t) 0); - i = off; - last_idx = off; - count = 0; - while ((area = area_iter.next())) { - bmap_test_assert(area->get_index() == last_idx); - bmap_test_assert(area->get_index() == i); - bmap_test_assert(area == children[i]); - last_idx = area_iter.index(); - i = (i + 1) % num_items; - count++; - } - bmap_test_assert(i == off); - bmap_test_assert(count == num_items); - - off = 0; - area_iter = BmapEntityListIter(area_list, off, true); - i = off; - last_idx = off; - count = 0; - while ((area = area_iter.next())) { - bmap_test_assert(area->get_index() == last_idx); - bmap_test_assert(area->get_index() == i); - bmap_test_assert(area == children[i]); - last_idx = area_iter.index(); - i = (i + 1) % num_items; - count++; - } - bmap_test_assert(i == (off + 1)%num_items); - bmap_test_assert(count == num_items + 1); - - for (i = 0; i < num_items; i++) - delete children[i]; - - delete area_list; -} - -TEST(BitAllocator, test_bmap_entry) -{ - int i = 0; - int start = 0; - int64_t scanned = 0; - int64_t allocated = 0; - int size = BmapEntry::size(); - - BmapEntry *bmap = new BmapEntry(g_ceph_context, true); - - // Clear bits one by one and check they are cleared - for (i = 0; i < size; 
i++) { - bmap->clear_bit(i); - bmap_test_assert(!bmap->check_bit(i)); - } - - // Set all bits again using set_bits - bmap->set_bits(0, size); - - // clear 4 bits at a time and then check allocated - for (i = 0; i < size/4; i++) { - bmap->clear_bits(i * 4, 4); - bmap_test_assert(!bmap->is_allocated(i * 4, 4)); - } - - // set all bits again - bmap->set_bits(0, size); - - // clear alternate bits, check and set those bits - for (i = 0; i < size/2; i++) { - bmap->clear_bit(i * 2 + 1); - bmap_test_assert(!bmap->check_bit(i * 2 + 1)); - bmap_test_assert(bmap->check_n_set_bit(i * 2 + 1)); - } - - // free 1, 2 and size bits at a time and try to find n cont bits - for (i = 0; i < size / 4; i++) { - bmap->clear_bits(i * 2 + 1, i + 1); - bmap_test_assert(!bmap->check_bit(i * 2 + 1)); - bmap_test_assert(bmap->find_n_cont_bits(i * 2 + 1, i + 1) == - i + 1); - } - - // free 1, 2 and size bits at a time and try to find any cont bits - for (i = 0; i < size / 4; i++) { - bmap->clear_bits(i * 2 + 1, i + 1); - bmap_test_assert(!bmap->is_allocated(i * 2 + 1, i + 1)); - } - - for (i = 0; i < size / 4; i++) { - bmap->clear_bits(i * 2 + 1, i + 1); - allocated = bmap->find_first_set_bits(i + 1, 0, &start, &scanned); - - bmap_test_assert(allocated == i + 1); - bmap_test_assert(scanned == ((i * 2 + 1) + (i + 1))); - bmap_test_assert(start == i * 2 + 1); - bmap->set_bits(0, BmapEntry::size()); - - } - - - - // Find few bits at end of bitmap and find those - bmap->clear_bits(0, 4); - bmap->clear_bits(BmapEntry::size() - 12, 5); - bmap->clear_bits(BmapEntry::size() - 6, 6); - allocated = bmap->find_first_set_bits(6, 0, &start, &scanned); - - bmap_test_assert(allocated == 6); - bmap_test_assert(scanned == BmapEntry::size() - 6 + 6); - bmap_test_assert(start == BmapEntry::size() - 6); - bmap_test_assert(bmap->is_allocated(start, 6)); - - delete bmap; - - { - - bmap = new BmapEntry(g_ceph_context, false); - start = -1; - scanned = 0; - allocated = 0; - allocated = bmap->find_first_set_bits(1, 1, 
&start, &scanned); - bmap_test_assert(allocated == 1); - bmap_test_assert(start == 1); - - allocated = bmap->find_first_set_bits(1, BmapEntry::size() - 2, &start, &scanned); - bmap_test_assert(allocated == 1); - bmap_test_assert(start == BmapEntry::size() - 2); - - bmap->clear_bits(0, BmapEntry::size()); - bmap->set_bits(0, BmapEntry::size() / 4); - allocated = bmap->find_first_set_bits(4, 2, &start, &scanned); - bmap_test_assert(allocated == 4); - bmap_test_assert(start == BmapEntry::size() / 4); - delete bmap; - } - - bmap = new BmapEntry(g_ceph_context, false); - bmap->set_bits(4, BmapEntry::size() - 4); - bmap_test_assert(bmap->is_allocated(4, BmapEntry::size() - 4)); - bmap_test_assert(!bmap->is_allocated(0, 4)); - bmap->set_bits(0, 4); - bmap_test_assert(bmap->is_allocated(0, BmapEntry::size())); - delete bmap; - -} - -TEST(BitAllocator, test_zone_alloc) -{ - int total_blocks = 1024; - int64_t allocated = 0; - - BitMapZone *zone = new BitMapZone(g_ceph_context, total_blocks, 0); - - // Allocate all blocks and see that it is allocating in order. - bool lock = zone->lock_excl_try(); - bmap_test_assert(lock); - - int64_t blk_size = 1024; - AllocExtentVector extents; - ExtentList *block_list = new ExtentList(&extents, blk_size); - allocated = zone->alloc_blocks_dis(zone->size() / 2, 1, 0, 0, block_list); - bmap_test_assert(allocated == zone->size() / 2); - - - { - int64_t blk_size = 1024; - AllocExtentVector extents; - ExtentList *block_list = new ExtentList(&extents, blk_size); - - zone = new BitMapZone(g_ceph_context, total_blocks, 0); - lock = zone->lock_excl_try(); - bmap_test_assert(lock); - for (int i = 0; i < zone->size(); i += 4) { - block_list->reset(); - allocated = zone->alloc_blocks_dis(1, 1, i, 0, block_list); - bmap_test_assert(allocated == 1); - EXPECT_EQ(extents[0].offset, (uint64_t) i * blk_size); - } - - for (int i = 0; i < zone->size(); i += 4) { - zone->free_blocks(i, 1); - } - } - - /* - * Min alloc size cases. 
- */ - { - int64_t blk_size = 1; - AllocExtentVector extents; - - for (int i = 1; i <= total_blocks - BmapEntry::size(); i = i << 1) { - for (int64_t j = 0; j <= BmapEntry::size(); j = 1 << j) { - extents.clear(); - ExtentList *block_list = new ExtentList(&extents, blk_size); - zone = new BitMapZone(g_ceph_context, total_blocks, 0); - lock = zone->lock_excl_try(); - bmap_test_assert(lock); - - block_list->reset(); - int64_t need_blks = (((total_blocks - j) / i) * i); - allocated = zone->alloc_blocks_dis(need_blks, i, j, 0, block_list); - bmap_test_assert(allocated == need_blks); - bmap_test_assert(extents[0].offset == (uint64_t) j); - delete block_list; - delete zone; - } - } - - //allocation in loop - { - extents.clear(); - ExtentList *block_list = new ExtentList(&extents, blk_size); - zone = new BitMapZone(g_ceph_context, total_blocks, 0); - lock = zone->lock_excl_try(); - - for (int iter = 1; iter < 5; iter++) { - for (int i = 1; i <= total_blocks; i = i << 1) { - for (int j = 0; j < total_blocks; j +=i) { - bmap_test_assert(lock); - block_list->reset(); - int64_t need_blks = i; - allocated = zone->alloc_blocks_dis(need_blks, i, 0, 0, block_list); - bmap_test_assert(allocated == need_blks); - bmap_test_assert(extents[0].offset == (uint64_t) j); - block_list->reset(); - } - { - allocated = zone->alloc_blocks_dis(1, 1, 0, 0, block_list); - bmap_test_assert(allocated == 0); - block_list->reset(); - } - - for (int j = 0; j < total_blocks; j +=i) { - zone->free_blocks(j, i); - } - } - } - delete block_list; - delete zone; - } - - { - extents.clear(); - ExtentList *block_list = new ExtentList(&extents, blk_size); - zone = new BitMapZone(g_ceph_context, total_blocks, 0); - lock = zone->lock_excl_try(); - bmap_test_assert(lock); - - block_list->reset(); - allocated = zone->alloc_blocks_dis(total_blocks + 1, total_blocks + 1, 0, 1024, block_list); - bmap_test_assert(allocated == 0); - - block_list->reset(); - allocated = zone->alloc_blocks_dis(total_blocks, total_blocks, 
1, 1024, block_list); - bmap_test_assert(allocated == 0); - - block_list->reset(); - allocated = zone->alloc_blocks_dis(total_blocks, total_blocks, 0, 0, block_list); - bmap_test_assert(allocated == total_blocks); - bmap_test_assert(extents[0].offset == 0); - - zone->free_blocks(extents[0].offset, allocated); - - delete block_list; - extents.clear(); - block_list = new ExtentList(&extents, blk_size, total_blocks / 4 * blk_size); - allocated = zone->alloc_blocks_dis(total_blocks, total_blocks / 4, 0, 0, block_list); - bmap_test_assert(allocated == total_blocks); - for (int i = 0; i < 4; i++) { - bmap_test_assert(extents[i].offset == (uint64_t) i * (total_blocks / 4)); - } - } - } -} - -TEST(BitAllocator, test_bmap_alloc) -{ - const int max_iter = 3; - - for (int round = 0; round < 3; round++) { - // Test zone of different sizes: 512, 1024, 2048 - int64_t zone_size = 512ull << round; - ostringstream val; - val << zone_size; - g_conf->set_val("bluestore_bitmapallocator_blocks_per_zone", val.str()); - - // choose randomized span_size - int64_t span_size = 512ull << (rand() % 4); - val.str(""); - val << span_size; - g_conf->set_val("bluestore_bitmapallocator_span_size", val.str()); - g_ceph_context->_conf->apply_changes(NULL); - - int64_t total_blocks = zone_size * 4; - int64_t allocated = 0; - - BitAllocator *alloc = new BitAllocator(g_ceph_context, total_blocks, - zone_size, CONCURRENT); - int64_t alloc_size = 2; - for (int64_t iter = 0; iter < max_iter; iter++) { - for (int64_t j = 0; alloc_size <= total_blocks; j++) { - int64_t blk_size = 1024; - AllocExtentVector extents; - ExtentList *block_list = new ExtentList(&extents, blk_size, alloc_size); - for (int64_t i = 0; i < total_blocks; i += alloc_size) { - bmap_test_assert(alloc->reserve_blocks(alloc_size) == true); - allocated = alloc->alloc_blocks_dis_res(alloc_size, MIN(alloc_size, zone_size), - 0, block_list); - bmap_test_assert(alloc_size == allocated); - bmap_test_assert(block_list->get_extent_count() == - 
(alloc_size > zone_size? alloc_size / zone_size: 1)); - bmap_test_assert(extents[0].offset == (uint64_t) i * blk_size); - bmap_test_assert((int64_t) extents[0].length == - ((alloc_size > zone_size? zone_size: alloc_size) * blk_size)); - block_list->reset(); - } - for (int64_t i = 0; i < total_blocks; i += alloc_size) { - alloc->free_blocks(i, alloc_size); - } - alloc_size = 2 << j; - } - } - - int64_t blk_size = 1024; - AllocExtentVector extents; - - ExtentList *block_list = new ExtentList(&extents, blk_size); - - ASSERT_EQ(alloc->reserve_blocks(alloc->size() / 2), true); - allocated = alloc->alloc_blocks_dis_res(alloc->size()/2, 1, 0, block_list); - ASSERT_EQ(alloc->size()/2, allocated); - - block_list->reset(); - ASSERT_EQ(alloc->reserve_blocks(1), true); - allocated = alloc->alloc_blocks_dis_res(1, 1, 0, block_list); - bmap_test_assert(allocated == 1); - - alloc->free_blocks(alloc->size()/2, 1); - - block_list->reset(); - ASSERT_EQ(alloc->reserve_blocks(1), true); - allocated = alloc->alloc_blocks_dis_res(1, 1, 0, block_list); - bmap_test_assert(allocated == 1); - - bmap_test_assert((int64_t) extents[0].offset == alloc->size()/2 * blk_size); - - delete block_list; - delete alloc; - - } - - // restore to typical value - g_conf->set_val("bluestore_bitmapallocator_blocks_per_zone", "1024"); - g_conf->set_val("bluestore_bitmapallocator_span_size", "1024"); - g_ceph_context->_conf->apply_changes(NULL); -} - -bool alloc_extents_max_block(BitAllocator *alloc, - int64_t max_alloc, - int64_t total_alloc) -{ - int64_t blk_size = 1; - int64_t allocated = 0; - int64_t verified = 0; - int64_t count = 0; - AllocExtentVector extents; - - ExtentList *block_list = new ExtentList(&extents, blk_size, max_alloc); - - EXPECT_EQ(alloc->reserve_blocks(total_alloc), true); - allocated = alloc->alloc_blocks_dis_res(total_alloc, blk_size, 0, block_list); - EXPECT_EQ(allocated, total_alloc); - - max_alloc = total_alloc > max_alloc? 
max_alloc: total_alloc; - - for (auto &p: extents) { - count++; - EXPECT_EQ(p.length, max_alloc); - verified += p.length; - if (verified >= total_alloc) { - break; - } - } - - EXPECT_EQ(total_alloc / max_alloc, count); - return true; -} - -TEST(BitAllocator, test_bmap_alloc2) -{ - int64_t total_blocks = 1024 * 4; - int64_t zone_size = 1024; - BitAllocator *alloc = new BitAllocator(g_ceph_context, total_blocks, - zone_size, CONCURRENT); - - alloc_extents_max_block(alloc, 1, 16); - alloc_extents_max_block(alloc, 4, 16); - alloc_extents_max_block(alloc, 16, 16); - alloc_extents_max_block(alloc, 32, 16); -} - -__thread int my_tid; - -void -do_work_dis(BitAllocator *alloc) -{ - int num_iters = 10; - int64_t alloced = 0; - int64_t num_blocks = alloc->size() / NUM_THREADS; - - AllocExtentVector extents; - ExtentList *block_list = new ExtentList(&extents, 4096); - - while (num_iters--) { - alloc_assert(alloc->reserve_blocks(num_blocks)); - alloced = alloc->alloc_blocks_dis_res(num_blocks, 1, 0, block_list); - alloc_assert(alloced == num_blocks); - - alloc_assert(alloc->is_allocated_dis(block_list, num_blocks)); - alloc->free_blocks_dis(num_blocks, block_list); - block_list->reset(); - } -} - -int tid = 0; -static bool cont = true; - -void * -worker(void *args) -{ - my_tid = __sync_fetch_and_add(&tid, 1); - BitAllocator *alloc = (BitAllocator *) args; - printf("Starting thread %d", my_tid); - do_work_dis(alloc); - - return NULL; -} - -TEST(BitAllocator, test_bmap_alloc_concurrent) -{ - int64_t total_blocks = MAX_BLOCKS; - int64_t zone_size = 1024; - pthread_t pthreads[NUM_THREADS] = {0}; - - bmap_test_assert(total_blocks <= MAX_BLOCKS); - - BitAllocator *alloc = new BitAllocator(g_ceph_context, total_blocks, - zone_size, CONCURRENT); - - for (int k = 0; k < 2; k++) { - cont = k; - printf("Spawning %d threads for parallel test. 
Mode Cont = %d.....\n", NUM_THREADS, cont); - for (int j = 0; j < NUM_THREADS; j++) { - if (pthread_create(&pthreads[j], NULL, worker, alloc)) { - printf("Unable to create worker thread.\n"); - exit(0); - } - } - - for (int j = 0; j < NUM_THREADS; j++) { - pthread_join(pthreads[j], NULL); - } - } -} diff -Nru ceph-12.2.11/src/test/objectstore/CMakeLists.txt ceph-12.2.12/src/test/objectstore/CMakeLists.txt --- ceph-12.2.11/src/test/objectstore/CMakeLists.txt 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/objectstore/CMakeLists.txt 2019-04-11 12:33:50.000000000 +0000 @@ -108,13 +108,6 @@ target_link_libraries(unittest_rocksdb_option global os ${BLKID_LIBRARIES}) if(HAVE_LIBAIO) - # unittest_bit_alloc - add_executable(unittest_bit_alloc - BitAllocator_test.cc - $ - ) - add_ceph_unittest(unittest_bit_alloc ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_bit_alloc) - target_link_libraries(unittest_bit_alloc os global) add_executable(unittest_alloc Allocator_test.cc @@ -123,6 +116,24 @@ add_ceph_unittest(unittest_alloc ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_alloc) target_link_libraries(unittest_alloc os global) + + add_executable(unittest_alloc_bench + Allocator_bench.cc + $ + ) + add_ceph_unittest(unittest_alloc_bench ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_alloc_bench) + target_link_libraries(unittest_alloc_bench os global) + + add_executable(unittest_fastbmap_allocator + fastbmap_allocator_test.cc + $ + ) + add_ceph_unittest(unittest_fastbmap_allocator ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_fastbmap_allocator) + target_link_libraries(unittest_fastbmap_allocator os global) + + set_target_properties(unittest_fastbmap_allocator PROPERTIES COMPILE_FLAGS + ${UNITTEST_CXX_FLAGS}) + # unittest_bluefs add_executable(unittest_bluefs test_bluefs.cc diff -Nru ceph-12.2.11/src/test/objectstore/fastbmap_allocator_test.cc ceph-12.2.12/src/test/objectstore/fastbmap_allocator_test.cc --- ceph-12.2.11/src/test/objectstore/fastbmap_allocator_test.cc 1970-01-01 
00:00:00.000000000 +0000 +++ ceph-12.2.12/src/test/objectstore/fastbmap_allocator_test.cc 2019-04-11 12:33:50.000000000 +0000 @@ -0,0 +1,933 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include +#include + +#include "os/bluestore/fastbmap_allocator_impl.h" + +class TestAllocatorLevel01 : public AllocatorLevel01Loose +{ +public: + void init(uint64_t capacity, uint64_t alloc_unit) + { + _init(capacity, alloc_unit); + } + interval_t allocate_l1_cont(uint64_t length, uint64_t min_length, + uint64_t pos_start, uint64_t pos_end) + { + return _allocate_l1_contiguous(length, min_length, 0, pos_start, pos_end); + } + void free_l1(const interval_t& r) + { + _free_l1(r.offset, r.length); + } +}; + +class TestAllocatorLevel02 : public AllocatorLevel02 +{ +public: + void init(uint64_t capacity, uint64_t alloc_unit) + { + _init(capacity, alloc_unit); + } + void allocate_l2(uint64_t length, uint64_t min_length, + uint64_t* allocated0, + interval_vector_t* res) + { + uint64_t allocated = 0; + uint64_t hint = 0; // trigger internal l2 hint support + _allocate_l2(length, min_length, 0, hint, &allocated, res); + *allocated0 += allocated; + } + void free_l2(const interval_vector_t& r) + { + _free_l2(r); + } +}; + +const uint64_t _1m = 1024 * 1024; +const uint64_t _2m = 2 * 1024 * 1024; + +TEST(TestAllocatorLevel01, test_l1) +{ + TestAllocatorLevel01 al1; + uint64_t num_l1_entries = 3 * 256; + uint64_t capacity = num_l1_entries * 512 * 4096; + al1.init(capacity, 0x1000); + ASSERT_EQ(capacity, al1.debug_get_free()); + + auto i1 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i1.offset, 0); + ASSERT_EQ(i1.length, 0x1000); + ASSERT_EQ(capacity - 0x1000, al1.debug_get_free()); + + auto i2 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 0x1000); + ASSERT_EQ(i2.length, 0x1000); + al1.free_l1(i2); + al1.free_l1(i1); + i1 = al1.allocate_l1_cont(0x1000, 0x1000, 0, 
num_l1_entries); + ASSERT_EQ(i1.offset, 0); + ASSERT_EQ(i1.length, 0x1000); + i2 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 0x1000); + ASSERT_EQ(i2.length, 0x1000); + al1.free_l1(i1); + al1.free_l1(i2); + + i1 = al1.allocate_l1_cont(0x2000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i1.offset, 0); + ASSERT_EQ(i1.length, 0x2000); + + i2 = al1.allocate_l1_cont(0x3000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 0x2000); + ASSERT_EQ(i2.length, 0x3000); + + al1.free_l1(i1); + al1.free_l1(i2); + + i1 = al1.allocate_l1_cont(0x2000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i1.offset, 0); + ASSERT_EQ(i1.length, 0x2000); + + i2 = al1.allocate_l1_cont(2 * 1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 2 * 1024 * 1024); + ASSERT_EQ(i2.length, 2 * 1024 * 1024); + + al1.free_l1(i1); + i1 = al1.allocate_l1_cont(1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i1.offset, 0); + ASSERT_EQ(i1.length, 1024 * 1024); + + auto i3 = al1.allocate_l1_cont(1024 * 1024 + 0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i3.offset, 2 * 2 * 1024 * 1024); + ASSERT_EQ(i3.length, 1024 * 1024 + 0x1000); + + // here we have the following layout: + // Alloc: 0~1M, 2M~2M, 4M~1M+4K + // Free: 1M~1M, 4M+4K ~ 2M-4K, 6M ~... 
+ // + auto i4 = al1.allocate_l1_cont(1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i4.offset, 1 * 1024 * 1024); + ASSERT_EQ(i4.length, 1024 * 1024); + al1.free_l1(i4); + + i4 = al1.allocate_l1_cont(1024 * 1024 - 0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i4.offset, 5 * 1024 * 1024 + 0x1000); + ASSERT_EQ(i4.length, 1024 * 1024 - 0x1000); + al1.free_l1(i4); + + i4 = al1.allocate_l1_cont(1024 * 1024 + 0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i4.offset, 6 * 1024 * 1024); + //ASSERT_EQ(i4.offset, 5 * 1024 * 1024 + 0x1000); + ASSERT_EQ(i4.length, 1024 * 1024 + 0x1000); + + al1.free_l1(i1); + al1.free_l1(i2); + al1.free_l1(i3); + al1.free_l1(i4); + + i1 = al1.allocate_l1_cont(1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i1.offset, 0); + ASSERT_EQ(i1.length, 1024 * 1024); + + i2 = al1.allocate_l1_cont(1024 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 1 * 1024 * 1024); + ASSERT_EQ(i2.length, 1024 * 1024 ); + + i3 = al1.allocate_l1_cont(512 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i3.offset, 2 * 1024 * 1024); + ASSERT_EQ(i3.length, 512 * 1024); + + i4 = al1.allocate_l1_cont(1536 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i4.offset, (2 * 1024 + 512) * 1024 ); + ASSERT_EQ(i4.length, 1536 * 1024); + // making a hole 1.5 Mb length + al1.free_l1(i2); + al1.free_l1(i3); + // and trying to fill it + i2 = al1.allocate_l1_cont(1536 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 1024 * 1024); + ASSERT_EQ(i2.length, 1536 * 1024); + + al1.free_l1(i2); + // and trying to fill it partially + i2 = al1.allocate_l1_cont(1528 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 1024 * 1024); + ASSERT_EQ(i2.length, 1528 * 1024); + + i3 = al1.allocate_l1_cont(8 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i3.offset, 2552 * 1024); + ASSERT_EQ(i3.length, 8 * 1024); + + al1.free_l1(i2); + // here we have the following layout: + // Alloc: 0~1M, 2552K~8K, num_l1_entries0K~1.5M + // Free: 1M~1528K, 4M ~... 
+ // + i2 = al1.allocate_l1_cont(1536 * 1024, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.offset, 4 * 1024 * 1024); + ASSERT_EQ(i2.length, 1536 * 1024); + + al1.free_l1(i1); + al1.free_l1(i2); + al1.free_l1(i3); + al1.free_l1(i4); + ASSERT_EQ(capacity, al1.debug_get_free()); + + for (uint64_t i = 0; i < capacity; i += _2m) { + i1 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries); + ASSERT_EQ(i1.offset, i); + ASSERT_EQ(i1.length, _2m); + } + ASSERT_EQ(0, al1.debug_get_free()); + i2 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries); + ASSERT_EQ(i2.length, 0); + ASSERT_EQ(0, al1.debug_get_free()); + + al1.free_l1(i1); + i2 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries); + ASSERT_EQ(i2, i1); + al1.free_l1(i2); + i2 = al1.allocate_l1_cont(_1m, _1m, 0, num_l1_entries); + ASSERT_EQ(i2.offset, i1.offset); + ASSERT_EQ(i2.length, _1m); + + i3 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries); + ASSERT_EQ(i3.length, 0); + + i3 = al1.allocate_l1_cont(_2m, _1m, 0, num_l1_entries); + ASSERT_EQ(i3.length, _1m); + + i4 = al1.allocate_l1_cont(_2m, _1m, 0, num_l1_entries); + ASSERT_EQ(i4.length, 0); + + al1.free_l1(i2); + i2 = al1.allocate_l1_cont(_2m, _2m, 0, num_l1_entries); + ASSERT_EQ(i2.length, 0); + + i2 = al1.allocate_l1_cont(_2m, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.length, _1m); + + al1.free_l1(i2); + al1.free_l1(i3); + ASSERT_EQ(_2m, al1.debug_get_free()); + + i1 = al1.allocate_l1_cont(_2m - 3 * 0x1000, 0x1000, 0, num_l1_entries); + i2 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries); + i3 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries); + i4 = al1.allocate_l1_cont(0x1000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(0, al1.debug_get_free()); + + al1.free_l1(i2); + al1.free_l1(i4); + + i2 = al1.allocate_l1_cont(0x4000, 0x2000, 0, num_l1_entries); + ASSERT_EQ(i2.length, 0); + i2 = al1.allocate_l1_cont(0x4000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i2.length, 0x1000); + + al1.free_l1(i3); + i3 = al1.allocate_l1_cont(0x6000, 0x3000, 0, 
num_l1_entries); + ASSERT_EQ(i3.length, 0); + i3 = al1.allocate_l1_cont(0x6000, 0x1000, 0, num_l1_entries); + ASSERT_EQ(i3.length, 0x2000); + ASSERT_EQ(0, al1.debug_get_free()); + + std::cout << "Done L1" << std::endl; +} + +TEST(TestAllocatorLevel01, test_l2) +{ + TestAllocatorLevel02 al2; + uint64_t num_l2_entries = 64;// *512; + uint64_t capacity = num_l2_entries * 256 * 512 * 4096; + al2.init(capacity, 0x1000); + std::cout << "Init L2" << std::endl; + + uint64_t allocated1 = 0; + interval_vector_t a1; + al2.allocate_l2(0x2000, 0x2000, &allocated1, &a1); + ASSERT_EQ(allocated1, 0x2000); + ASSERT_EQ(a1[0].offset, 0); + ASSERT_EQ(a1[0].length, 0x2000); + + // limit query range in debug_get_free for the sake of performance + ASSERT_EQ(0x2000, al2.debug_get_allocated(0, 1)); + ASSERT_EQ(0, al2.debug_get_allocated(1, 2)); + + uint64_t allocated2 = 0; + interval_vector_t a2; + al2.allocate_l2(0x2000, 0x2000, &allocated2, &a2); + ASSERT_EQ(allocated2, 0x2000); + ASSERT_EQ(a2[0].offset, 0x2000); + ASSERT_EQ(a2[0].length, 0x2000); + // limit query range in debug_get_free for the sake of performance + ASSERT_EQ(0x4000, al2.debug_get_allocated(0, 1)); + ASSERT_EQ(0, al2.debug_get_allocated(1, 2)); + + al2.free_l2(a1); + + allocated2 = 0; + a2.clear(); + al2.allocate_l2(0x1000, 0x1000, &allocated2, &a2); + ASSERT_EQ(allocated2, 0x1000); + ASSERT_EQ(a2[0].offset, 0x0000); + ASSERT_EQ(a2[0].length, 0x1000); + // limit query range in debug_get_free for the sake of performance + ASSERT_EQ(0x3000, al2.debug_get_allocated(0, 1)); + ASSERT_EQ(0, al2.debug_get_allocated(1, 2)); + + uint64_t allocated3 = 0; + interval_vector_t a3; + al2.allocate_l2(0x2000, 0x1000, &allocated3, &a3); + ASSERT_EQ(allocated3, 0x2000); + ASSERT_EQ(a3.size(), 2); + ASSERT_EQ(a3[0].offset, 0x1000); + ASSERT_EQ(a3[0].length, 0x1000); + ASSERT_EQ(a3[1].offset, 0x4000); + ASSERT_EQ(a3[1].length, 0x1000); + // limit query range in debug_get_free for the sake of performance + ASSERT_EQ(0x5000, 
al2.debug_get_allocated(0, 1)); + ASSERT_EQ(0, al2.debug_get_allocated(1, 2)); + { + interval_vector_t r; + r.emplace_back(0x0, 0x5000); + al2.free_l2(r); + } + + a3.clear(); + allocated3 = 0; + al2.allocate_l2(_1m, _1m, &allocated3, &a3); + ASSERT_EQ(a3.size(), 1); + ASSERT_EQ(a3[0].offset, 0); + ASSERT_EQ(a3[0].length, _1m); + + al2.free_l2(a3); + + a3.clear(); + allocated3 = 0; + al2.allocate_l2(4 * _1m, _1m, &allocated3, &a3); + ASSERT_EQ(a3.size(), 1); + ASSERT_EQ(a3[0].offset, 0); + ASSERT_EQ(a3[0].length, 4 * _1m); + + al2.free_l2(a3); + +#ifndef _DEBUG + for (uint64_t i = 0; i < capacity; i += 0x1000) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, 0x1000); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc1 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } +#else + for (uint64_t i = 0; i < capacity; i += _2m) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_2m, _2m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, _2m); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc1 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } +#endif + + ASSERT_EQ(0, al2.debug_get_free()); + for (uint64_t i = 0; i < capacity; i += _1m) { + interval_vector_t r; + r.emplace_back(i, _1m); + al2.free_l2(r); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "free1 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_EQ(capacity, al2.debug_get_free()); + + for (uint64_t i = 0; i < capacity; i += _1m) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1); + ASSERT_EQ(allocated4, _1m); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, _1m); + if (0 == (i % (1 * 1024 * 
_1m))) { + std::cout << "alloc2 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_EQ(0, al2.debug_get_free()); + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0); + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0); + + for (uint64_t i = 0; i < capacity; i += 0x2000) { + interval_vector_t r; + r.emplace_back(i, 0x1000); + al2.free_l2(r); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "free2 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_EQ(capacity / 2, al2.debug_get_free()); + + // unable to allocate due to fragmentation + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0); + + for (uint64_t i = 0; i < capacity; i += 2 * _1m) { + a4.clear(); + allocated4 = 0; + al2.allocate_l2(_1m, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), _1m / 0x1000); + ASSERT_EQ(allocated4, _1m); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, 0x1000); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "alloc3 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_EQ(0, al2.debug_get_free()); + + std::cout << "Done L2" << std::endl; +} + +TEST(TestAllocatorLevel01, test_l2_huge) +{ + TestAllocatorLevel02 al2; + uint64_t num_l2_entries = 4 * 512; + uint64_t capacity = num_l2_entries * 256 * 512 * 4096; // 1 TB + al2.init(capacity, 0x1000); + std::cout << "Init L2 Huge" << std::endl; + + for (uint64_t i = 0; i < capacity; i += _1m) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1); + ASSERT_EQ(allocated4, 0x1000); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, 0x1000); + + allocated4 = 0; + a4.clear(); + al2.allocate_l2(_1m - 0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1); + ASSERT_EQ(allocated4, _1m - 0x1000); + 
ASSERT_EQ(a4[0].offset, i + 0x1000); + ASSERT_EQ(a4[0].length, _1m - 0x1000); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "allocH " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + for (uint64_t i = 0; i < capacity; i += _1m) { + interval_vector_t a4; + a4.emplace_back(i, 0x1000); + al2.free_l2(a4); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "freeH1 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + { + std::cout << "Try" << std::endl; + time_t t = time(NULL); + for (int i = 0; i < 10; ++i) { + uint64_t allocated = 0; + interval_vector_t a; + al2.allocate_l2(0x2000, 0x2000, &allocated, &a); + ASSERT_EQ(a.size(), 0); + } + std::cout << "End try in " << time(NULL) - t << " seconds" << std::endl; + } + { + std::cout << "Try" << std::endl; + time_t t = time(NULL); + for (int i = 0; i < 10; ++i) { + uint64_t allocated = 0; + interval_vector_t a; + al2.allocate_l2(_2m, _2m, &allocated, &a); + ASSERT_EQ(a.size(), 0); + } + std::cout << "End try in " << time(NULL) - t << " seconds" << std::endl; + } + + ASSERT_EQ((capacity / _1m) * 0x1000, al2.debug_get_free()); + + std::cout << "Done L2 Huge" << std::endl; +} + +TEST(TestAllocatorLevel01, test_l2_unaligned) +{ + { + TestAllocatorLevel02 al2; + uint64_t num_l2_entries = 3; + uint64_t capacity = num_l2_entries * 256 * 512 * 4096; // 3x512 MB + al2.init(capacity, 0x1000); + std::cout << "Init L2 Unaligned" << std::endl; + + for (uint64_t i = 0; i < capacity; i += _1m / 2) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m / 2, _1m / 2, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1); + ASSERT_EQ(allocated4, _1m / 2); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, _1m / 2); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "allocU " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_EQ(0, al2.debug_get_free()); + { + // no space to allocate + uint64_t allocated4 = 0; + 
interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0); + } + } + { + TestAllocatorLevel02 al2; + uint64_t capacity = 500 * 512 * 4096; // 500x2 MB + al2.init(capacity, 0x1000); + std::cout << ("Init L2 Unaligned2\n"); + for (uint64_t i = 0; i < capacity; i += _1m / 2) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m / 2, _1m / 2, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1); + ASSERT_EQ(allocated4, _1m / 2); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, _1m / 2); + if (0 == (i % (1 * 1024 * _1m))) { + std::cout << "allocU2 " << i / 1024 / 1024 << " mb of " + << capacity / 1024 / 1024 << std::endl; + } + } + ASSERT_EQ(0, al2.debug_get_free()); + { + // no space to allocate + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0); + } + } + + { + TestAllocatorLevel02 al2; + uint64_t capacity = 100 * 512 * 4096 + 127 * 4096; + al2.init(capacity, 0x1000); + std::cout << "Init L2 Unaligned2" << std::endl; + for (uint64_t i = 0; i < capacity; i += 0x1000) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1); + ASSERT_EQ(allocated4, 0x1000); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, 0x1000); + } + ASSERT_EQ(0, al2.debug_get_free()); + { + // no space to allocate + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0); + } + } + { + TestAllocatorLevel02 al2; + uint64_t capacity = 3 * 4096; + al2.init(capacity, 0x1000); + std::cout << "Init L2 Unaligned2" << std::endl; + for (uint64_t i = 0; i < capacity; i += 0x1000) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1); + ASSERT_EQ(allocated4, 0x1000); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, 0x1000); + } + 
ASSERT_EQ(0, al2.debug_get_free()); + { + // no space to allocate + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 0); + } + } + + std::cout << "Done L2 Unaligned" << std::endl; +} + +TEST(TestAllocatorLevel01, test_l2_contiguous_alignment) +{ + { + TestAllocatorLevel02 al2; + uint64_t num_l2_entries = 3; + uint64_t capacity = num_l2_entries * 256 * 512 * 4096; // 3x512 MB + uint64_t num_chunks = capacity / 4096; + al2.init(capacity, 4096); + std::cout << "Init L2 cont aligned" << std::endl; + + std::map bins_overall; + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 1u); +// std::cout<first << std::endl; + ASSERT_EQ(bins_overall[cbits(num_chunks) - 1], 1u); + + for (uint64_t i = 0; i < capacity / 2; i += _1m) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1); + ASSERT_EQ(allocated4, _1m); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, _1m); + } + ASSERT_EQ(capacity / 2, al2.debug_get_free()); + + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + + { + size_t to_release = 2 * _1m + 0x1000; + // release 2M + 4K at the beginning + interval_vector_t r; + r.emplace_back(0, to_release); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 2u); + ASSERT_EQ(bins_overall[cbits(to_release / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // allocate 4K within the deallocated range + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x1000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, 0x1000u); + ASSERT_EQ(a4[0].offset, 0u); + ASSERT_EQ(a4[0].length, 0x1000u); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 
2u); + ASSERT_EQ(bins_overall[cbits(2 * _1m / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // allocate 1M - should go to the second 1M chunk + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1); + ASSERT_EQ(allocated4, _1m); + ASSERT_EQ(a4[0].offset, _1m); + ASSERT_EQ(a4[0].length, _1m); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 3u); + ASSERT_EQ(bins_overall[0], 1u); + ASSERT_EQ(bins_overall[cbits((_1m - 0x1000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // and allocate yet another 8K within the deallocated range + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x2000, 0x1000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, 0x2000u); + ASSERT_EQ(a4[0].offset, 0x1000u); + ASSERT_EQ(a4[0].length, 0x2000u); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall[0], 1u); + ASSERT_EQ(bins_overall[cbits((_1m - 0x3000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // release just allocated 1M + interval_vector_t r; + r.emplace_back(_1m, _1m); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 2u); + ASSERT_EQ(bins_overall[cbits((2 * _1m - 0x3000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // allocate 3M - should go to the second 1M chunk and @capacity/2 + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(3 * _1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 2); + ASSERT_EQ(allocated4, 3 * _1m); + ASSERT_EQ(a4[0].offset, _1m); + ASSERT_EQ(a4[0].length, _1m); + ASSERT_EQ(a4[1].offset, capacity / 2); + ASSERT_EQ(a4[1].length, 2 * _1m); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 3u); + 
ASSERT_EQ(bins_overall[0], 1u); + ASSERT_EQ(bins_overall[cbits((_1m - 0x3000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((num_chunks - 512) / 2) - 1], 1u); + } + { + // release allocated 1M in the second meg chunk except + // the first 4K chunk + interval_vector_t r; + r.emplace_back(_1m + 0x1000, _1m); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 3u); + ASSERT_EQ(bins_overall[cbits(_1m / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((_1m - 0x3000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((num_chunks - 512) / 2) - 1], 1u); + } + { + // release 2M @(capacity / 2) + interval_vector_t r; + r.emplace_back(capacity / 2, 2 * _1m); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 3u); + ASSERT_EQ(bins_overall[cbits(_1m / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((_1m - 0x3000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((num_chunks) / 2) - 1], 1u); + } + { + // allocate 4x512K - should go to the second halves of + // the first and second 1M chunks and @(capacity / 2) + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(2 * _1m, _1m / 2, &allocated4, &a4); + ASSERT_EQ(a4.size(), 3); + ASSERT_EQ(allocated4, 2 * _1m); + ASSERT_EQ(a4[0].offset, _1m / 2); + ASSERT_EQ(a4[0].length, _1m / 2); + ASSERT_EQ(a4[1].offset, _1m + _1m / 2); + ASSERT_EQ(a4[1].length, _1m / 2); + ASSERT_EQ(a4[2].offset, capacity / 2); + ASSERT_EQ(a4[2].length, _1m); + + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 3u); + ASSERT_EQ(bins_overall[0], 1u); + // below we have 512K - 4K & 512K - 12K chunks which both fit into + // the same bin = 6 + ASSERT_EQ(bins_overall[6], 2u); + ASSERT_EQ(bins_overall[cbits((num_chunks - 256) / 2) - 1], 1u); + + } + { + // cleanup first 2M except except the last 4K chunk + interval_vector_t r; + r.emplace_back(0, 2 * _1m - 0x1000); + al2.free_l2(r); 
+ bins_overall.clear(); + al2.collect_stats(bins_overall); + + ASSERT_EQ(bins_overall.size(), 3u); + ASSERT_EQ(bins_overall[0], 1u); + ASSERT_EQ(bins_overall[cbits((_2m - 0x1000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits((num_chunks - 256) / 2) - 1], 1u); + } + { + // release 2M @(capacity / 2) + interval_vector_t r; + r.emplace_back(capacity / 2, 2 * _1m); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + + ASSERT_EQ(bins_overall.size(), 3u); + ASSERT_EQ(bins_overall[0], 1u); + ASSERT_EQ(bins_overall[cbits((_2m - 0x1000) / 0x1000) - 1], 1u); + ASSERT_EQ(bins_overall[cbits(num_chunks / 2) - 1], 1u); + } + { + // allocate 132M using 4M granularity should go to (capacity / 2) + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(132 * _1m, 4 * _1m , &allocated4, &a4); + ASSERT_EQ(a4.size(), 1); + ASSERT_EQ(a4[0].offset, capacity / 2); + ASSERT_EQ(a4[0].length, 132 * _1m); + + bins_overall.clear(); + al2.collect_stats(bins_overall); + ASSERT_EQ(bins_overall.size(), 3u); + } + { + // cleanup left 4K chunk in the first 2M + interval_vector_t r; + r.emplace_back(2 * _1m - 0x1000, 0x1000); + al2.free_l2(r); + bins_overall.clear(); + al2.collect_stats(bins_overall); + + ASSERT_EQ(bins_overall.size(), 2u); + } + { + // release 132M @(capacity / 2) + interval_vector_t r; + r.emplace_back(capacity / 2, 132 * _1m); + al2.free_l2(r); + } + { + // allocate 132M using 2M granularity should go to the first chunk and to + // (capacity / 2) + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(132 * _1m, 2 * _1m , &allocated4, &a4); + ASSERT_EQ(a4.size(), 2); + ASSERT_EQ(a4[0].offset, 0); + ASSERT_EQ(a4[0].length, 2 * _1m); + ASSERT_EQ(a4[1].offset, capacity / 2); + ASSERT_EQ(a4[1].length, 130 * _1m); + } + { + // release 130M @(capacity / 2) + interval_vector_t r; + r.emplace_back(capacity / 2, 132 * _1m); + al2.free_l2(r); + } + { + // release 4K~16K + // release 28K~32K + // release 68K~24K + interval_vector_t 
r; + r.emplace_back(0x1000, 0x4000); + r.emplace_back(0x7000, 0x8000); + r.emplace_back(0x11000, 0x6000); + al2.free_l2(r); + } + { + // allocate 32K using 16K granularity - should bypass the first + // unaligned extent, use the second free extent partially given + // the 16K alignment and then fallback to capacity / 2 + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x8000, 0x4000, &allocated4, &a4); + ASSERT_EQ(a4.size(), 2); + ASSERT_EQ(a4[0].offset, 0x8000); + ASSERT_EQ(a4[0].length, 0x4000); + ASSERT_EQ(a4[1].offset, capacity / 2); + ASSERT_EQ(a4[1].length, 0x4000); + } + + } + std::cout << "Done L2 cont aligned" << std::endl; +} + +TEST(TestAllocatorLevel01, test_4G_alloc_bug) +{ + { + TestAllocatorLevel02 al2; + uint64_t capacity = 0x8000 * _1m; // = 32GB + al2.init(capacity, 0x10000); + std::cout << "Init L2 cont aligned" << std::endl; + + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); // the bug caused no allocations here + ASSERT_EQ(allocated4, _1m); + ASSERT_EQ(a4[0].offset, 0u); + ASSERT_EQ(a4[0].length, _1m); + } +} + +TEST(TestAllocatorLevel01, test_4G_alloc_bug2) +{ + { + TestAllocatorLevel02 al2; + uint64_t capacity = 0x8000 * _1m; // = 32GB + al2.init(capacity, 0x10000); + + for (uint64_t i = 0; i < capacity; i += _1m) { + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(_1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 1u); + ASSERT_EQ(allocated4, _1m); + ASSERT_EQ(a4[0].offset, i); + ASSERT_EQ(a4[0].length, _1m); + } + ASSERT_EQ(0u , al2.debug_get_free()); + + interval_vector_t r; + r.emplace_back(0x5fec30000, 0x13d0000); + r.emplace_back(0x628000000, 0x80000000); + r.emplace_back(0x6a8000000, 0x80000000); + r.emplace_back(0x728100000, 0x70000); + al2.free_l2(r); + + std::map bins_overall; + al2.collect_stats(bins_overall); + + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(0x3e000000, _1m, &allocated4, &a4); + 
ASSERT_EQ(a4.size(), 2u); + ASSERT_EQ(allocated4, 0x3e000000u); + ASSERT_EQ(a4[0].offset, 0x5fed00000u); + ASSERT_EQ(a4[0].length, 0x1300000u); + ASSERT_EQ(a4[1].offset, 0x628000000u); + ASSERT_EQ(a4[1].length, 0x3cd00000u); + } +} + +TEST(TestAllocatorLevel01, test_4G_alloc_bug3) +{ + { + TestAllocatorLevel02 al2; + uint64_t capacity = 0x8000 * _1m; // = 32GB + al2.init(capacity, 0x10000); + std::cout << "Init L2 cont aligned" << std::endl; + + uint64_t allocated4 = 0; + interval_vector_t a4; + al2.allocate_l2(4096ull * _1m, _1m, &allocated4, &a4); + ASSERT_EQ(a4.size(), 2u); // allocator has to split into 2 allocations + ASSERT_EQ(allocated4, 4096ull * _1m); + ASSERT_EQ(a4[0].offset, 0u); + ASSERT_EQ(a4[0].length, 2048ull * _1m); + ASSERT_EQ(a4[1].offset, 2048ull * _1m); + ASSERT_EQ(a4[1].length, 2048ull * _1m); + } +} diff -Nru ceph-12.2.11/src/test/objectstore/test_bluestore_types.cc ceph-12.2.12/src/test/objectstore/test_bluestore_types.cc --- ceph-12.2.11/src/test/objectstore/test_bluestore_types.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/objectstore/test_bluestore_types.cc 2019-04-11 12:33:50.000000000 +0000 @@ -1242,7 +1242,7 @@ ASSERT_EQ(saving, 1); auto& to_collect = gc.get_extents_to_collect(); ASSERT_EQ(to_collect.size(), 1u); - ASSERT_EQ(to_collect[0], AllocExtent(100,10) ); + ASSERT_EQ(to_collect[0], bluestore_pextent_t(100,10) ); em.clear(); old_extents.clear(); @@ -1312,10 +1312,10 @@ ASSERT_EQ(saving, 2); auto& to_collect = gc.get_extents_to_collect(); ASSERT_EQ(to_collect.size(), 2u); - ASSERT_TRUE(to_collect[0] == AllocExtent(0x0,0x8000) || - to_collect[1] == AllocExtent(0x0,0x8000)); - ASSERT_TRUE(to_collect[0] == AllocExtent(0x3f000,0x1000) || - to_collect[1] == AllocExtent(0x3f000,0x1000)); + ASSERT_TRUE(to_collect[0] == bluestore_pextent_t(0x0,0x8000) || + to_collect[1] == bluestore_pextent_t(0x0,0x8000)); + ASSERT_TRUE(to_collect[0] == bluestore_pextent_t(0x3f000,0x1000) || + to_collect[1] == 
bluestore_pextent_t(0x3f000,0x1000)); em.clear(); old_extents.clear(); @@ -1433,10 +1433,10 @@ ASSERT_EQ(saving, 2); auto& to_collect = gc.get_extents_to_collect(); ASSERT_EQ(to_collect.size(), 2u); - ASSERT_TRUE(to_collect[0] == AllocExtent(0x0,0x8000) || - to_collect[1] == AllocExtent(0x0,0x8000)); - ASSERT_TRUE(to_collect[0] == AllocExtent(0x3f000,0x1000) || - to_collect[1] == AllocExtent(0x3f000,0x1000)); + ASSERT_TRUE(to_collect[0] == bluestore_pextent_t(0x0,0x8000) || + to_collect[1] == bluestore_pextent_t(0x0,0x8000)); + ASSERT_TRUE(to_collect[0] == bluestore_pextent_t(0x3f000,0x1000) || + to_collect[1] == bluestore_pextent_t(0x3f000,0x1000)); em.clear(); old_extents.clear(); diff -Nru ceph-12.2.11/src/test/opensuse-13.2/ceph.spec.in ceph-12.2.12/src/test/opensuse-13.2/ceph.spec.in --- ceph-12.2.11/src/test/opensuse-13.2/ceph.spec.in 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/opensuse-13.2/ceph.spec.in 2019-04-11 12:33:50.000000000 +0000 @@ -382,7 +382,7 @@ %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} Requires: librados2 = %{_epoch_prefix}%{version}-%{release} %description -n rbd-mirror Daemon for mirroring RBD images between Ceph clusters, streaming @@ -403,7 +403,7 @@ %if 0%{?suse_version} Group: System/Filesystems %endif -Requires: ceph-common = %{_epoch_prefix}%{version}-%{release} +Requires: ceph-base = %{_epoch_prefix}%{version}-%{release} %if 0%{with selinux} Requires: ceph-selinux = %{_epoch_prefix}%{version}-%{release} %endif diff -Nru ceph-12.2.11/src/test/osd/TestOSDMap.cc ceph-12.2.12/src/test/osd/TestOSDMap.cc --- ceph-12.2.11/src/test/osd/TestOSDMap.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/osd/TestOSDMap.cc 2019-04-11 12:33:50.000000000 +0000 @@ -28,7 +28,7 @@ } class OSDMapTest : public testing::Test { - const static int num_osds = 6; + int num_osds = 6; public: OSDMap 
osdmap; OSDMapMapping mapping; @@ -38,7 +38,8 @@ OSDMapTest() {} - void set_up_map() { + void set_up_map(int new_num_osds = 6, bool no_default_pools = false) { + num_osds = new_num_osds; uuid_d fsid; osdmap.build_simple(g_ceph_context, 0, fsid, num_osds); OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); @@ -57,6 +58,8 @@ pending_inc.new_uuid[i] = sample_uuid; } osdmap.apply_incremental(pending_inc); + if (no_default_pools) // do not create any default pool(s) + return; // Create an EC ruleset and a pool using it int r = osdmap.crush->add_simple_rule( @@ -92,17 +95,17 @@ osdmap.apply_incremental(new_pool_inc); } unsigned int get_num_osds() { return num_osds; } - void get_crush(CrushWrapper& newcrush) { + void get_crush(const OSDMap& tmap, CrushWrapper& newcrush) { bufferlist bl; - osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + tmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT); bufferlist::iterator p = bl.begin(); newcrush.decode(p); } - int crush_move(const string &name, const vector &argvec) { + int crush_move(OSDMap& tmap, const string &name, const vector &argvec) { map loc; CrushWrapper::parse_loc_map(argvec, &loc); CrushWrapper newcrush; - get_crush(newcrush); + get_crush(tmap, newcrush); if (!newcrush.name_exists(name)) { return -ENOENT; } @@ -115,10 +118,10 @@ err = newcrush.move_bucket(g_ceph_context, id, loc); } if (err >= 0) { - OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + OSDMap::Incremental pending_inc(tmap.get_epoch() + 1); pending_inc.crush.clear(); newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT); - osdmap.apply_incremental(pending_inc); + tmap.apply_incremental(pending_inc); err = 0; } } else { @@ -134,7 +137,7 @@ return osdmap.crush->get_rule_id(name); } CrushWrapper newcrush; - get_crush(newcrush); + get_crush(osdmap, newcrush); string device_class; stringstream ss; int ruleno = newcrush.add_simple_rule( @@ -559,7 +562,7 @@ move_to.push_back("root=default"); string host_loc = "host=" + 
host_name.str(); move_to.push_back(host_loc); - int r = crush_move(osd_name.str(), move_to); + int r = crush_move(osdmap, osd_name.str(), move_to); ASSERT_EQ(0, r); } const string upmap_rule = "upmap"; @@ -736,6 +739,132 @@ } { + // http://tracker.ceph.com/issues/37968 + + // build a temporary crush topology of 2 hosts, 3 osds per host + OSDMap tmp; // use a tmpmap here, so we do not dirty origin map.. + tmp.deepish_copy_from(osdmap); + const int expected_host_num = 2; + int osd_per_host = get_num_osds() / expected_host_num; + ASSERT_GE(osd_per_host, 3); + int index = 0; + for (int i = 0; i < (int)get_num_osds(); i++) { + if (i && i % osd_per_host == 0) { + ++index; + } + stringstream osd_name; + stringstream host_name; + vector move_to; + osd_name << "osd." << i; + host_name << "host-" << index; + move_to.push_back("root=default"); + string host_loc = "host=" + host_name.str(); + move_to.push_back(host_loc); + auto r = crush_move(tmp, osd_name.str(), move_to); + ASSERT_EQ(0, r); + } + + // build crush rule + CrushWrapper crush; + get_crush(tmp, crush); + string rule_name = "rule_37968"; + int rule_type = pg_pool_t::TYPE_ERASURE; + ASSERT_TRUE(!crush.rule_exists(rule_name)); + int rno; + for (rno = 0; rno < crush.get_max_rules(); rno++) { + if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno)) + break; + } + string root_name = "default"; + int root = crush.get_item_id(root_name); + int min_size = 3; + int max_size = 4; + int steps = 6; + crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size); + int step = 0; + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 1 /* host*/); + crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 0 /* osd */); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + 
ASSERT_TRUE(step == steps); + auto r = crush_add_rule(crush.get_crush_map(), rule, rno); + ASSERT_TRUE(r >= 0); + crush.set_rule_name(rno, rule_name); + { + OSDMap::Incremental pending_inc(tmp.get_epoch() + 1); + pending_inc.crush.clear(); + crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT); + tmp.apply_incremental(pending_inc); + } + + // create a erasuce-coded pool referencing the above rule + int64_t pool_37968; + { + OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1); + new_pool_inc.new_pool_max = tmp.get_pool_max(); + new_pool_inc.fsid = tmp.get_fsid(); + pg_pool_t empty; + pool_37968 = ++new_pool_inc.new_pool_max; + pg_pool_t *p = new_pool_inc.get_new_pool(pool_37968, &empty); + p->size = 4; + p->set_pg_num(8); + p->set_pgp_num(8); + p->type = pg_pool_t::TYPE_ERASURE; + p->crush_rule = rno; + p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + new_pool_inc.new_pool_names[pool_37968] = "pool_37968"; + tmp.apply_incremental(new_pool_inc); + } + + pg_t ec_pg(0, pool_37968); + pg_t ec_pgid = tmp.raw_pg_to_pg(ec_pg); + int from = -1; + int to = -1; + { + // insert a valid pg_upmap_item + vector ec_up; + int ec_up_primary; + tmp.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary); + ASSERT_TRUE(ec_up.size() == 4); + from = *(ec_up.begin()); + ASSERT_TRUE(from >= 0); + auto parent = tmp.crush->get_parent_of_type(from, 1 /* host */, rno); + ASSERT_TRUE(parent < 0); + // pick an osd of the same parent with *from* + for (int i = 0; i < (int)get_num_osds(); i++) { + if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) { + auto p = tmp.crush->get_parent_of_type(i, 1 /* host */, rno); + if (p == parent) { + to = i; + break; + } + } + } + ASSERT_TRUE(to >= 0); + ASSERT_TRUE(from != to); + vector> new_pg_upmap_items; + new_pg_upmap_items.push_back(make_pair(from, to)); + OSDMap::Incremental pending_inc(tmp.get_epoch() + 1); + pending_inc.new_pg_upmap_items[ec_pgid] = + mempool::osdmap::vector>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + 
tmp.apply_incremental(pending_inc); + ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid)); + } + { + // *maybe_remove_pg_upmaps* should not remove the above upmap_item + OSDMap::Incremental pending_inc(tmp.get_epoch() + 1); + OSDMap nextmap; + nextmap.deepish_copy_from(tmp); + nextmap.maybe_remove_pg_upmaps(g_ceph_context, nextmap, &pending_inc); + tmp.apply_incremental(pending_inc); + ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid)); + } + } + + { // TEST pg_upmap { // STEP-1: enumerate all children of up[0]'s parent, @@ -941,6 +1070,229 @@ } } +TEST_F(OSDMapTest, BUG_38897) { + // http://tracker.ceph.com/issues/38897 + // build a fresh map with 12 OSDs, without any default pools + set_up_map(12, true); + const string pool_1("pool1"); + const string pool_2("pool2"); + int64_t pool_1_id = -1; + + { + // build customized crush rule for "pool1" + string host_name = "host_for_pool_1"; + // build a customized host to capture osd.1~5 + for (int i = 1; i < 5; i++) { + stringstream osd_name; + vector move_to; + osd_name << "osd." 
<< i; + move_to.push_back("root=default"); + string host_loc = "host=" + host_name; + move_to.push_back(host_loc); + auto r = crush_move(osdmap, osd_name.str(), move_to); + ASSERT_EQ(0, r); + } + CrushWrapper crush; + get_crush(osdmap, crush); + auto host_id = crush.get_item_id(host_name); + ASSERT_TRUE(host_id < 0); + string rule_name = "rule_for_pool1"; + int rule_type = pg_pool_t::TYPE_REPLICATED; + ASSERT_TRUE(!crush.rule_exists(rule_name)); + int rno; + for (rno = 0; rno < crush.get_max_rules(); rno++) { + if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno)) + break; + } + int min_size = 3; + int max_size = 3; + int steps = 7; + crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size); + int step = 0; + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0); + // always choose osd.0 + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + // then pick any other random osds + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + ASSERT_TRUE(step == steps); + auto r = crush_add_rule(crush.get_crush_map(), rule, rno); + ASSERT_TRUE(r >= 0); + crush.set_rule_name(rno, rule_name); + { + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.crush.clear(); + crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT); + osdmap.apply_incremental(pending_inc); + } + + // create "pool1" + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pool_max = osdmap.get_pool_max(); + auto pool_id = ++pending_inc.new_pool_max; + pool_1_id = pool_id; + pg_pool_t empty; + auto p = pending_inc.get_new_pool(pool_id, &empty); + p->size = 3; + p->min_size = 1; + p->set_pg_num(3); + p->set_pgp_num(3); + p->type = 
pg_pool_t::TYPE_REPLICATED; + p->crush_rule = rno; + p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + pending_inc.new_pool_names[pool_id] = pool_1; + osdmap.apply_incremental(pending_inc); + ASSERT_TRUE(osdmap.have_pg_pool(pool_id)); + ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_1); + { + for (unsigned i = 0; i < 3; i++) { + // 1.x -> [1] + pg_t rawpg(i, pool_id); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + vector up; + int up_primary; + osdmap.pg_to_raw_up(pgid, &up, &up_primary); + ASSERT_TRUE(up.size() == 3); + ASSERT_TRUE(up[0] == 0); + + // insert a new pg_upmap + vector new_up; + // and remap 1.x to osd.1 only + // this way osd.0 is deemed to be *underfull* + // and osd.1 is deemed to be *overfull* + new_up.push_back(1); + { + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector( + new_up.begin(), new_up.end()); + osdmap.apply_incremental(pending_inc); + } + osdmap.pg_to_raw_up(pgid, &up, &up_primary); + ASSERT_TRUE(up.size() == 1); + ASSERT_TRUE(up[0] == 1); + } + } + } + + { + // build customized crush rule for "pool2" + string host_name = "host_for_pool_2"; + // build a customized host to capture osd.6~11 + for (int i = 6; i < (int)get_num_osds(); i++) { + stringstream osd_name; + vector move_to; + osd_name << "osd." 
<< i; + move_to.push_back("root=default"); + string host_loc = "host=" + host_name; + move_to.push_back(host_loc); + auto r = crush_move(osdmap, osd_name.str(), move_to); + ASSERT_EQ(0, r); + } + CrushWrapper crush; + get_crush(osdmap, crush); + auto host_id = crush.get_item_id(host_name); + ASSERT_TRUE(host_id < 0); + string rule_name = "rule_for_pool2"; + int rule_type = pg_pool_t::TYPE_REPLICATED; + ASSERT_TRUE(!crush.rule_exists(rule_name)); + int rno; + for (rno = 0; rno < crush.get_max_rules(); rno++) { + if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno)) + break; + } + int min_size = 3; + int max_size = 3; + int steps = 7; + crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size); + int step = 0; + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0); + // always choose osd.0 + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + // then pick any other random osds + crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0); + crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); + ASSERT_TRUE(step == steps); + auto r = crush_add_rule(crush.get_crush_map(), rule, rno); + ASSERT_TRUE(r >= 0); + crush.set_rule_name(rno, rule_name); + { + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.crush.clear(); + crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT); + osdmap.apply_incremental(pending_inc); + } + + // create "pool2" + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pool_max = osdmap.get_pool_max(); + auto pool_id = ++pending_inc.new_pool_max; + pg_pool_t empty; + auto p = pending_inc.get_new_pool(pool_id, &empty); + p->size = 3; + // include a single PG + p->set_pg_num(1); + p->set_pgp_num(1); + p->type = 
pg_pool_t::TYPE_REPLICATED; + p->crush_rule = rno; + p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + pending_inc.new_pool_names[pool_id] = pool_2; + osdmap.apply_incremental(pending_inc); + ASSERT_TRUE(osdmap.have_pg_pool(pool_id)); + ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_2); + pg_t rawpg(0, pool_id); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + EXPECT_TRUE(!osdmap.have_pg_upmaps(pgid)); + vector up; + int up_primary; + osdmap.pg_to_raw_up(pgid, &up, &up_primary); + ASSERT_TRUE(up.size() == 3); + ASSERT_TRUE(up[0] == 0); + + { + // build a pg_upmap_item that will + // remap pg out from *underfull* osd.0 + vector> new_pg_upmap_items; + new_pg_upmap_items.push_back(make_pair(0, 10)); // osd.0 -> osd.10 + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + pending_inc.new_pg_upmap_items[pgid] = + mempool::osdmap::vector>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + osdmap.apply_incremental(pending_inc); + ASSERT_TRUE(osdmap.have_pg_upmaps(pgid)); + vector up; + int up_primary; + osdmap.pg_to_raw_up(pgid, &up, &up_primary); + ASSERT_TRUE(up.size() == 3); + ASSERT_TRUE(up[0] == 10); + } + } + + // ready to go + { + // require perfect distribution! 
+ auto ret = g_ceph_context->_conf->set_val( + "osd_calc_pg_upmaps_max_stddev", "0"); + ASSERT_EQ(0, ret); + g_ceph_context->_conf->apply_changes(nullptr); + set only_pools; + ASSERT_TRUE(pool_1_id >= 0); + only_pools.insert(pool_1_id); + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + osdmap.calc_pg_upmaps(g_ceph_context, + 0, // so we can force optimizing + 100, + only_pools, + &pending_inc); + osdmap.apply_incremental(pending_inc); + } +} + TEST(PGTempMap, basic) { PGTempMap m; diff -Nru ceph-12.2.11/src/test/osd/TestPGLog.cc ceph-12.2.12/src/test/osd/TestPGLog.cc --- ceph-12.2.11/src/test/osd/TestPGLog.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/osd/TestPGLog.cc 2019-04-11 12:33:50.000000000 +0000 @@ -2834,6 +2834,7 @@ { SetUp(1, 2, 20); PGLog::IndexedLog log; + EXPECT_EQ(0u, log.dup_index.size()); // Sanity check log.head = mk_evt(24, 0); log.skip_can_rollback_to_to_head(); log.head = mk_evt(9, 0); @@ -2856,6 +2857,7 @@ EXPECT_EQ(6u, trimmed.size()); EXPECT_EQ(5u, log.dups.size()); EXPECT_EQ(0u, trimmed_dups.size()); + EXPECT_EQ(0u, log.dup_index.size()); // dup_index entry should be trimmed } diff -Nru ceph-12.2.11/src/test/rgw/rgw_multi/tests.py ceph-12.2.12/src/test/rgw/rgw_multi/tests.py --- ceph-12.2.11/src/test/rgw/rgw_multi/tests.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/rgw/rgw_multi/tests.py 2019-04-11 12:33:50.000000000 +0000 @@ -96,6 +96,15 @@ def mdlog_autotrim(zone): zone.cluster.admin(['mdlog', 'autotrim']) +def datalog_list(zone, period = None): + cmd = ['datalog', 'list'] + (datalog_json, _) = zone.cluster.admin(cmd, read_only=True) + datalog_json = datalog_json.decode('utf-8') + return json.loads(datalog_json) + +def datalog_autotrim(zone): + zone.cluster.admin(['datalog', 'autotrim']) + def bilog_list(zone, bucket, args = None): cmd = ['bilog', 'list', '--bucket', bucket] + (args or []) bilog, _ = zone.cluster.admin(cmd, read_only=True) @@ -280,7 +289,7 @@ def 
data_source_log_status(source_zone): source_cluster = source_zone.cluster cmd = ['datalog', 'status'] + source_zone.zone_args() - datalog_status_json, retcode = source_cluster.rgw_admin(cmd, read_only=True) + datalog_status_json, retcode = source_cluster.admin(cmd, read_only=True) datalog_status = json.loads(datalog_status_json.decode('utf-8')) markers = {i: s['marker'] for i, s in enumerate(datalog_status)} @@ -345,7 +354,7 @@ return True -def zone_data_checkpoint(target_zone, source_zone_conn): +def zone_data_checkpoint(target_zone, source_zone): if target_zone == source_zone: return @@ -367,6 +376,13 @@ assert False, 'failed data checkpoint for target_zone=%s source_zone=%s' % \ (target_zone.name, source_zone.name) +def zonegroup_data_checkpoint(zonegroup_conns): + for source_conn in zonegroup_conns.rw_zones: + for target_conn in zonegroup_conns.zones: + if source_conn.zone == target_conn.zone: + continue + log.debug('data checkpoint: source=%s target=%s', source_conn.zone.name, target_conn.zone.name) + zone_data_checkpoint(target_conn.zone, source_conn.zone) def zone_bucket_checkpoint(target_zone, source_zone, bucket_name): if target_zone == source_zone: @@ -688,6 +704,90 @@ for _, bucket in zone_bucket: zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name) +def test_delete_marker_full_sync(): + zonegroup = realm.master_zonegroup() + zonegroup_conns = ZonegroupConns(zonegroup) + buckets, zone_bucket = create_bucket_per_zone(zonegroup_conns) + + # enable versioning + for _, bucket in zone_bucket: + bucket.configure_versioning(True) + zonegroup_meta_checkpoint(zonegroup) + + for zone, bucket in zone_bucket: + # upload an initial object + key1 = new_key(zone, bucket, 'obj') + key1.set_contents_from_string('') + + # create a delete marker + key2 = new_key(zone, bucket, 'obj') + key2.delete() + + # wait for full sync + for _, bucket in zone_bucket: + zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name) + +def test_suspended_delete_marker_full_sync(): + 
zonegroup = realm.master_zonegroup() + zonegroup_conns = ZonegroupConns(zonegroup) + buckets, zone_bucket = create_bucket_per_zone(zonegroup_conns) + + # enable/suspend versioning + for _, bucket in zone_bucket: + bucket.configure_versioning(True) + bucket.configure_versioning(False) + zonegroup_meta_checkpoint(zonegroup) + + for zone, bucket in zone_bucket: + # upload an initial object + key1 = new_key(zone, bucket, 'obj') + key1.set_contents_from_string('') + + # create a delete marker + key2 = new_key(zone, bucket, 'obj') + key2.delete() + + # wait for full sync + for _, bucket in zone_bucket: + zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name) + +def test_version_suspended_incremental_sync(): + zonegroup = realm.master_zonegroup() + zonegroup_conns = ZonegroupConns(zonegroup) + + zone = zonegroup_conns.rw_zones[0] + + # create a non-versioned bucket + bucket = zone.create_bucket(gen_bucket_name()) + log.debug('created bucket=%s', bucket.name) + zonegroup_meta_checkpoint(zonegroup) + + # upload an initial object + key1 = new_key(zone, bucket, 'obj') + key1.set_contents_from_string('') + log.debug('created initial version id=%s', key1.version_id) + zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name) + + # enable versioning + bucket.configure_versioning(True) + zonegroup_meta_checkpoint(zonegroup) + + # re-upload the object as a new version + key2 = new_key(zone, bucket, 'obj') + key2.set_contents_from_string('') + log.debug('created new version id=%s', key2.version_id) + zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name) + + # suspend versioning + bucket.configure_versioning(False) + zonegroup_meta_checkpoint(zonegroup) + + # re-upload the object as a 'null' version + key3 = new_key(zone, bucket, 'obj') + key3.set_contents_from_string('') + log.debug('created null version id=%s', key3.version_id) + zonegroup_bucket_checkpoint(zonegroup_conns, bucket.name) + def test_bucket_versioning(): buckets, zone_bucket = create_bucket_per_zone_in_realm() 
@@ -822,6 +922,25 @@ mdlog = mdlog_list(zone, period) assert len(mdlog) == 0 +def test_datalog_autotrim(): + zonegroup = realm.master_zonegroup() + zonegroup_conns = ZonegroupConns(zonegroup) + buckets, zone_bucket = create_bucket_per_zone(zonegroup_conns) + + # upload an object to each zone to generate a datalog entry + for zone, bucket in zone_bucket: + k = new_key(zone, bucket.name, 'key') + k.set_contents_from_string('body') + + # wait for data sync to catch up + zonegroup_data_checkpoint(zonegroup_conns) + + # trim each datalog + for zone, _ in zone_bucket: + datalog_autotrim(zone.zone) + datalog = datalog_list(zone.zone) + assert len(datalog) == 0 + def test_zonegroup_remove(): zonegroup = realm.master_zonegroup() zonegroup_conns = ZonegroupConns(zonegroup) @@ -913,6 +1032,8 @@ for zone in zonegroup.zones: check_buckets_sync_status_obj_not_exist(zone, buckets) + zonegroup_data_checkpoint(zonegroup_conns) + def test_bucket_sync_enable_right_after_disable(): zonegroup = realm.master_zonegroup() zonegroup_conns = ZonegroupConns(zonegroup) @@ -943,6 +1064,8 @@ for bucket_name in buckets: zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name) + zonegroup_data_checkpoint(zonegroup_conns) + def test_bucket_sync_disable_enable(): zonegroup = realm.master_zonegroup() zonegroup_conns = ZonegroupConns(zonegroup) @@ -979,6 +1102,8 @@ for bucket_name in buckets: zonegroup_bucket_checkpoint(zonegroup_conns, bucket_name) + zonegroup_data_checkpoint(zonegroup_conns) + def test_multipart_object_sync(): zonegroup = realm.master_zonegroup() zonegroup_conns = ZonegroupConns(zonegroup) diff -Nru ceph-12.2.11/src/test/rgw/rgw_multi/zone_rados.py ceph-12.2.12/src/test/rgw/rgw_multi/zone_rados.py --- ceph-12.2.11/src/test/rgw/rgw_multi/zone_rados.py 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/rgw/rgw_multi/zone_rados.py 2019-04-11 12:33:50.000000000 +0000 @@ -1,4 +1,5 @@ import logging +from boto.s3.deletemarker import DeleteMarker try: from itertools import 
izip_longest as zip_longest @@ -16,6 +17,13 @@ assert k2 log.debug('comparing key name=%s', k1.name) eq(k1.name, k2.name) + eq(k1.version_id, k2.version_id) + eq(k1.is_latest, k2.is_latest) + eq(k1.last_modified, k2.last_modified) + if isinstance(k1, DeleteMarker): + assert isinstance(k2, DeleteMarker) + return + eq(k1.get_contents_as_string(), k2.get_contents_as_string()) eq(k1.metadata, k2.metadata) eq(k1.cache_control, k2.cache_control) @@ -24,16 +32,13 @@ eq(k1.content_disposition, k2.content_disposition) eq(k1.content_language, k2.content_language) eq(k1.etag, k2.etag) - eq(k1.last_modified, k2.last_modified) if check_extra: eq(k1.owner.id, k2.owner.id) eq(k1.owner.display_name, k2.owner.display_name) eq(k1.storage_class, k2.storage_class) eq(k1.size, k2.size) - eq(k1.version_id, k2.version_id) eq(k1.encrypted, k2.encrypted) - class RadosZone(Zone): def __init__(self, name, zonegroup = None, cluster = None, data = None, zone_id = None, gateways = None): super(RadosZone, self).__init__(name, zonegroup, cluster, data, zone_id, gateways) @@ -57,14 +62,17 @@ b1 = self.get_bucket(bucket_name) b2 = zone_conn.get_bucket(bucket_name) + b1_versions = b1.list_versions() log.debug('bucket1 objects:') - for o in b1.get_all_versions(): + for o in b1_versions: log.debug('o=%s', o.name) + + b2_versions = b2.list_versions() log.debug('bucket2 objects:') - for o in b2.get_all_versions(): + for o in b2_versions: log.debug('o=%s', o.name) - for k1, k2 in zip_longest(b1.get_all_versions(), b2.get_all_versions()): + for k1, k2 in zip_longest(b1_versions, b2_versions): if k1 is None: log.critical('key=%s is missing from zone=%s', k2.name, self.name) assert False @@ -74,11 +82,23 @@ check_object_eq(k1, k2) - # now get the keys through a HEAD operation, verify that the available data is the same - k1_head = b1.get_key(k1.name) - k2_head = b2.get_key(k2.name) - - check_object_eq(k1_head, k2_head, False) + if isinstance(k1, DeleteMarker): + # verify that HEAD sees a delete marker + 
assert b1.get_key(k1.name) is None + assert b2.get_key(k2.name) is None + else: + # now get the keys through a HEAD operation, verify that the available data is the same + k1_head = b1.get_key(k1.name, version_id=k1.version_id) + k2_head = b2.get_key(k2.name, version_id=k2.version_id) + check_object_eq(k1_head, k2_head, False) + + if k1.version_id: + # compare the olh to make sure they agree about the current version + k1_olh = b1.get_key(k1.name) + k2_olh = b2.get_key(k2.name) + # if there's a delete marker, HEAD will return None + if k1_olh or k2_olh: + check_object_eq(k1_olh, k2_olh, False) log.info('success, bucket identical: bucket=%s zones={%s, %s}', bucket_name, self.name, zone_conn.name) diff -Nru ceph-12.2.11/src/test/rgw/test_rgw_crypto.cc ceph-12.2.12/src/test/rgw/test_rgw_crypto.cc --- ceph-12.2.11/src/test/rgw/test_rgw_crypto.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/rgw/test_rgw_crypto.cc 2019-04-11 12:33:50.000000000 +0000 @@ -68,12 +68,14 @@ class BlockCryptNone: public BlockCrypt { + size_t block_size = 256; public: BlockCryptNone(){}; + BlockCryptNone(size_t sz) : block_size(sz) {} virtual ~BlockCryptNone(){}; size_t get_block_size() override { - return 256; + return block_size; } bool encrypt(bufferlist& input, off_t in_ofs, @@ -526,6 +528,172 @@ ASSERT_EQ(fixup_range(&decrypt,513,1024), range_t(512,1024+255)); } +using parts_len_t = std::vector; + +class TestRGWGetObj_BlockDecrypt : public RGWGetObj_BlockDecrypt { + using RGWGetObj_BlockDecrypt::RGWGetObj_BlockDecrypt; +public: + void set_parts_len(parts_len_t&& other) { + parts_len = std::move(other); + } +}; + +std::vector create_mp_parts(size_t obj_size, size_t mp_part_len){ + std::vector parts_len; + size_t part_size; + size_t ofs=0; + + while (ofs < obj_size){ + part_size = std::min(mp_part_len, (obj_size - ofs)); + ofs += part_size; + parts_len.push_back(part_size); + } + return parts_len; +} + +const size_t part_size = 5*1024*1024; +const size_t obj_size = 
30*1024*1024; + +TEST(TestRGWCrypto, check_RGWGetObj_BlockDecrypt_fixup_simple) +{ + + ut_get_sink get_sink; + auto nonecrypt = ceph::make_unique(4096); + TestRGWGetObj_BlockDecrypt decrypt(g_ceph_context, &get_sink, + std::move(nonecrypt)); + decrypt.set_parts_len(create_mp_parts(obj_size, part_size)); + ASSERT_EQ(fixup_range(&decrypt,0,0), range_t(0,4095)); + ASSERT_EQ(fixup_range(&decrypt,1,4096), range_t(0,8191)); + ASSERT_EQ(fixup_range(&decrypt,0,4095), range_t(0,4095)); + ASSERT_EQ(fixup_range(&decrypt,4095,4096), range_t(0,8191)); + + // ranges are end-end inclusive, we request bytes just spanning short of first + // part to exceeding the first part, part_size - 1 is aligned to a 4095 boundary + ASSERT_EQ(fixup_range(&decrypt, 0, part_size - 2), range_t(0, part_size -1)); + ASSERT_EQ(fixup_range(&decrypt, 0, part_size - 1), range_t(0, part_size -1)); + ASSERT_EQ(fixup_range(&decrypt, 0, part_size), range_t(0, part_size + 4095)); + ASSERT_EQ(fixup_range(&decrypt, 0, part_size + 1), range_t(0, part_size + 4095)); + + // request bytes spanning 2 parts + ASSERT_EQ(fixup_range(&decrypt, part_size -2, part_size + 2), + range_t(part_size - 4096, part_size + 4095)); + + // request last byte + ASSERT_EQ(fixup_range(&decrypt, obj_size - 1, obj_size -1), + range_t(obj_size - 4096, obj_size -1)); + +} + +TEST(TestRGWCrypto, check_RGWGetObj_BlockDecrypt_fixup_non_aligned_obj_size) +{ + + ut_get_sink get_sink; + auto nonecrypt = ceph::make_unique(4096); + TestRGWGetObj_BlockDecrypt decrypt(g_ceph_context, &get_sink, + std::move(nonecrypt)); + auto na_obj_size = obj_size + 1; + decrypt.set_parts_len(create_mp_parts(na_obj_size, part_size)); + + // these should be unaffected here + ASSERT_EQ(fixup_range(&decrypt, 0, part_size - 2), range_t(0, part_size -1)); + ASSERT_EQ(fixup_range(&decrypt, 0, part_size - 1), range_t(0, part_size -1)); + ASSERT_EQ(fixup_range(&decrypt, 0, part_size), range_t(0, part_size + 4095)); + ASSERT_EQ(fixup_range(&decrypt, 0, part_size + 1), 
range_t(0, part_size + 4095)); + + + // request last 2 bytes; spanning 2 parts + ASSERT_EQ(fixup_range(&decrypt, na_obj_size -2 , na_obj_size -1), + range_t(na_obj_size - 1 - 4096, na_obj_size - 1)); + + // request last byte, spans last 1B part only + ASSERT_EQ(fixup_range(&decrypt, na_obj_size -1, na_obj_size - 1), + range_t(na_obj_size - 1, na_obj_size -1)); + +} + +TEST(TestRGWCrypto, check_RGWGetObj_BlockDecrypt_fixup_non_aligned_part_size) +{ + + ut_get_sink get_sink; + auto nonecrypt = ceph::make_unique(4096); + TestRGWGetObj_BlockDecrypt decrypt(g_ceph_context, &get_sink, + std::move(nonecrypt)); + auto na_part_size = part_size + 1; + decrypt.set_parts_len(create_mp_parts(obj_size, na_part_size)); + + // na_part_size -2, ie. part_size -1 is aligned to 4095 boundary + ASSERT_EQ(fixup_range(&decrypt, 0, na_part_size - 2), range_t(0, na_part_size -2)); + // even though na_part_size -1 should not align to a 4095 boundary, the range + // should not span the next part + ASSERT_EQ(fixup_range(&decrypt, 0, na_part_size - 1), range_t(0, na_part_size -1)); + + ASSERT_EQ(fixup_range(&decrypt, 0, na_part_size), range_t(0, na_part_size + 4095)); + ASSERT_EQ(fixup_range(&decrypt, 0, na_part_size + 1), range_t(0, na_part_size + 4095)); + + // request spanning 2 parts + ASSERT_EQ(fixup_range(&decrypt, na_part_size - 2, na_part_size + 2), + range_t(na_part_size - 1 - 4096, na_part_size + 4095)); + + // request last byte, this will be interesting, since this a multipart upload + // with 5MB+1 size, the last part is actually 5 bytes short of 5 MB, which + // should be considered for the ranges alignment; an easier way to look at + // this will be that the last offset aligned to a 5MiB part will be 5MiB - + // 4095, this is a part that is 5MiB - 5 B + ASSERT_EQ(fixup_range(&decrypt, obj_size - 1, obj_size -1), + range_t(obj_size +5 -4096, obj_size -1)); + +} + +TEST(TestRGWCrypto, check_RGWGetObj_BlockDecrypt_fixup_non_aligned) +{ + + ut_get_sink get_sink; + auto nonecrypt = 
ceph::make_unique(4096); + TestRGWGetObj_BlockDecrypt decrypt(g_ceph_context, &get_sink, + std::move(nonecrypt)); + auto na_part_size = part_size + 1; + auto na_obj_size = obj_size + 7; // (6*(5MiB + 1) + 1) for the last 1B overflow + decrypt.set_parts_len(create_mp_parts(na_obj_size, na_part_size)); + + // na_part_size -2, ie. part_size -1 is aligned to 4095 boundary + ASSERT_EQ(fixup_range(&decrypt, 0, na_part_size - 2), range_t(0, na_part_size -2)); + // even though na_part_size -1 should not align to a 4095 boundary, the range + // should not span the next part + ASSERT_EQ(fixup_range(&decrypt, 0, na_part_size - 1), range_t(0, na_part_size -1)); + + ASSERT_EQ(fixup_range(&decrypt, 0, na_part_size), range_t(0, na_part_size + 4095)); + ASSERT_EQ(fixup_range(&decrypt, 0, na_part_size + 1), range_t(0, na_part_size + 4095)); + + // request last byte, spans last 1B part only + ASSERT_EQ(fixup_range(&decrypt, na_obj_size -1, na_obj_size - 1), + range_t(na_obj_size - 1, na_obj_size -1)); + + ASSERT_EQ(fixup_range(&decrypt, na_obj_size -2, na_obj_size -1), + range_t(na_obj_size - 2, na_obj_size -1)); + +} + +TEST(TestRGWCrypto, check_RGWGetObj_BlockDecrypt_fixup_invalid_ranges) +{ + + ut_get_sink get_sink; + auto nonecrypt = ceph::make_unique(4096); + TestRGWGetObj_BlockDecrypt decrypt(g_ceph_context, &get_sink, + std::move(nonecrypt)); + + decrypt.set_parts_len(create_mp_parts(obj_size, part_size)); + + // the ranges below would be mostly unreachable in current code as rgw + // would've returned a 411 before reaching, but we're just doing this to make + // sure we don't have invalid access + ASSERT_EQ(fixup_range(&decrypt, obj_size - 1, obj_size + 100), + range_t(obj_size - 4096, obj_size - 1)); + ASSERT_EQ(fixup_range(&decrypt, obj_size, obj_size + 1), + range_t(obj_size - 1, obj_size - 1)); + ASSERT_EQ(fixup_range(&decrypt, obj_size+1, obj_size + 100), + range_t(obj_size - 1, obj_size - 1)); + +} TEST(TestRGWCrypto, verify_RGWPutObj_BlockEncrypt_chunks) { diff -Nru 
ceph-12.2.11/src/test/smoke.sh ceph-12.2.12/src/test/smoke.sh --- ceph-12.2.11/src/test/smoke.sh 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/test/smoke.sh 2019-04-11 12:33:50.000000000 +0000 @@ -2,11 +2,13 @@ source $CEPH_ROOT/qa/standalone/ceph-helpers.sh +mon_port=$(get_unused_port) + function run() { local dir=$1 shift - export CEPH_MON="127.0.0.1:7224" # git grep '\<7224\>' : there must be only one + export CEPH_MON="127.0.0.1:$mon_port" export CEPH_ARGS CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " CEPH_ARGS+="--mon-host=$CEPH_MON " @@ -34,10 +36,9 @@ function TEST_multimon() { local dir=$1 - - MONA="127.0.0.1:7224" # git grep '\<7224\>' : there must be only one - MONB="127.0.0.1:7225" # git grep '\<7225\>' : there must be only one - MONC="127.0.0.1:7226" # git grep '\<7226\>' : there must be only one + MONA="127.0.0.1:$((mon_port++))" + MONB="127.0.0.1:$((mon_port++))" + MONC="127.0.0.1:$((mon_port++))" run_mon $dir a --public-addr $MONA run_mon $dir b --public-addr $MONB diff -Nru ceph-12.2.11/src/test/ubuntu-12.04/debian/changelog ceph-12.2.12/src/test/ubuntu-12.04/debian/changelog --- ceph-12.2.11/src/test/ubuntu-12.04/debian/changelog 2019-04-26 12:17:04.000000000 +0000 +++ ceph-12.2.12/src/test/ubuntu-12.04/debian/changelog 2019-06-12 10:05:49.000000000 +0000 @@ -1,3 +1,11 @@ +ceph (12.2.12-0ubuntu0.18.04.1) bionic; urgency=medium + + * d/copyright: Exclude cruft from upstream tarballs. + * New upstream point release (LP: #1829716). + * d/p/s390x-link.patch: Drop, included upstream. 
+ + -- James Page Wed, 12 Jun 2019 11:05:49 +0100 + ceph (12.2.11-0ubuntu0.18.04.2) bionic; urgency=medium * d/control: Use openssl1.0 at build and runtime as diff -Nru ceph-12.2.11/src/test/ubuntu-12.04/debian/copyright ceph-12.2.12/src/test/ubuntu-12.04/debian/copyright --- ceph-12.2.11/src/test/ubuntu-12.04/debian/copyright 2019-02-11 11:06:34.000000000 +0000 +++ ceph-12.2.12/src/test/ubuntu-12.04/debian/copyright 2019-06-12 10:04:50.000000000 +0000 @@ -2,7 +2,14 @@ Upstream-Name: ceph Upstream-Contact: Sage Weil Source: http://ceph.com/ -Files-Excluded: debian +Files-Excluded: + debian + src/civetweb/examples/websocket_client/ssl/server.key.orig + src/civetweb/resources/cert/client.key.orig + src/civetweb/resources/cert/server.key.orig + src/erasure-code/jerasure/jerasure/Examples/makefile.orig + src/erasure-code/jerasure/jerasure/include/config.h.in~ + src/erasure-code/jerasure/jerasure/makefile.orig Files: * Copyright: 2004-2014 Sage Weil diff -Nru ceph-12.2.11/src/test/ubuntu-12.04/debian/patches/s390x-link.patch ceph-12.2.12/src/test/ubuntu-12.04/debian/patches/s390x-link.patch --- ceph-12.2.11/src/test/ubuntu-12.04/debian/patches/s390x-link.patch 2019-02-13 17:11:15.000000000 +0000 +++ ceph-12.2.12/src/test/ubuntu-12.04/debian/patches/s390x-link.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ -Descrption: Fix linking issues on s390x -Origin: https://github.com/ceph/ceph/pull/21380 - ---- a/src/rgw/CMakeLists.txt -+++ b/src/rgw/CMakeLists.txt -@@ -177,9 +177,7 @@ endif (WITH_RADOSGW_BEAST_FRONTEND) - - add_library(radosgw_a STATIC ${radosgw_srcs} - $) --if (WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL) -- target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES}) --endif() -+target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES}) - - add_executable(radosgw rgw_main.cc) - target_link_libraries(radosgw radosgw_a librados diff -Nru ceph-12.2.11/src/test/ubuntu-12.04/debian/patches/series 
ceph-12.2.12/src/test/ubuntu-12.04/debian/patches/series --- ceph-12.2.11/src/test/ubuntu-12.04/debian/patches/series 2019-02-13 17:11:15.000000000 +0000 +++ ceph-12.2.12/src/test/ubuntu-12.04/debian/patches/series 2019-06-12 10:04:50.000000000 +0000 @@ -7,4 +7,3 @@ # Ubuntu: FTBFS on armhf armhf-ftbfs.patch misc-32-bit-fixes.patch -s390x-link.patch diff -Nru ceph-12.2.11/src/test/ubuntu-14.04/debian/changelog ceph-12.2.12/src/test/ubuntu-14.04/debian/changelog --- ceph-12.2.11/src/test/ubuntu-14.04/debian/changelog 2019-04-26 12:17:04.000000000 +0000 +++ ceph-12.2.12/src/test/ubuntu-14.04/debian/changelog 2019-06-12 10:05:49.000000000 +0000 @@ -1,3 +1,11 @@ +ceph (12.2.12-0ubuntu0.18.04.1) bionic; urgency=medium + + * d/copyright: Exclude cruft from upstream tarballs. + * New upstream point release (LP: #1829716). + * d/p/s390x-link.patch: Drop, included upstream. + + -- James Page Wed, 12 Jun 2019 11:05:49 +0100 + ceph (12.2.11-0ubuntu0.18.04.2) bionic; urgency=medium * d/control: Use openssl1.0 at build and runtime as diff -Nru ceph-12.2.11/src/test/ubuntu-14.04/debian/copyright ceph-12.2.12/src/test/ubuntu-14.04/debian/copyright --- ceph-12.2.11/src/test/ubuntu-14.04/debian/copyright 2019-02-11 11:06:34.000000000 +0000 +++ ceph-12.2.12/src/test/ubuntu-14.04/debian/copyright 2019-06-12 10:04:50.000000000 +0000 @@ -2,7 +2,14 @@ Upstream-Name: ceph Upstream-Contact: Sage Weil Source: http://ceph.com/ -Files-Excluded: debian +Files-Excluded: + debian + src/civetweb/examples/websocket_client/ssl/server.key.orig + src/civetweb/resources/cert/client.key.orig + src/civetweb/resources/cert/server.key.orig + src/erasure-code/jerasure/jerasure/Examples/makefile.orig + src/erasure-code/jerasure/jerasure/include/config.h.in~ + src/erasure-code/jerasure/jerasure/makefile.orig Files: * Copyright: 2004-2014 Sage Weil diff -Nru ceph-12.2.11/src/test/ubuntu-14.04/debian/patches/s390x-link.patch ceph-12.2.12/src/test/ubuntu-14.04/debian/patches/s390x-link.patch --- 
ceph-12.2.11/src/test/ubuntu-14.04/debian/patches/s390x-link.patch 2019-02-13 17:11:15.000000000 +0000 +++ ceph-12.2.12/src/test/ubuntu-14.04/debian/patches/s390x-link.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ -Descrption: Fix linking issues on s390x -Origin: https://github.com/ceph/ceph/pull/21380 - ---- a/src/rgw/CMakeLists.txt -+++ b/src/rgw/CMakeLists.txt -@@ -177,9 +177,7 @@ endif (WITH_RADOSGW_BEAST_FRONTEND) - - add_library(radosgw_a STATIC ${radosgw_srcs} - $) --if (WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL) -- target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES}) --endif() -+target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES}) - - add_executable(radosgw rgw_main.cc) - target_link_libraries(radosgw radosgw_a librados diff -Nru ceph-12.2.11/src/test/ubuntu-14.04/debian/patches/series ceph-12.2.12/src/test/ubuntu-14.04/debian/patches/series --- ceph-12.2.11/src/test/ubuntu-14.04/debian/patches/series 2019-02-13 17:11:15.000000000 +0000 +++ ceph-12.2.12/src/test/ubuntu-14.04/debian/patches/series 2019-06-12 10:04:50.000000000 +0000 @@ -7,4 +7,3 @@ # Ubuntu: FTBFS on armhf armhf-ftbfs.patch misc-32-bit-fixes.patch -s390x-link.patch diff -Nru ceph-12.2.11/src/test/ubuntu-16.04/debian/changelog ceph-12.2.12/src/test/ubuntu-16.04/debian/changelog --- ceph-12.2.11/src/test/ubuntu-16.04/debian/changelog 2019-04-26 12:17:04.000000000 +0000 +++ ceph-12.2.12/src/test/ubuntu-16.04/debian/changelog 2019-06-12 10:05:49.000000000 +0000 @@ -1,3 +1,11 @@ +ceph (12.2.12-0ubuntu0.18.04.1) bionic; urgency=medium + + * d/copyright: Exclude cruft from upstream tarballs. + * New upstream point release (LP: #1829716). + * d/p/s390x-link.patch: Drop, included upstream. 
+ + -- James Page Wed, 12 Jun 2019 11:05:49 +0100 + ceph (12.2.11-0ubuntu0.18.04.2) bionic; urgency=medium * d/control: Use openssl1.0 at build and runtime as diff -Nru ceph-12.2.11/src/test/ubuntu-16.04/debian/copyright ceph-12.2.12/src/test/ubuntu-16.04/debian/copyright --- ceph-12.2.11/src/test/ubuntu-16.04/debian/copyright 2019-02-11 11:06:34.000000000 +0000 +++ ceph-12.2.12/src/test/ubuntu-16.04/debian/copyright 2019-06-12 10:04:50.000000000 +0000 @@ -2,7 +2,14 @@ Upstream-Name: ceph Upstream-Contact: Sage Weil Source: http://ceph.com/ -Files-Excluded: debian +Files-Excluded: + debian + src/civetweb/examples/websocket_client/ssl/server.key.orig + src/civetweb/resources/cert/client.key.orig + src/civetweb/resources/cert/server.key.orig + src/erasure-code/jerasure/jerasure/Examples/makefile.orig + src/erasure-code/jerasure/jerasure/include/config.h.in~ + src/erasure-code/jerasure/jerasure/makefile.orig Files: * Copyright: 2004-2014 Sage Weil diff -Nru ceph-12.2.11/src/test/ubuntu-16.04/debian/patches/s390x-link.patch ceph-12.2.12/src/test/ubuntu-16.04/debian/patches/s390x-link.patch --- ceph-12.2.11/src/test/ubuntu-16.04/debian/patches/s390x-link.patch 2019-02-13 17:11:15.000000000 +0000 +++ ceph-12.2.12/src/test/ubuntu-16.04/debian/patches/s390x-link.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ -Descrption: Fix linking issues on s390x -Origin: https://github.com/ceph/ceph/pull/21380 - ---- a/src/rgw/CMakeLists.txt -+++ b/src/rgw/CMakeLists.txt -@@ -177,9 +177,7 @@ endif (WITH_RADOSGW_BEAST_FRONTEND) - - add_library(radosgw_a STATIC ${radosgw_srcs} - $) --if (WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL) -- target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES}) --endif() -+target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES}) - - add_executable(radosgw rgw_main.cc) - target_link_libraries(radosgw radosgw_a librados diff -Nru ceph-12.2.11/src/test/ubuntu-16.04/debian/patches/series 
ceph-12.2.12/src/test/ubuntu-16.04/debian/patches/series --- ceph-12.2.11/src/test/ubuntu-16.04/debian/patches/series 2019-02-13 17:11:15.000000000 +0000 +++ ceph-12.2.12/src/test/ubuntu-16.04/debian/patches/series 2019-06-12 10:04:50.000000000 +0000 @@ -7,4 +7,3 @@ # Ubuntu: FTBFS on armhf armhf-ftbfs.patch misc-32-bit-fixes.patch -s390x-link.patch diff -Nru ceph-12.2.11/src/tools/ceph_monstore_tool.cc ceph-12.2.12/src/tools/ceph_monstore_tool.cc --- ceph-12.2.11/src/tools/ceph_monstore_tool.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/tools/ceph_monstore_tool.cc 2019-04-11 12:33:50.000000000 +0000 @@ -33,6 +33,7 @@ #include "mon/MgrMap.h" #include "osd/OSDMap.h" #include "crush/CrushCompiler.h" +#include "mon/CreatingPGs.h" namespace po = boost::program_options; using namespace std; @@ -175,7 +176,6 @@ * replay-trace * random-gen * rewrite-crush - * inflate-pgmap * * wanted syntax: * @@ -222,8 +222,6 @@ << " (random-gen -- --help for more info)\n" << " rewrite-crush [-- options] add a rewrite commit to the store\n" << " (rewrite-crush -- --help for more info)\n" - << " inflate-pgmap [-- options] add given number of pgmaps to store\n" - << " (inflate-pgmap -- --help for more info)\n" << " rebuild rebuild store\n" << " (rebuild -- --help for more info)\n" << std::endl; @@ -454,70 +452,6 @@ return 0; } -int inflate_pgmap(MonitorDBStore& st, unsigned n, bool can_be_trimmed) { - // put latest pg map into monstore to bloat it up - // only format version == 1 is supported - version_t last = st.get("pgmap", "last_committed"); - bufferlist bl; - - // get the latest delta - int r = st.get("pgmap", last, bl); - if (r) { - std::cerr << "Error getting pgmap: " << cpp_strerror(r) << std::endl; - return r; - } - - // try to pull together an idempotent "delta" - ceph::unordered_map pg_stat; - for (KeyValueDB::Iterator i = st.get_iterator("pgmap_pg"); - i->valid(); i->next()) { - pg_t pgid; - if (!pgid.parse(i->key().c_str())) { - std::cerr << "unable to parse key 
" << i->key() << std::endl; - continue; - } - bufferlist pg_bl = i->value(); - pg_stat_t ps; - bufferlist::iterator p = pg_bl.begin(); - ::decode(ps, p); - // will update the last_epoch_clean of all the pgs. - pg_stat[pgid] = ps; - } - - version_t first = st.get("pgmap", "first_committed"); - version_t ver = last; - auto txn(std::make_shared()); - for (unsigned i = 0; i < n; i++) { - bufferlist trans_bl; - bufferlist dirty_pgs; - for (ceph::unordered_map::iterator ps = pg_stat.begin(); - ps != pg_stat.end(); ++ps) { - ::encode(ps->first, dirty_pgs); - if (!can_be_trimmed) { - ps->second.last_epoch_clean = first; - } - ::encode(ps->second, dirty_pgs); - } - utime_t inc_stamp = ceph_clock_now(); - ::encode(inc_stamp, trans_bl); - ::encode_destructively(dirty_pgs, trans_bl); - bufferlist dirty_osds; - ::encode(dirty_osds, trans_bl); - txn->put("pgmap", ++ver, trans_bl); - // update the db in batch - if (txn->size() > 1024) { - st.apply_transaction(txn); - // reset the transaction - txn.reset(new MonitorDBStore::Transaction); - } - } - txn->put("pgmap", "last_committed", ver); - txn->put("pgmap_meta", "version", ver); - // this will also piggy back the leftover pgmap added in the loop above - st.apply_transaction(txn); - return 0; -} - static int update_auth(MonitorDBStore& st, const string& keyring_path) { // import all keyrings stored in the keyring file @@ -593,6 +527,35 @@ } // rebuild +// - creating_pgs +static int update_creating_pgs(MonitorDBStore& st) +{ + bufferlist bl; + auto last_osdmap_epoch = st.get("osdmap", "last_committed"); + int r = st.get("osdmap", st.combine_strings("full", last_osdmap_epoch), bl); + if (r < 0) { + cerr << "unable to losd osdmap e" << last_osdmap_epoch << std::endl; + return r; + } + + OSDMap osdmap; + osdmap.decode(bl); + creating_pgs_t creating; + for (auto& i : osdmap.get_pools()) { + creating.created_pools.insert(i.first); + } + creating.last_scan_epoch = last_osdmap_epoch; + + bufferlist newbl; + ::encode(creating, newbl); + + 
auto t = make_shared(); + t->put("osd_pg_creating", "creating", newbl); + st.apply_transaction(t); + return 0; +} + +// rebuild // - mgr // - mgr_command_desc static int update_mgrmap(MonitorDBStore& st) @@ -635,8 +598,7 @@ { MonitorDBStore::Transaction t; vector prefixes = {"auth", "osdmap", - "mgr", "mgr_command_desc", - "pgmap", "pgmap_pg", "pgmap_meta"}; + "mgr", "mgr_command_desc"}; for (const auto& prefix : prefixes) { for (auto i = st.get_iterator(prefix); i->valid(); i->next()) { auto key = i->raw_key(); @@ -658,60 +620,6 @@ return 0; } -// rebuild -// - pgmap_meta/version -// - pgmap_meta/last_osdmap_epoch -// - pgmap_meta/last_pg_scan -// - pgmap_meta/full_ratio -// - pgmap_meta/nearfull_ratio -// - pgmap_meta/stamp -static int update_pgmap_meta(MonitorDBStore& st) -{ - const string prefix("pgmap_meta"); - auto t = make_shared(); - // stolen from PGMonitor::create_pending() - // the first pgmap_meta - t->put(prefix, "version", 1); - { - auto stamp = ceph_clock_now(); - bufferlist bl; - ::encode(stamp, bl); - t->put(prefix, "stamp", bl); - } - { - auto last_osdmap_epoch = st.get("osdmap", "last_committed"); - t->put(prefix, "last_osdmap_epoch", last_osdmap_epoch); - } - // be conservative, so PGMonitor will scan the all pools for pg changes - t->put(prefix, "last_pg_scan", 1); - { - auto full_ratio = g_ceph_context->_conf->mon_osd_full_ratio; - if (full_ratio > 1.0) - full_ratio /= 100.0; - bufferlist bl; - ::encode(full_ratio, bl); - t->put(prefix, "full_ratio", bl); - } - { - auto backfillfull_ratio = g_ceph_context->_conf->mon_osd_backfillfull_ratio; - if (backfillfull_ratio > 1.0) - backfillfull_ratio /= 100.0; - bufferlist bl; - ::encode(backfillfull_ratio, bl); - t->put(prefix, "backfillfull_ratio", bl); - } - { - auto nearfull_ratio = g_ceph_context->_conf->mon_osd_nearfull_ratio; - if (nearfull_ratio > 1.0) - nearfull_ratio /= 100.0; - bufferlist bl; - ::encode(nearfull_ratio, bl); - t->put(prefix, "nearfull_ratio", bl); - } - 
st.apply_transaction(t); - return 0; -} - int rebuild_monstore(const char* progname, vector& subcmds, MonitorDBStore& st) @@ -732,7 +640,7 @@ } if (!keyring_path.empty()) update_auth(st, keyring_path); - if ((r = update_pgmap_meta(st))) { + if ((r = update_creating_pgs(st))) { return r; } if ((r = update_mgrmap(st))) { @@ -1328,29 +1236,6 @@ << std::endl; } else if (cmd == "rewrite-crush") { err = rewrite_crush(argv[0], subcmds, st); - } else if (cmd == "inflate-pgmap") { - unsigned n = 2000; - bool can_be_trimmed = false; - po::options_description op_desc("Allowed 'inflate-pgmap' options"); - op_desc.add_options() - ("num-maps,n", po::value(&n), - "number of maps to add (default: 2000)") - ("can-be-trimmed", po::value(&can_be_trimmed), - "can be trimmed (default: false)") - ; - - po::variables_map op_vm; - try { - po::parsed_options op_parsed = po::command_line_parser(subcmds). - options(op_desc).run(); - po::store(op_parsed, op_vm); - po::notify(op_vm); - } catch (po::error &e) { - std::cerr << "error: " << e.what() << std::endl; - err = EINVAL; - goto done; - } - err = inflate_pgmap(st, n, can_be_trimmed); } else if (cmd == "rebuild") { err = rebuild_monstore(argv[0], subcmds, st); } else { diff -Nru ceph-12.2.11/src/tools/ceph_objectstore_tool.cc ceph-12.2.12/src/tools/ceph_objectstore_tool.cc --- ceph-12.2.11/src/tools/ceph_objectstore_tool.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/tools/ceph_objectstore_tool.cc 2019-04-11 12:33:50.000000000 +0000 @@ -35,6 +35,7 @@ #include "osd/PGLog.h" #include "osd/OSD.h" #include "osd/PG.h" +#include "osd/ECUtil.h" #include "json_spirit/json_spirit_value.h" #include "json_spirit/json_spirit_reader.h" @@ -2399,6 +2400,22 @@ formatter->close_section(); } } + bufferlist hattr; + gr = store->getattr(coll, ghobj, ECUtil::get_hinfo_key(), hattr); + if (gr == 0) { + ECUtil::HashInfo hinfo; + auto hp = hattr.begin(); + try { + decode(hinfo, hp); + formatter->open_object_section("hinfo"); + hinfo.dump(formatter); 
+ formatter->close_section(); + } catch (...) { + r = -EINVAL; + cerr << "Error decoding hinfo on : " << make_pair(coll, ghobj) << ", " + << cpp_strerror(r) << std::endl; + } + } formatter->close_section(); formatter->flush(cout); cout << std::endl; diff -Nru ceph-12.2.11/src/tools/rbd/ArgumentTypes.cc ceph-12.2.12/src/tools/rbd/ArgumentTypes.cc --- ceph-12.2.11/src/tools/rbd/ArgumentTypes.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/tools/rbd/ArgumentTypes.cc 2019-04-11 12:33:50.000000000 +0000 @@ -292,7 +292,7 @@ (JOURNAL_SPLAY_WIDTH.c_str(), po::value(), "number of active journal objects") (JOURNAL_OBJECT_SIZE.c_str(), po::value(), - "size of journal objects") + "size of journal objects [4K <= size <= 64M]") (JOURNAL_POOL.c_str(), po::value(), "pool for journal objects"); } @@ -506,7 +506,7 @@ std::string parse_error; uint64_t size = strict_iecstrtoll(s.c_str(), &parse_error); - if (parse_error.empty() && (size >= (1 << 12))) { + if (parse_error.empty() && (size >= (1 << 12)) && (size <= (1 << 26))) { v = boost::any(size); return; } diff -Nru ceph-12.2.11/src/tools/rbd_mirror/ImageReplayer.cc ceph-12.2.12/src/tools/rbd_mirror/ImageReplayer.cc --- ceph-12.2.11/src/tools/rbd_mirror/ImageReplayer.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/tools/rbd_mirror/ImageReplayer.cc 2019-04-11 12:33:50.000000000 +0000 @@ -806,7 +806,7 @@ } set_state_description(r, desc); - update_mirror_image_status(false, boost::none); + update_mirror_image_status(true, boost::none); reschedule_update_status_task(-1); shut_down(0); } @@ -1411,7 +1411,7 @@ case STATE_STOPPING: if (stopping_replay) { status.state = cls::rbd::MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY; - status.description = "stopping replay"; + status.description = state_desc.empty() ? 
"stopping replay" : state_desc; break; } // FALLTHROUGH diff -Nru ceph-12.2.11/src/tools/rebuild_mondb.cc ceph-12.2.12/src/tools/rebuild_mondb.cc --- ceph-12.2.11/src/tools/rebuild_mondb.cc 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/tools/rebuild_mondb.cc 2019-04-11 12:33:50.000000000 +0000 @@ -12,7 +12,6 @@ static int update_osdmap(ObjectStore& fs, OSDSuperblock& sb, MonitorDBStore& ms); -static int update_pgmap_pg(ObjectStore& fs, MonitorDBStore& ms); int update_mon_db(ObjectStore& fs, OSDSuperblock& sb, const string& keyring, @@ -30,9 +29,6 @@ if ((r = update_osdmap(fs, sb, ms)) < 0) { goto out; } - if ((r = update_pgmap_pg(fs, ms)) < 0) { - goto out; - } if ((r = update_monitor(sb, ms)) < 0) { goto out; } @@ -339,59 +335,3 @@ << nadded << " osdmaps added." << std::endl; return 0; } - -// rebuild -// - pgmap_pg/${pgid} -int update_pgmap_pg(ObjectStore& fs, MonitorDBStore& ms) -{ - // pgmap/${epoch} is the incremental of: stamp, pgmap_pg, pgmap_osd - // if PGMonitor fails to read it, it will fall back to the pgmap_pg, i.e. - // the fullmap. - vector collections; - int r = fs.list_collections(collections); - if (r < 0) { - cerr << "failed to list pgs: " << cpp_strerror(r) << std::endl; - return r; - } - const string prefix("pgmap_pg"); - // in general, there are less than 100 PGs per OSD, so no need to apply - // transaction in batch. 
- auto t = make_shared(); - unsigned npg = 0; - for (const auto& coll : collections) { - spg_t pgid; - if (!coll.is_pg(&pgid)) - continue; - bufferlist bl; - pg_info_t info(pgid); - PastIntervals past_intervals; - __u8 struct_v; - r = PG::read_info(&fs, pgid, coll, bl, info, past_intervals, struct_v); - if (r < 0) { - cerr << "failed to read_info: " << cpp_strerror(r) << std::endl; - return r; - } - if (struct_v < PG::cur_struct_v) { - cerr << "incompatible pg_info: v" << struct_v << std::endl; - return -EINVAL; - } - version_t latest_epoch = 0; - r = ms.get(prefix, stringify(pgid.pgid), bl); - if (r >= 0) { - pg_stat_t pg_stat; - auto bp = bl.begin(); - ::decode(pg_stat, bp); - latest_epoch = pg_stat.reported_epoch; - } - if (info.stats.reported_epoch > latest_epoch) { - bufferlist bl; - ::encode(info.stats, bl); - t->put(prefix, stringify(pgid.pgid), bl); - npg++; - } - } - ms.apply_transaction(t); - cout << std::left << setw(10) - << " " << npg << " pgs added." << std::endl; - return 0; -} diff -Nru ceph-12.2.11/src/valgrind.supp ceph-12.2.12/src/valgrind.supp --- ceph-12.2.11/src/valgrind.supp 2019-01-30 15:51:26.000000000 +0000 +++ ceph-12.2.12/src/valgrind.supp 1970-01-01 00:00:00.000000000 +0000 @@ -1,263 +0,0 @@ -# some valgrind suppressions -# to load these automagically, -# cat > ~/.valgrindrc -# --suppressions=valgrind.supp -# - - -# this one makes valgrind shut up about what appears to be a bug in libc's writev. 
-{ - writev uninit bytes thing -sage - Memcheck:Param - writev(vector[...]) - fun:writev - fun:_ZN11BlockDevice6_writeEijjRN6buffer4listE - fun:_ZN11BlockDevice5do_ioEiRSt4listIPNS_6biovecESaIS2_EE - fun:_ZN11BlockDevice15io_thread_entryEv - fun:_ZN11BlockDevice8IOThread5entryEv - fun:_ZN6Thread11_entry_funcEPv - fun:start_thread - fun:clone - obj:* - obj:* - obj:* - obj:* -} - -# gethostbyname -{ - gethostbyname on issdm - Memcheck:Param - socketcall.sendto(msg) - fun:send - fun:get_mapping - fun:__nscd_get_map_ref - fun:nscd_gethst_r - fun:__nscd_gethostbyname_r - fun:gethostbyname_r@@GLIBC_2.2.5 - fun:gethostbyname - fun:_ZN4Rank8Accepter5startEv - fun:_ZN4Rank10start_rankEv - fun:main -} -{ - - Memcheck:Param - socketcall.sendto(msg) - fun:send - fun:get_mapping - fun:__nscd_get_map_ref - fun:nscd_gethst_r - fun:__nscd_gethostbyname_r - fun:gethostbyname_r@@GLIBC_2.2.5 - fun:gethostbyname - fun:_ZN4Rank8Accepter5startEv - fun:_ZN4Rank10start_rankEv - fun:_Z17mpi_bootstrap_newRiRPPcP6MonMap - fun:main -} - -# gethostbyname - -{ - gethostbyname on foil - Memcheck:Addr8 - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/libc-2.6.1.so - obj:/lib/ld-2.6.1.so - fun:__libc_dlopen_mode - fun:__nss_lookup_function - obj:/lib/libc-2.6.1.so -} - -# mpi on issdm -{ - - Memcheck:Overlap - fun:memcpy - fun:MPIR_Localcopy - fun:MPIR_Gather - fun:MPI_Gather - fun:_Z17mpi_bootstrap_newRiRPPcP6MonMap - fun:main -} -{ - - Memcheck:Param - writev(vector[...]) - fun:writev - fun:MPIDU_Sock_writev - fun:MPIDI_CH3_iStartMsgv - fun:MPIDI_CH3_EagerContigSend - fun:MPID_Send - fun:MPIC_Send - fun:MPIR_Bcast - fun:MPI_Bcast - fun:_Z17mpi_bootstrap_newRiRPPcP6MonMap - fun:main -} -{ - - Memcheck:Param - write(buf) - obj:/lib64/tls/libpthread-2.3.4.so - fun:MPIDU_Sock_write - fun:MPIDI_CH3_iSend - fun:MPID_Isend - fun:MPIC_Sendrecv - fun:MPIR_Barrier - fun:MPI_Barrier - 
fun:_Z17mpi_bootstrap_newRiRPPcP6MonMap - fun:main -} -{ - - Memcheck:Param - write(buf) - obj:/lib64/tls/libpthread-2.3.4.so - fun:MPIDU_Sock_write - fun:MPIDI_CH3_iStartMsg - fun:MPIDI_CH3U_VC_SendClose - fun:MPIDI_PG_Close_VCs - fun:MPID_Finalize - fun:MPI5: - fun:main -} -{ - - Memcheck:Param - write(buf) - obj:/lib64/tls/libpthread-2.3.4.so - fun:MPIDU_Sock_write - fun:MPIDI_CH3_iSend - fun:MPID_Isend - fun:MPIC_Sendrecv - fun:MPIR_Barrier - fun:MPI_Barrier - fun:main -} -{ - - Memcheck:Param - write(buf) - obj:/lib64/tls/libpthread-2.3.4.so - fun:MPIDU_Sock_write - fun:MPIDI_CH3_iStartMsg - fun:MPIDI_CH3U_VC_SendClose - fun:MPIDI_PG_Close_VCs - fun:MPID_Finalize - fun:MPI_Finalize - fun:main -} - { - - Memcheck:Param - writev(vector[...]) - fun:writev - fun:MPIDU_Socki_handle_write - fun:MPIDU_Sock_wait - fun:MPIDI_CH3_Progress_wait - fun:MPIC_Wait - fun:MPIC_Send - fun:MPIR_Gather - fun:MPI_Gather - fun:_Z17mpi_bootstrap_newRiRPPcP6MonMap - fun:main -} - -# lttng-ust -{ - - Memcheck:Leak - fun:calloc - fun:_dl_allocate_tls - fun:pthread_create@@GLIBC_2.2.5 - obj:/usr/*lib*/liblttng-ust.* - fun:call_init.part.0 - fun:_dl_init - obj:* -} - -# PK11_CreateContextBySymKey -{ - - Helgrind:Race - obj:/usr/*lib*/libfreebl*3.so - ... - obj:/usr/*lib*/libsoftokn3.so - ... - obj:/usr/*lib*/libnss3.so - ... - fun:PK11_CreateContextBySymKey - ... -} - -# _dl_allocate_tls_init -{ - - Helgrind:Race - fun:mempcpy - fun:_dl_allocate_tls_init - ... - fun:pthread_create@* - ... -} - -# rados cython constants -{ - - Memcheck:Leak - match-leak-kinds: definite - fun:malloc - fun:PyObject_Malloc - fun:PyCode_New - fun:__Pyx_InitCachedConstants - fun:initrados - fun:_PyImport_LoadDynamicModule - ... - fun:PyImport_ImportModuleLevel - ... 
- fun:PyObject_Call - fun:PyEval_CallObjectWithKeywords - fun:PyEval_EvalFrameEx -} - -# rbd cython constants -{ - - Memcheck:Leak - match-leak-kinds: definite - fun:malloc - fun:PyObject_Malloc - fun:PyCode_New - fun:__Pyx_InitCachedConstants - fun:initrbd - fun:_PyImport_LoadDynamicModule - ... - fun:PyImport_ImportModuleLevel - ... - fun:PyObject_Call - fun:PyEval_CallObjectWithKeywords - fun:PyEval_EvalFrameEx -}