diff -Nru rocksdb-5.15.10/buckifier/buckify_rocksdb.py rocksdb-5.17.2/buckifier/buckify_rocksdb.py --- rocksdb-5.15.10/buckifier/buckify_rocksdb.py 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/buckifier/buckify_rocksdb.py 2018-11-12 19:57:32.000000000 +0000 @@ -109,12 +109,14 @@ "rocksdb_test_lib", src_mk.get("MOCK_LIB_SOURCES", []) + src_mk.get("TEST_LIB_SOURCES", []) + - src_mk.get("EXP_LIB_SOURCES", []), + src_mk.get("EXP_LIB_SOURCES", []) + + src_mk.get("ANALYZER_LIB_SOURCES", []), [":rocksdb_lib"]) # rocksdb_tools_lib TARGETS.add_library( "rocksdb_tools_lib", src_mk.get("BENCH_LIB_SOURCES", []) + + src_mk.get("ANALYZER_LIB_SOURCES", []) + ["util/testutil.cc"], [":rocksdb_lib"]) diff -Nru rocksdb-5.15.10/buckifier/targets_builder.py rocksdb-5.17.2/buckifier/targets_builder.py --- rocksdb-5.15.10/buckifier/targets_builder.py 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/buckifier/targets_builder.py 2018-11-12 19:57:32.000000000 +0000 @@ -10,7 +10,7 @@ if len(lst) == 1: return "\"%s\"" % lst[0] - + separator = "\",\n%s\"" % (" " * indent) res = separator.join(sorted(lst)) res = "\n" + (" " * indent) + "\"" + res + "\",\n" + (" " * (indent - 4)) @@ -31,13 +31,16 @@ self.targets_file.close() def add_library(self, name, srcs, deps=None, headers=None): + headers_attr_prefix = "" if headers is None: + headers_attr_prefix = "auto_" headers = "AutoHeaders.RECURSIVE_GLOB" - self.targets_file.write(targets_cfg.library_template % ( - name, - pretty_list(srcs), - headers, - pretty_list(deps))) + self.targets_file.write(targets_cfg.library_template.format( + name=name, + srcs=pretty_list(srcs), + headers_attr_prefix=headers_attr_prefix, + headers=headers, + deps=pretty_list(deps))) self.total_lib = self.total_lib + 1 def add_binary(self, name, srcs, deps=None): diff -Nru rocksdb-5.15.10/buckifier/targets_cfg.py rocksdb-5.17.2/buckifier/targets_cfg.py --- rocksdb-5.15.10/buckifier/targets_cfg.py 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/buckifier/targets_cfg.py 2018-11-12 19:57:32.000000000 +0000 @@ -2,7 +2,9 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals -rocksdb_target_header = """REPO_PATH = package_name() + "/" +rocksdb_target_header = """load("@fbcode_macros//build_defs:auto_headers.bzl", "AutoHeaders") + +REPO_PATH = package_name() + "/" BUCK_BINS = "buck-out/gen/" + REPO_PATH @@ -73,13 +75,13 @@ library_template = """ cpp_library( - name = "%s", - srcs = [%s], - headers = %s, + name = "{name}", + srcs = [{srcs}], + {headers_attr_prefix}headers = {headers}, arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, preprocessor_flags = rocksdb_preprocessor_flags, - deps = [%s], + deps = [{deps}], external_deps = rocksdb_external_deps, ) """ @@ -118,21 +120,20 @@ ttype = "gtest" if test_cfg[2] == "parallel" else "simple" test_bin = test_name + "_bin" - cpp_binary ( - name = test_bin, - srcs = [test_cc], - deps = [":rocksdb_test_lib"], - preprocessor_flags = rocksdb_preprocessor_flags, - arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, - compiler_flags = rocksdb_compiler_flags, - external_deps = rocksdb_external_deps, + cpp_binary( + name = test_bin, + srcs = [test_cc], + arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, + compiler_flags = rocksdb_compiler_flags, + preprocessor_flags = rocksdb_preprocessor_flags, + deps = [":rocksdb_test_lib"], + external_deps = rocksdb_external_deps, ) custom_unittest( - name = test_name, - type = ttype, - 
deps = [":" + test_bin], - command = [TEST_RUNNER, BUCK_BINS + test_bin] + name = test_name, + command = [TEST_RUNNER, BUCK_BINS + test_bin], + type = ttype, + deps = [":" + test_bin], ) - """ diff -Nru rocksdb-5.15.10/build_tools/dependencies.sh rocksdb-5.17.2/build_tools/dependencies.sh --- rocksdb-5.15.10/build_tools/dependencies.sh 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/build_tools/dependencies.sh 2018-11-12 19:57:32.000000000 +0000 @@ -1,19 +1,18 @@ -# shellcheck disable=SC2148 -GCC_BASE=/mnt/gvfs/third-party2/gcc/8219ec1bcedf8ad9da05e121e193364de2cc4f61/5.x/centos6-native/c447969 -CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/64d8d58e3d84f8bde7a029763d4f5baf39d0d5b9/stable/centos6-native/6aaf4de -LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/ba9be983c81de7299b59fe71950c664a84dcb5f8/5.x/gcc-5-glibc-2.23/339d858 -GLIBC_BASE=/mnt/gvfs/third-party2/glibc/f20197cf3d4bd50339c9777aaa0b2ccadad9e2cb/2.23/gcc-5-glibc-2.23/ca1d1c0 -SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/6427ce8c7496e4ab06c2da81543b94c0de8be3d0/1.1.3/gcc-5-glibc-2.23/9bc6787 -ZLIB_BASE=/mnt/gvfs/third-party2/zlib/8f1e8b867d26efef93eac2fabbdb2e1d512665d7/1.2.8/gcc-5-glibc-2.23/9bc6787 -BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/70471c0571559fe0af7db6d7e8860b93a7eadfe1/1.0.6/gcc-5-glibc-2.23/9bc6787 -LZ4_BASE=/mnt/gvfs/third-party2/lz4/453c89d6f0e68cdf1c151c769197fabedad9cac8/r131/gcc-5-glibc-2.23/9bc6787 -ZSTD_BASE=/mnt/gvfs/third-party2/zstd/00a40fa5f8bd2cd0622f2e868552793aef37ccf4/1.3.0/gcc-5-glibc-2.23/03859b5 -GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/47eef08f9acb77de982fbda6047c26d330739538/2.2.0/gcc-5-glibc-2.23/9bc6787 -JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/4414ddc78df8008b35cc4adac23590ad29148584/master/gcc-5-glibc-2.23/d506c82 -NUMA_BASE=/mnt/gvfs/third-party2/numa/9d7ae2693d05d62f9a579cb21e6b717cf257a75d/2.0.11/gcc-5-glibc-2.23/9bc6787 -LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/2b2dd58e3a52ccf2c1d827def59e5f740de0ad15/1.2/gcc-5-glibc-2.23/b443de1 -TBB_BASE=/mnt/gvfs/third-party2/tbb/379addf7ab2468a2b4293b47456cfcd1c9cb318d/4.3/gcc-5-glibc-2.23/9bc6787 -KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/3f68f5fe65a85b7c2d3e66852268fbd1efdb3151/4.0.9-36_fbk5_2933_gd092e3f/gcc-5-glibc-2.23/da39a3e -BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/b9fab0aec99d9c36408e810b2677e91c12807afd/2.28/centos6-native/da39a3e -VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/423431d61786b20bcc3bde8972901130cb29e6b3/3.11.0/gcc-5-glibc-2.23/9bc6787 -LUA_BASE=/mnt/gvfs/third-party2/lua/3b0bb3bd9a0f690a069c479fcc0f7424fc7456d2/5.2.3/gcc-5-glibc-2.23/65372bd +GCC_BASE=/mnt/gvfs/third-party2/gcc/112ec378fec7002ad3e09afde022e656049f7191/5.x/centos7-native/c447969 +CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/04999bdb3ce81a11073535dcb00b5e13dc1cbaf5/stable/centos7-native/c9f9104 +LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/92b0c8e5c8eecc71eb042594ce1ab3413799b385/5.x/gcc-5-glibc-2.23/339d858 +GLIBC_BASE=/mnt/gvfs/third-party2/glibc/3d8698d5973ba94f41620a80a67e4457fdf01e90/2.23/gcc-5-glibc-2.23/ca1d1c0 +SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/7f9bdaada18f59bc27ec2b0871eb8a6144343aef/1.1.3/gcc-5-glibc-2.23/9bc6787 +ZLIB_BASE=/mnt/gvfs/third-party2/zlib/22c2d65676fb7c23cfa797c4f6937f38b026f3cf/1.2.8/gcc-5-glibc-2.23/9bc6787 +BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/dc49a21c5fceec6456a7a28a94dcd16690af1337/1.0.6/gcc-5-glibc-2.23/9bc6787 +LZ4_BASE=/mnt/gvfs/third-party2/lz4/907b498203d297947f3bb70b9466f47e100f1873/r131/gcc-5-glibc-2.23/9bc6787 
+ZSTD_BASE=/mnt/gvfs/third-party2/zstd/af6628a46758f1a15484a1760cd7294164bc5ba1/1.3.5/gcc-5-glibc-2.23/03859b5 +GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/0b9929d2588991c65a57168bf88aff2db87c5d48/2.2.0/gcc-5-glibc-2.23/9bc6787 +JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/b1a0e56c1e3e6929813a4331ade3a58ff083afbb/master/gcc-5-glibc-2.23/aa64d6b +NUMA_BASE=/mnt/gvfs/third-party2/numa/9cbf2460284c669ed19c3ccb200a71f7dd7e53c7/2.0.11/gcc-5-glibc-2.23/9bc6787 +LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/bf3d7497fe4e6d007354f0adffa16ce3003f8338/1.3/gcc-5-glibc-2.23/b443de1 +TBB_BASE=/mnt/gvfs/third-party2/tbb/ff4e0b093534704d8abab678a4fd7f5ea7b094c7/2018_U5/gcc-5-glibc-2.23/9bc6787 +KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/b5c4a61a5c483ba24722005ae07895971a2ac707/4.0.9-36_fbk5_2933_gd092e3f/gcc-5-glibc-2.23/da39a3e +BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/55031de95a2b46c82948743419a603b3d6aefe28/2.29.1/centos7-native/da39a3e +VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/f3f697a28122e6bcd513273dd9c1ff23852fc59f/3.13.0/gcc-5-glibc-2.23/9bc6787 +LUA_BASE=/mnt/gvfs/third-party2/lua/f0cd714433206d5139df61659eb7b28b1dea6683/5.2.3/gcc-5-glibc-2.23/65372bd diff -Nru rocksdb-5.15.10/build_tools/error_filter.py rocksdb-5.17.2/build_tools/error_filter.py --- rocksdb-5.15.10/build_tools/error_filter.py 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/build_tools/error_filter.py 2018-11-12 19:57:32.000000000 +0000 @@ -64,8 +64,12 @@ class CompilerErrorParser(MatchErrorParser): def __init__(self): - # format: '::: error: ' - super(CompilerErrorParser, self).__init__(r'\S+:\d+:\d+: error:') + # format (compile error): + # '::: error: ' + # format (link error): + # ':: error: ' + # The below regex catches both + super(CompilerErrorParser, self).__init__(r'\S+:\d+: error:') class ScanBuildErrorParser(MatchErrorParser): diff -Nru rocksdb-5.15.10/build_tools/fbcode_config.sh rocksdb-5.17.2/build_tools/fbcode_config.sh --- rocksdb-5.15.10/build_tools/fbcode_config.sh 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/build_tools/fbcode_config.sh 2018-11-12 19:57:32.000000000 +0000 @@ -43,11 +43,15 @@ LZ4_INCLUDE=" -I $LZ4_BASE/include/" LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a" CFLAGS+=" -DLZ4" +fi - ZSTD_INCLUDE=" -I $ZSTD_BASE/include/" +ZSTD_INCLUDE=" -I $ZSTD_BASE/include/" +if test -z $PIC_BUILD; then ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a" - CFLAGS+=" -DZSTD" +else + ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd_pic.a" fi +CFLAGS+=" -DZSTD" # location of gflags headers and libraries GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/" diff -Nru rocksdb-5.15.10/build_tools/rocksdb-lego-determinator rocksdb-5.17.2/build_tools/rocksdb-lego-determinator --- rocksdb-5.15.10/build_tools/rocksdb-lego-determinator 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/build_tools/rocksdb-lego-determinator 2018-11-12 19:57:32.000000000 +0000 @@ -88,6 +88,8 @@ LITE="OPT=\"-DROCKSDB_LITE -g\"" TSAN="COMPILE_WITH_TSAN=1" UBSAN="COMPILE_WITH_UBSAN=1" +TSAN_CRASH='CRASH_TEST_EXT_ARGS="--compression_type=zstd --log2_keys_per_lock=22"' +NON_TSAN_CRASH="CRASH_TEST_EXT_ARGS=--compression_type=zstd" DISABLE_JEMALLOC="DISABLE_JEMALLOC=1" HTTP_PROXY="https_proxy=http://fwdproxy.29.prn1:8080 http_proxy=http://fwdproxy.29.prn1:8080 ftp_proxy=http://fwdproxy.29.prn1:8080" SETUP_JAVA_ENV="export $HTTP_PROXY; export JAVA_HOME=/usr/local/jdk-8u60-64/; export PATH=\$JAVA_HOME/bin:\$PATH" @@ -380,14 +382,14 @@ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG make J=1 db_stress || 
$CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER }, { 'name':'Build and run RocksDB debug crash tests', 'timeout': 86400, - 'shell':'$SHM $DEBUG make J=1 crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER } @@ -452,7 +454,7 @@ { 'name':'Build and run RocksDB debug asan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG make J=1 asan_crash_test || $CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test || $CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -494,7 +496,7 @@ { 'name':'Build and run RocksDB debug ubsan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -560,7 +562,7 @@ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN CRASH_TEST_KILL_ODD=1887 CRASH_TEST_EXT_ARGS=--log2_keys_per_lock=22 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', + 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, diff -Nru rocksdb-5.15.10/build_tools/update_dependencies.sh rocksdb-5.17.2/build_tools/update_dependencies.sh --- rocksdb-5.15.10/build_tools/update_dependencies.sh 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/build_tools/update_dependencies.sh 2018-11-12 19:57:32.000000000 +0000 @@ -65,8 +65,8 @@ echo "Writing dependencies to $OUTPUT" # Compilers locations -GCC_BASE=`readlink -f $TP2_LATEST/gcc/5.x/centos6-native/*/` -CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos6-native/*/` +GCC_BASE=`readlink -f $TP2_LATEST/gcc/5.x/centos7-native/*/` +CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos7-native/*/` log_variable GCC_BASE log_variable CLANG_BASE @@ -86,7 +86,7 @@ get_lib_base tbb LATEST gcc-5-glibc-2.23 get_lib_base kernel-headers 4.0.9-36_fbk5_2933_gd092e3f gcc-5-glibc-2.23 -get_lib_base binutils LATEST centos6-native +get_lib_base binutils LATEST centos7-native get_lib_base valgrind LATEST gcc-5-glibc-2.23 get_lib_base lua 5.2.3 gcc-5-glibc-2.23 diff -Nru rocksdb-5.15.10/CMakeLists.txt rocksdb-5.17.2/CMakeLists.txt --- rocksdb-5.15.10/CMakeLists.txt 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/CMakeLists.txt 2018-11-12 19:57:32.000000000 +0000 @@ -336,12 +336,18 @@ # Used to run CI build and tests so we can run faster option(OPTDBG "Build optimized debug build with MSVC" OFF) +option(WITH_RUNTIME_DEBUG "build with debug version of runtime library" ON) if(MSVC) if(OPTDBG) message(STATUS "Debug optimization is enabled") - set(CMAKE_CXX_FLAGS_DEBUG "/Oxt /${RUNTIME_LIBRARY}d") + set(CMAKE_CXX_FLAGS_DEBUG "/Oxt") else() - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1 /Gm /${RUNTIME_LIBRARY}d") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1 /Gm") + endif() + if(WITH_RUNTIME_DEBUG) + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /${RUNTIME_LIBRARY}d") + else() + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /${RUNTIME_LIBRARY}") 
endif() set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oxt /Zp8 /Gm- /Gy /${RUNTIME_LIBRARY}") @@ -549,6 +555,8 @@ table/cuckoo_table_builder.cc table/cuckoo_table_factory.cc table/cuckoo_table_reader.cc + table/data_block_hash_index.cc + table/data_block_footer.cc table/flush_block_policy.cc table/format.cc table/full_filter_block.cc @@ -572,6 +580,7 @@ tools/ldb_cmd.cc tools/ldb_tool.cc tools/sst_dump_tool.cc + tools/trace_analyzer_tool.cc util/arena.cc util/auto_roll_logger.cc util/bloom.cc @@ -596,19 +605,20 @@ util/slice.cc util/sst_file_manager_impl.cc util/status.cc - util/status_message.cc util/string_util.cc util/sync_point.cc util/sync_point_impl.cc util/testutil.cc util/thread_local.cc util/threadpool_imp.cc + util/trace_replay.cc util/transaction_test_util.cc util/xxhash.cc utilities/backupable/backupable_db.cc utilities/blob_db/blob_compaction_filter.cc utilities/blob_db/blob_db.cc utilities/blob_db/blob_db_impl.cc + utilities/blob_db/blob_db_impl_filesnapshot.cc utilities/blob_db/blob_dump_tool.cc utilities/blob_db/blob_file.cc utilities/blob_db/blob_log_reader.cc @@ -650,6 +660,7 @@ utilities/simulator_cache/sim_cache.cc utilities/spatialdb/spatial_db.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc + utilities/trace/file_trace_reader_writer.cc utilities/transactions/optimistic_transaction_db_impl.cc utilities/transactions/optimistic_transaction.cc utilities/transactions/pessimistic_transaction.cc @@ -913,12 +924,14 @@ table/cleanable_test.cc table/cuckoo_table_builder_test.cc table/cuckoo_table_reader_test.cc + table/data_block_hash_index_test.cc table/full_filter_block_test.cc table/merger_test.cc table/table_test.cc tools/ldb_cmd_test.cc tools/reduce_levels_test.cc tools/sst_dump_test.cc + tools/trace_analyzer_test.cc util/arena_test.cc util/auto_roll_logger_test.cc util/autovector_test.cc @@ -933,6 +946,7 @@ util/hash_test.cc util/heap_test.cc util/rate_limiter_test.cc + util/repeatable_thread_test.cc util/slice_transform_test.cc util/timer_queue_test.cc util/thread_list_test.cc @@ -975,6 +989,7 @@ set(BENCHMARKS cache/cache_bench.cc memtable/memtablerep_bench.cc + db/range_del_aggregator_bench.cc tools/db_bench.cc table/table_reader_bench.cc utilities/column_aware_encoding_exp.cc diff -Nru rocksdb-5.15.10/db/builder.cc rocksdb-5.17.2/db/builder.cc --- rocksdb-5.15.10/db/builder.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/builder.cc 2018-11-12 19:57:32.000000000 +0000 @@ -121,8 +121,8 @@ file->SetIOPriority(io_priority); file->SetWriteLifeTimeHint(write_hint); - file_writer.reset(new WritableFileWriter(std::move(file), env_options, - ioptions.statistics)); + file_writer.reset(new WritableFileWriter( + std::move(file), fname, env_options, ioptions.statistics)); builder = NewTableBuilder( ioptions, mutable_cf_options, internal_comparator, int_tbl_prop_collector_factories, column_family_id, diff -Nru rocksdb-5.15.10/db/builder.h rocksdb-5.17.2/db/builder.h --- rocksdb-5.15.10/db/builder.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/builder.h 2018-11-12 19:57:32.000000000 +0000 @@ -35,7 +35,6 @@ class TableBuilder; class WritableFileWriter; class InternalStats; -class InternalIterator; // @param column_family_name Name of the column family that is also identified // by column_family_id, or empty string if unknown. 
It must outlive the diff -Nru rocksdb-5.15.10/db/c.cc rocksdb-5.17.2/db/c.cc --- rocksdb-5.15.10/db/c.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/c.cc 2018-11-12 19:57:32.000000000 +0000 @@ -33,6 +33,7 @@ #include "rocksdb/utilities/backupable_db.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/memory_util.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" @@ -41,6 +42,10 @@ #include "rocksdb/perf_context.h" #include "utilities/merge_operators.h" +#include +#include +#include + using rocksdb::BytewiseComparator; using rocksdb::Cache; using rocksdb::ColumnFamilyDescriptor; @@ -108,8 +113,12 @@ using rocksdb::BatchResult; using rocksdb::PerfLevel; using rocksdb::PerfContext; +using rocksdb::MemoryUtil; using std::shared_ptr; +using std::vector; +using std::unordered_set; +using std::map; extern "C" { @@ -2402,7 +2411,7 @@ void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt, uint64_t v) { - opt->rep.writable_file_max_buffer_size = v; + opt->rep.writable_file_max_buffer_size = static_cast(v); } void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt, @@ -2433,6 +2442,20 @@ opt->rep.max_write_buffer_number_to_maintain = n; } +void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.enable_pipelined_write = v; +} + +void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt, + uint32_t n) { + opt->rep.max_subcompactions = n; +} + +void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) { + opt->rep.max_background_jobs = n; +} + void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) { opt->rep.max_background_compactions = n; } @@ -4087,6 +4110,98 @@ *vlen = v->rep.size(); return v->rep.data(); } + +// container to keep databases and caches in order to use rocksdb::MemoryUtil +struct rocksdb_memory_consumers_t { + std::vector dbs; + std::unordered_set caches; +}; + +// initializes new container of memory consumers +rocksdb_memory_consumers_t* rocksdb_memory_consumers_create() { + return new rocksdb_memory_consumers_t; +} + +// adds datatabase to the container of memory consumers +void rocksdb_memory_consumers_add_db(rocksdb_memory_consumers_t* consumers, + rocksdb_t* db) { + consumers->dbs.push_back(db); +} + +// adds cache to the container of memory consumers +void rocksdb_memory_consumers_add_cache(rocksdb_memory_consumers_t* consumers, + rocksdb_cache_t* cache) { + consumers->caches.insert(cache); +} + +// deletes container with memory consumers +void rocksdb_memory_consumers_destroy(rocksdb_memory_consumers_t* consumers) { + delete consumers; +} + +// contains memory usage statistics provided by rocksdb::MemoryUtil +struct rocksdb_memory_usage_t { + uint64_t mem_table_total; + uint64_t mem_table_unflushed; + uint64_t mem_table_readers_total; + uint64_t cache_total; +}; + +// estimates amount of memory occupied by consumers (dbs and caches) +rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create( + rocksdb_memory_consumers_t* consumers, char** errptr) { + + vector dbs; + for (auto db : consumers->dbs) { + dbs.push_back(db->rep); + } + + unordered_set cache_set; + for (auto cache : consumers->caches) { + cache_set.insert(const_cast(cache->rep.get())); + } + + std::map usage_by_type; + + auto status = MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set, + 
&usage_by_type); + if (SaveError(errptr, status)) { + return nullptr; + } + + auto result = new rocksdb_memory_usage_t; + result->mem_table_total = usage_by_type[MemoryUtil::kMemTableTotal]; + result->mem_table_unflushed = usage_by_type[MemoryUtil::kMemTableUnFlushed]; + result->mem_table_readers_total = usage_by_type[MemoryUtil::kTableReadersTotal]; + result->cache_total = usage_by_type[MemoryUtil::kCacheTotal]; + return result; +} + +uint64_t rocksdb_approximate_memory_usage_get_mem_table_total( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->mem_table_total; +} + +uint64_t rocksdb_approximate_memory_usage_get_mem_table_unflushed( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->mem_table_unflushed; +} + +uint64_t rocksdb_approximate_memory_usage_get_mem_table_readers_total( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->mem_table_readers_total; +} + +uint64_t rocksdb_approximate_memory_usage_get_cache_total( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->cache_total; +} + +// deletes container with memory usage estimates +void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) { + delete usage; +} + } // end extern "C" #endif // !ROCKSDB_LITE diff -Nru rocksdb-5.15.10/db/compacted_db_impl.cc rocksdb-5.17.2/db/compacted_db_impl.cc --- rocksdb-5.15.10/db/compacted_db_impl.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compacted_db_impl.cc 2018-11-12 19:57:32.000000000 +0000 @@ -25,22 +25,12 @@ } size_t CompactedDBImpl::FindFile(const Slice& key) { - size_t left = 0; size_t right = files_.num_files - 1; - while (left < right) { - size_t mid = (left + right) >> 1; - const FdWithKeyRange& f = files_.files[mid]; - if (user_comparator_->Compare(ExtractUserKey(f.largest_key), key) < 0) { - // Key at "mid.largest" is < "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. - right = mid; - } - } - return right; + auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { + return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0; + }; + return static_cast(std::lower_bound(files_.files, + files_.files + right, key, cmp) - files_.files); } Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, diff -Nru rocksdb-5.15.10/db/compact_files_test.cc rocksdb-5.17.2/db/compact_files_test.cc --- rocksdb-5.15.10/db/compact_files_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compact_files_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -308,6 +308,55 @@ delete db; } +TEST_F(CompactFilesTest, SentinelCompressionType) { + if (!Zlib_Supported()) { + fprintf(stderr, "zlib compression not supported, skip this test\n"); + return; + } + if (!Snappy_Supported()) { + fprintf(stderr, "snappy compression not supported, skip this test\n"); + return; + } + // Check that passing `CompressionType::kDisableCompressionOption` to + // `CompactFiles` causes it to use the column family compression options. 
+ for (auto compaction_style : + {CompactionStyle::kCompactionStyleLevel, + CompactionStyle::kCompactionStyleUniversal, + CompactionStyle::kCompactionStyleNone}) { + DestroyDB(db_name_, Options()); + Options options; + options.compaction_style = compaction_style; + // L0: Snappy, L1: ZSTD, L2: Snappy + options.compression_per_level = {CompressionType::kSnappyCompression, + CompressionType::kZlibCompression, + CompressionType::kSnappyCompression}; + options.create_if_missing = true; + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + DB* db = nullptr; + ASSERT_OK(DB::Open(options, db_name_, &db)); + + db->Put(WriteOptions(), "key", "val"); + db->Flush(FlushOptions()); + + auto l0_files = collector->GetFlushedFiles(); + ASSERT_EQ(1, l0_files.size()); + + // L0->L1 compaction, so output should be ZSTD-compressed + CompactionOptions compaction_opts; + compaction_opts.compression = CompressionType::kDisableCompressionOption; + ASSERT_OK(db->CompactFiles(compaction_opts, l0_files, 1)); + + rocksdb::TablePropertiesCollection all_tables_props; + ASSERT_OK(db->GetPropertiesOfAllTables(&all_tables_props)); + for (const auto& name_and_table_props : all_tables_props) { + ASSERT_EQ(CompressionTypeToString(CompressionType::kZlibCompression), + name_and_table_props.second->compression_name); + } + delete db; + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff -Nru rocksdb-5.15.10/db/compaction_iterator.cc rocksdb-5.17.2/db/compaction_iterator.cc --- rocksdb-5.15.10/db/compaction_iterator.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_iterator.cc 2018-11-12 19:57:32.000000000 +0000 @@ -18,8 +18,8 @@ SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, - RangeDelAggregator* range_del_agg, - const Compaction* compaction, const CompactionFilter* compaction_filter, + RangeDelAggregator* range_del_agg, const Compaction* compaction, + const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum) : CompactionIterator( @@ -77,6 +77,12 @@ earliest_snapshot_ = snapshots_->at(0); latest_snapshot_ = snapshots_->back(); } +#ifndef NDEBUG + // findEarliestVisibleSnapshot assumes this ordering. 
+ for (size_t i = 1; i < snapshots_->size(); ++i) { + assert(snapshots_->at(i - 1) <= snapshots_->at(i)); + } +#endif if (compaction_filter_ != nullptr) { if (compaction_filter_->IgnoreSnapshots()) { ignore_snapshots_ = true; @@ -505,6 +511,31 @@ ++iter_stats_.num_optimized_del_drop_obsolete; } input_->Next(); + } else if ((ikey_.type == kTypeDeletion) && bottommost_level_ && + ikeyNotNeededForIncrementalSnapshot()) { + // Handle the case where we have a delete key at the bottom most level + // We can skip outputting the key iff there are no subsequent puts for this + // key + ParsedInternalKey next_ikey; + input_->Next(); + // Skip over all versions of this key that happen to occur in the same snapshot + // range as the delete + while (input_->Valid() && + ParseInternalKey(input_->key(), &next_ikey) && + cmp_->Equal(ikey_.user_key, next_ikey.user_key) && + (prev_snapshot == 0 || next_ikey.sequence > prev_snapshot || + (snapshot_checker_ != nullptr && + UNLIKELY(!snapshot_checker_->IsInSnapshot(next_ikey.sequence, + prev_snapshot))))) { + input_->Next(); + } + // If you find you still need to output a row with this key, we need to output the + // delete too + if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && + cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { + valid_ = true; + at_next_ = true; + } } else if (ikey_.type == kTypeMerge) { if (!merge_helper_->HasOperator()) { status_ = Status::InvalidArgument( @@ -603,18 +634,23 @@ inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( SequenceNumber in, SequenceNumber* prev_snapshot) { assert(snapshots_->size()); - SequenceNumber prev = kMaxSequenceNumber; - for (const auto cur : *snapshots_) { - assert(prev == kMaxSequenceNumber || prev <= cur); - if (cur >= in && (snapshot_checker_ == nullptr || - snapshot_checker_->IsInSnapshot(in, cur))) { - *prev_snapshot = prev == kMaxSequenceNumber ? 
0 : prev; + auto snapshots_iter = std::lower_bound( + snapshots_->begin(), snapshots_->end(), in); + if (snapshots_iter == snapshots_->begin()) { + *prev_snapshot = 0; + } else { + *prev_snapshot = *std::prev(snapshots_iter); + assert(*prev_snapshot < in); + } + for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) { + auto cur = *snapshots_iter; + assert(in <= cur); + if (snapshot_checker_ == nullptr || + snapshot_checker_->IsInSnapshot(in, cur)) { return cur; } - prev = cur; - assert(prev < kMaxSequenceNumber); + *prev_snapshot = cur; } - *prev_snapshot = prev; return kMaxSequenceNumber; } diff -Nru rocksdb-5.15.10/db/compaction_iterator_test.cc rocksdb-5.17.2/db/compaction_iterator_test.cc --- rocksdb-5.15.10/db/compaction_iterator_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_iterator_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -247,9 +247,8 @@ c_iter_.reset(new CompactionIterator( iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_, earliest_write_conflict_snapshot, snapshot_checker_.get(), - Env::Default(), false /* report_detailed_time */, - false, range_del_agg_.get(), std::move(compaction), filter, - &shutting_down_)); + Env::Default(), false /* report_detailed_time */, false, + range_del_agg_.get(), std::move(compaction), filter, &shutting_down_)); } void AddSnapshot(SequenceNumber snapshot, @@ -672,8 +671,12 @@ TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) { AddSnapshot(1); RunTest({test::KeyStr("a", 1, kTypeDeletion), - test::KeyStr("b", 2, kTypeDeletion)}, - {"", ""}, {test::KeyStr("b", 2, kTypeDeletion)}, {""}, + test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 1, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 0, kTypeValue)}, + {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -842,12 +845,25 @@ {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion), test::KeyStr("c", 3, kTypeDeletion)}, {"", "", ""}, - {test::KeyStr("b", 2, kTypeDeletion), - test::KeyStr("c", 3, kTypeDeletion)}, + {}, {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveDeletionIfValuePresentToEarlierSnapshot) { + AddSnapshot(2,1); + RunTest( + {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue), + test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 0, kTypeValue), + test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, kMaxSequenceNumber /*last_commited_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} TEST_F(CompactionIteratorWithSnapshotCheckerTest, NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) { diff -Nru rocksdb-5.15.10/db/compaction_job.cc rocksdb-5.17.2/db/compaction_job.cc --- rocksdb-5.15.10/db/compaction_job.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_job.cc 2018-11-12 19:57:32.000000000 +0000 @@ -593,13 +593,11 @@ thread.join(); } - if (output_directory_) { - output_directory_->Fsync(); - } - compaction_stats_.micros = env_->NowMicros() - start_micros; MeasureTime(stats_, COMPACTION_TIME, compaction_stats_.micros); + TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify"); + // Check if any thread encountered an error during execution Status status; for (const 
auto& state : compact_->sub_compact_states) { @@ -609,6 +607,10 @@ } } + if (status.ok() && output_directory_) { + status = output_directory_->Fsync(); + } + if (status.ok()) { thread_pool.clear(); std::vector files_meta; @@ -1307,9 +1309,7 @@ // VersionEdit. assert(!sub_compact->outputs.empty()); sub_compact->outputs.pop_back(); - sub_compact->builder.reset(); - sub_compact->current_output_file_size = 0; - return s; + meta = nullptr; } if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) { @@ -1463,8 +1463,9 @@ writable_file->SetWriteLifeTimeHint(write_hint_); writable_file->SetPreallocationBlockSize(static_cast( sub_compact->compaction->OutputFilePreallocationSize())); - sub_compact->outfile.reset(new WritableFileWriter( - std::move(writable_file), env_options_, db_options_.statistics.get())); + sub_compact->outfile.reset( + new WritableFileWriter(std::move(writable_file), fname, env_options_, + db_options_.statistics.get())); // If the Column family flag is to only optimize filters for hits, // we can skip creating filters if this is the bottommost_level where diff -Nru rocksdb-5.15.10/db/compaction_job_test.cc rocksdb-5.17.2/db/compaction_job_test.cc --- rocksdb-5.15.10/db/compaction_job_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_job_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -79,7 +79,7 @@ shutting_down_(false), preserve_deletes_seqnum_(0), mock_table_factory_(new mock::MockTableFactory()), - error_handler_(db_options_, &mutex_) { + error_handler_(nullptr, db_options_, &mutex_) { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); db_options_.db_paths.emplace_back(dbname_, std::numeric_limits::max()); @@ -205,7 +205,7 @@ manifest, &file, env_->OptimizeForManifestWrite(env_options_)); ASSERT_OK(s); unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options_)); + new WritableFileWriter(std::move(file), manifest, env_options_)); { log::Writer log(std::move(file_writer), 0, false); std::string record; diff -Nru rocksdb-5.15.10/db/compaction_picker.cc rocksdb-5.17.2/db/compaction_picker.cc --- rocksdb-5.15.10/db/compaction_picker.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_picker.cc 2018-11-12 19:57:32.000000000 +0000 @@ -49,7 +49,7 @@ // increasing. size_t new_compact_bytes_per_del_file = 0; for (span_len = 1; span_len < level_files.size(); ++span_len) { - compact_bytes += level_files[span_len]->fd.file_size; + compact_bytes += static_cast(level_files[span_len]->fd.file_size); new_compact_bytes_per_del_file = compact_bytes / span_len; if (level_files[span_len]->being_compacted || new_compact_bytes_per_del_file > compact_bytes_per_del_file) { @@ -219,7 +219,8 @@ bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/, VersionStorageInfo* vstorage, - CompactionInputFiles* inputs) { + CompactionInputFiles* inputs, + InternalKey** next_smallest) { // This isn't good compaction assert(!inputs->empty()); @@ -242,7 +243,8 @@ GetRange(*inputs, &smallest, &largest); inputs->clear(); vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files, - hint_index, &hint_index); + hint_index, &hint_index, true, + next_smallest); } while (inputs->size() > old_size); // we started off with inputs non-empty and the previous loop only grew @@ -315,13 +317,29 @@ // shouldn't have been released since. 
assert(!FilesRangeOverlapWithCompaction(input_files, output_level)); - auto c = - new Compaction(vstorage, ioptions_, mutable_cf_options, input_files, - output_level, compact_options.output_file_size_limit, - mutable_cf_options.max_compaction_bytes, output_path_id, - compact_options.compression, ioptions_.compression_opts, - compact_options.max_subcompactions, - /* grandparents */ {}, true); + CompressionType compression_type; + if (compact_options.compression == kDisableCompressionOption) { + int base_level; + if (ioptions_.compaction_style == kCompactionStyleLevel) { + base_level = vstorage->base_level(); + } else { + base_level = 1; + } + compression_type = + GetCompressionType(ioptions_, vstorage, mutable_cf_options, + output_level, base_level); + } else { + // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType` + // without configurable `CompressionOptions`, which is inconsistent. + compression_type = compact_options.compression; + } + auto c = new Compaction( + vstorage, ioptions_, mutable_cf_options, input_files, output_level, + compact_options.output_file_size_limit, + mutable_cf_options.max_compaction_bytes, output_path_id, compression_type, + GetCompressionOptions(ioptions_, vstorage, output_level), + compact_options.max_subcompactions, + /* grandparents */ {}, true); RegisterCompaction(c); return c; } @@ -633,7 +651,6 @@ uint64_t s = inputs[i]->compensated_file_size; total += s; if (total >= limit) { - **compaction_end = inputs[i + 1]->smallest; covering_the_whole_range = false; inputs.files.resize(i + 1); break; @@ -642,7 +659,10 @@ } assert(output_path_id < static_cast(ioptions_.cf_paths.size())); - if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs) == false) { + InternalKey key_storage; + InternalKey* next_smallest = &key_storage; + if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) == + false) { // manual compaction is now multi-threaded, so it can // happen that ExpandWhileOverlapping fails // we handle it higher in RunManualCompaction @@ -650,8 +670,10 @@ return nullptr; } - if (covering_the_whole_range) { + if (covering_the_whole_range || !next_smallest) { *compaction_end = nullptr; + } else { + **compaction_end = *next_smallest; } CompactionInputFiles output_level_inputs; diff -Nru rocksdb-5.15.10/db/compaction_picker.h rocksdb-5.17.2/db/compaction_picker.h --- rocksdb-5.15.10/db/compaction_picker.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_picker.h 2018-11-12 19:57:32.000000000 +0000 @@ -151,7 +151,8 @@ // Will return false if it is impossible to apply this compaction. 
bool ExpandInputsToCleanCut(const std::string& cf_name, VersionStorageInfo* vstorage, - CompactionInputFiles* inputs); + CompactionInputFiles* inputs, + InternalKey** next_smallest = nullptr); // Returns true if any one of the parent files are being compacted bool IsRangeInCompaction(VersionStorageInfo* vstorage, diff -Nru rocksdb-5.15.10/db/compaction_picker_test.cc rocksdb-5.17.2/db/compaction_picker_test.cc --- rocksdb-5.15.10/db/compaction_picker_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_picker_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -90,8 +90,8 @@ f->fd = FileDescriptor(file_number, path_id, file_size); f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); f->largest = InternalKey(largest, largest_seq, kTypeValue); - f->smallest_seqno = smallest_seq; - f->largest_seqno = largest_seq; + f->fd.smallest_seqno = smallest_seq; + f->fd.largest_seqno = largest_seq; f->compensated_file_size = file_size; f->refs = 0; vstorage_->AddFile(level, f); diff -Nru rocksdb-5.15.10/db/compaction_picker_universal.cc rocksdb-5.17.2/db/compaction_picker_universal.cc --- rocksdb-5.15.10/db/compaction_picker_universal.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_picker_universal.cc 2018-11-12 19:57:32.000000000 +0000 @@ -97,17 +97,17 @@ SequenceNumber* largest_seqno) { bool is_first = true; for (FileMetaData* f : files) { - assert(f->smallest_seqno <= f->largest_seqno); + assert(f->fd.smallest_seqno <= f->fd.largest_seqno); if (is_first) { is_first = false; - *smallest_seqno = f->smallest_seqno; - *largest_seqno = f->largest_seqno; + *smallest_seqno = f->fd.smallest_seqno; + *largest_seqno = f->fd.largest_seqno; } else { - if (f->smallest_seqno < *smallest_seqno) { - *smallest_seqno = f->smallest_seqno; + if (f->fd.smallest_seqno < *smallest_seqno) { + *smallest_seqno = f->fd.smallest_seqno; } - if (f->largest_seqno > *largest_seqno) { - *largest_seqno = f->largest_seqno; + if (f->fd.largest_seqno > *largest_seqno) { + *largest_seqno = f->fd.largest_seqno; } } } @@ -365,11 +365,11 @@ size_t level_index = 0U; if (c->start_level() == 0) { for (auto f : *c->inputs(0)) { - assert(f->smallest_seqno <= f->largest_seqno); + assert(f->fd.smallest_seqno <= f->fd.largest_seqno); if (is_first) { is_first = false; } - prev_smallest_seqno = f->smallest_seqno; + prev_smallest_seqno = f->fd.smallest_seqno; } level_index = 1U; } diff -Nru rocksdb-5.15.10/db/c_test.c rocksdb-5.17.2/db/c_test.c --- rocksdb-5.15.10/db/c_test.c 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/c_test.c 2018-11-12 19:57:32.000000000 +0000 @@ -19,11 +19,8 @@ // Can not use port/port.h macros as this is a c file #ifdef OS_WIN - #include -#define snprintf _snprintf - // Ok for uniqueness int geteuid() { int result = 0; @@ -34,6 +31,11 @@ return result; } +// VS < 2015 +#if defined(_MSC_VER) && (_MSC_VER < 1900) +#define snprintf _snprintf +#endif + #endif const char* phase = ""; @@ -47,12 +49,19 @@ fprintf(stderr, "=== Test %s\n", name); phase = name; } +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning (disable: 4996) // getenv security warning +#endif static const char* GetTempDir(void) { const char* ret = getenv("TEST_TMPDIR"); if (ret == NULL || ret[0] == '\0') ret = "/tmp"; return ret; } +#ifdef _MSC_VER +#pragma warning(pop) +#endif #define CheckNoError(err) \ if ((err) != NULL) { \ @@ -643,7 +652,7 @@ rocksdb_sstfilewriter_t* writer = rocksdb_sstfilewriter_create(env_opt, io_options); - unlink(sstfilename); + remove(sstfilename); 
rocksdb_sstfilewriter_open(writer, sstfilename, &err); CheckNoError(err); rocksdb_sstfilewriter_put(writer, "sstk1", 5, "v1", 2, &err); @@ -664,7 +673,7 @@ CheckGet(db, roptions, "sstk2", "v2"); CheckGet(db, roptions, "sstk3", "v3"); - unlink(sstfilename); + remove(sstfilename); rocksdb_sstfilewriter_open(writer, sstfilename, &err); CheckNoError(err); rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v4", 2, &err); @@ -1334,6 +1343,47 @@ rocksdb_destroy_db(options, dbname, &err); } + // Check memory usage stats + StartPhase("approximate_memory_usage"); + { + // Create database + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_memory_consumers_t* consumers; + consumers = rocksdb_memory_consumers_create(); + rocksdb_memory_consumers_add_db(consumers, db); + rocksdb_memory_consumers_add_cache(consumers, cache); + + // take memory usage report before write-read operation + rocksdb_memory_usage_t* mu1; + mu1 = rocksdb_approximate_memory_usage_create(consumers, &err); + CheckNoError(err); + + // Put data (this should affect memtables) + rocksdb_put(db, woptions, "memory", 6, "test", 4, &err); + CheckNoError(err); + CheckGet(db, roptions, "memory", "test"); + + // take memory usage report after write-read operation + rocksdb_memory_usage_t* mu2; + mu2 = rocksdb_approximate_memory_usage_create(consumers, &err); + CheckNoError(err); + + // amount of memory used within memtables should grow + CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_total(mu2) >= + rocksdb_approximate_memory_usage_get_mem_table_total(mu1)); + CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu2) >= + rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu1)); + + rocksdb_memory_consumers_destroy(consumers); + rocksdb_approximate_memory_usage_destroy(mu1); + rocksdb_approximate_memory_usage_destroy(mu2); + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + } + StartPhase("cuckoo_options"); { rocksdb_cuckoo_table_options_t* cuckoo_options; @@ -1675,7 +1725,7 @@ db = rocksdb_open(options, dbname, &err); CheckNoError(err); } - + StartPhase("cleanup"); rocksdb_close(db); rocksdb_options_destroy(options); diff -Nru rocksdb-5.15.10/db/db_bloom_filter_test.cc rocksdb-5.17.2/db/db_bloom_filter_test.cc --- rocksdb-5.15.10/db/db_bloom_filter_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_bloom_filter_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -22,11 +22,12 @@ class DBBloomFilterTestWithParam : public DBTestBase, - public testing::WithParamInterface> { + public testing::WithParamInterface> { // public testing::WithParamInterface { protected: bool use_block_based_filter_; bool partition_filters_; + uint32_t format_version_; public: DBBloomFilterTestWithParam() : DBTestBase("/db_bloom_filter_tests") {} @@ -36,9 +37,12 @@ void SetUp() override { use_block_based_filter_ = std::get<0>(GetParam()); partition_filters_ = std::get<1>(GetParam()); + format_version_ = std::get<2>(GetParam()); } }; +class DBBloomFilterTestDefFormatVersion : public DBBloomFilterTestWithParam {}; + class SliceTransformLimitedDomainGeneric : public SliceTransform { const char* Name() const override { return "SliceTransformLimitedDomainGeneric"; @@ -62,7 +66,7 @@ // KeyMayExist can lead to a few false positives, but not false negatives. 
// To make test deterministic, use a much larger number of bits per key-20 than // bits in the key, so that false positives are eliminated -TEST_P(DBBloomFilterTestWithParam, KeyMayExist) { +TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { do { ReadOptions ropts; std::string value; @@ -401,6 +405,11 @@ table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } + table_options.format_version = format_version_; + if (format_version_ >= 4) { + // value delta encoding challenged more with index interval > 1 + table_options.index_block_restart_interval = 8; + } table_options.metadata_block_size = 32; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -456,10 +465,26 @@ } while (ChangeCompactOptions()); } -INSTANTIATE_TEST_CASE_P(DBBloomFilterTestWithParam, DBBloomFilterTestWithParam, - ::testing::Values(std::make_tuple(true, false), - std::make_tuple(false, true), - std::make_tuple(false, false))); +INSTANTIATE_TEST_CASE_P( + FormatDef, DBBloomFilterTestDefFormatVersion, + ::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion), + std::make_tuple(false, true, test::kDefaultFormatVersion), + std::make_tuple(false, false, + test::kDefaultFormatVersion))); + +INSTANTIATE_TEST_CASE_P( + FormatDef, DBBloomFilterTestWithParam, + ::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion), + std::make_tuple(false, true, test::kDefaultFormatVersion), + std::make_tuple(false, false, + test::kDefaultFormatVersion))); + +INSTANTIATE_TEST_CASE_P( + FormatLatest, DBBloomFilterTestWithParam, + ::testing::Values(std::make_tuple(true, false, test::kLatestFormatVersion), + std::make_tuple(false, true, test::kLatestFormatVersion), + std::make_tuple(false, false, + test::kLatestFormatVersion))); TEST_F(DBBloomFilterTest, BloomFilterRate) { while (ChangeFilterOptions()) { diff -Nru rocksdb-5.15.10/db/db_compaction_test.cc rocksdb-5.17.2/db/db_compaction_test.cc --- rocksdb-5.15.10/db/db_compaction_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_compaction_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -120,13 +120,12 @@ public: SstStatsCollector() : num_ssts_creation_started_(0) {} - void OnTableFileCreationStarted(const TableFileCreationBriefInfo& /* info */) override { + void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& /* info */) override { ++num_ssts_creation_started_; } - int num_ssts_creation_started() { - return num_ssts_creation_started_; - } + int num_ssts_creation_started() { return num_ssts_creation_started_; } private: std::atomic num_ssts_creation_started_; @@ -2478,6 +2477,7 @@ // Compaction range overlaps files Compact(1, "p1", "p9", 1); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(1)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); @@ -2493,6 +2493,7 @@ // Compact just the new range Compact(1, "b", "f", 1); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,2", FilesPerLevel(1)); ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); @@ -2509,6 +2510,7 @@ compact_options.target_path_id = 1; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(1)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); @@ -3501,12 +3503,13 @@ // ensure 
the auto compaction doesn't finish until manual compaction has // had a chance to be delayed. rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CompactRange:StallWait", "CompactionJob::Run():End"}}); + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "CompactionJob::Run():End"}}); } else { // ensure the auto-compaction doesn't finish until manual compaction has // continued without delay. rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CompactRange:StallWaitDone", "CompactionJob::Run():End"}}); + {{"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}}); } rocksdb::SyncPoint::GetInstance()->EnableProcessing(); @@ -3554,12 +3557,13 @@ // ensure the flush doesn't finish until manual compaction has had a // chance to be delayed. rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CompactRange:StallWait", "FlushJob::WriteLevel0Table"}}); + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "FlushJob::WriteLevel0Table"}}); } else { // ensure the flush doesn't finish until manual compaction has continued // without delay. rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CompactRange:StallWaitDone", + {{"DBImpl::FlushMemTable:StallWaitDone", "FlushJob::WriteLevel0Table"}}); } rocksdb::SyncPoint::GetInstance()->EnableProcessing(); @@ -3569,6 +3573,7 @@ ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024))); FlushOptions flush_opts; flush_opts.wait = false; + flush_opts.allow_write_stall = true; dbfull()->Flush(flush_opts); } @@ -3604,7 +3609,7 @@ // The auto-compaction waits until the manual compaction finishes to ensure // the signal comes from closing CF/DB, not from compaction making progress. rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CompactRange:StallWait", + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"}, {"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual", "CompactionJob::Run():End"}}); @@ -3655,18 +3660,21 @@ // began. So it unblocks CompactRange and precludes its flush. Throughout the // test, stall conditions are upheld via high L0 file count. 
rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CompactRange:StallWait", + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"}, {"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush", - "DBImpl::CompactRange:StallWaitDone"}, - {"DBImpl::CompactRange:StallWaitDone", "CompactionJob::Run():End"}}); + "DBImpl::FlushMemTable:StallWaitDone"}, + {"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}}); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + //used for the delayable flushes + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; for (int i = 0; i < kNumL0FilesLimit - 1; ++i) { for (int j = 0; j < 2; ++j) { ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); } - Flush(); + dbfull()->Flush(flush_opts); } auto manual_compaction_thread = port::Thread([this]() { CompactRangeOptions cro; @@ -3676,7 +3684,7 @@ TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"); Put(ToString(0), RandomString(&rnd, 1024)); - Flush(); + dbfull()->Flush(flush_opts); Put(ToString(0), RandomString(&rnd, 1024)); TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush"); manual_compaction_thread.join(); @@ -3953,6 +3961,50 @@ CompactionPri::kOldestSmallestSeqFirst, CompactionPri::kMinOverlappingRatio)); +class NoopMergeOperator : public MergeOperator { + public: + NoopMergeOperator() {} + + virtual bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* merge_out) const override { + std::string val("bar"); + merge_out->new_value = val; + return true; + } + + virtual const char* Name() const override { return "Noop"; } +}; + +TEST_F(DBCompactionTest, PartialManualCompaction) { + Options opts = CurrentOptions(); + opts.num_levels = 3; + opts.level0_file_num_compaction_trigger = 10; + opts.compression = kNoCompression; + opts.merge_operator.reset(new NoopMergeOperator()); + opts.target_file_size_base = 10240; + DestroyAndReopen(opts); + + Random rnd(301); + for (auto i = 0; i < 8; ++i) { + for (auto j = 0; j < 10; ++j) { + Merge("foo", RandomString(&rnd, 1024)); + } + Flush(); + } + + MoveFilesToLevel(2); + + std::string prop; + EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop)); + uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2; + ASSERT_OK(dbfull()->SetOptions( + {{"max_compaction_bytes", std::to_string(max_compaction_bytes)}})); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + dbfull()->CompactRange(cro, nullptr, nullptr); +} + #endif // !defined(ROCKSDB_LITE) } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/db_flush_test.cc rocksdb-5.17.2/db/db_flush_test.cc --- rocksdb-5.15.10/db/db_flush_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_flush_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -35,6 +35,7 @@ Reopen(options); FlushOptions no_wait; no_wait.wait = false; + no_wait.allow_write_stall=true; SyncPoint::GetInstance()->LoadDependency( {{"VersionSet::LogAndApply:WriteManifest", @@ -55,6 +56,9 @@ #endif // ROCKSDB_LITE } +#ifndef TRAVIS +// Disable this test temporarily on Travis as it fails intermittently. 
+// Github issue: #4151 TEST_F(DBFlushTest, SyncFail) { std::unique_ptr fault_injection_env( new FaultInjectionTestEnv(env_)); @@ -92,6 +96,7 @@ ASSERT_EQ(refs_before, cfd->current()->TEST_refs()); Destroy(options); } +#endif // TRAVIS TEST_F(DBFlushTest, FlushInLowPriThreadPool) { // Verify setting an empty high-pri (flush) thread pool causes flushes to be diff -Nru rocksdb-5.15.10/db/dbformat.cc rocksdb-5.17.2/db/dbformat.cc --- rocksdb-5.15.10/db/dbformat.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/dbformat.cc 2018-11-12 19:57:32.000000000 +0000 @@ -48,6 +48,8 @@ return kEntryMerge; case kTypeRangeDeletion: return kEntryRangeDeletion; + case kTypeBlobIndex: + return kEntryBlobIndex; default: return kEntryOther; } diff -Nru rocksdb-5.15.10/db/db_impl.cc rocksdb-5.17.2/db/db_impl.cc --- rocksdb-5.15.10/db/db_impl.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl.cc 2018-11-12 19:57:32.000000000 +0000 @@ -215,9 +215,11 @@ // requires a custom gc for compaction, we use that to set use_custom_gc_ // as well. use_custom_gc_(seq_per_batch), + shutdown_initiated_(false), + own_sfm_(options.sst_file_manager == nullptr), preserve_deletes_(options.preserve_deletes), closed_(false), - error_handler_(immutable_db_options_, &mutex_) { + error_handler_(this, immutable_db_options_, &mutex_) { // !batch_per_trx_ implies seq_per_batch_ because it is only unset for // WriteUnprepared, which should use seq_per_batch_. assert(batch_per_txn_ || seq_per_batch_); @@ -259,16 +261,62 @@ return Status::OK(); } - Status s = error_handler_.GetBGError(); - if (s.severity() > Status::Severity::kHardError) { + if (error_handler_.IsRecoveryInProgress()) { + // Don't allow a mix of manual and automatic recovery + return Status::Busy(); + } + + mutex_.Unlock(); + Status s = error_handler_.RecoverFromBGError(true); + mutex_.Lock(); + return s; +} + +// This function implements the guts of recovery from a background error. It +// is eventually called for both manual as well as automatic recovery. It does +// the following - +// 1. Wait for currently scheduled background flush/compaction to exit, in +// order to inadvertently causing an error and thinking recovery failed +// 2. Flush memtables if there's any data for all the CFs. This may result +// another error, which will be saved by error_handler_ and reported later +// as the recovery status +// 3. Find and delete any obsolete files +// 4. Schedule compactions if needed for all the CFs. This is needed as the +// flush in the prior step might have been a no-op for some CFs, which +// means a new super version wouldn't have been installed +Status DBImpl::ResumeImpl() { + mutex_.AssertHeld(); + WaitForBackgroundWork(); + + Status bg_error = error_handler_.GetBGError(); + Status s; + if (shutdown_initiated_) { + // Returning shutdown status to SFM during auto recovery will cause it + // to abort the recovery and allow the shutdown to progress + s = Status::ShutdownInProgress(); + } + if (s.ok() && bg_error.severity() > Status::Severity::kHardError) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB resume requested but failed due to Fatal/Unrecoverable error"); - return s; + s = bg_error; + } + + // We cannot guarantee consistency of the WAL. 
So force flush Memtables of + // all the column families + if (s.ok()) { + s = FlushAllCFs(FlushReason::kErrorRecovery); + if (!s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "DB resume requested but failed due to Flush failure [%s]", + s.ToString().c_str()); + } } JobContext job_context(0); FindObsoleteFiles(&job_context, true); - error_handler_.ClearBGError(); + if (s.ok()) { + s = error_handler_.ClearBGError(); + } mutex_.Unlock(); job_context.manifest_file_number = 1; @@ -277,13 +325,36 @@ } job_context.Clean(); - ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); + if (s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); + } mutex_.Lock(); - MaybeScheduleFlushOrCompaction(); + // Check for shutdown again before scheduling further compactions, + // since we released and re-acquired the lock above + if (shutdown_initiated_) { + s = Status::ShutdownInProgress(); + } + if (s.ok()) { + for (auto cfd : *versions_->GetColumnFamilySet()) { + SchedulePendingCompaction(cfd); + } + MaybeScheduleFlushOrCompaction(); + } + + // Wake up any waiters - in this case, it could be the shutdown thread + bg_cv_.SignalAll(); // No need to check BGError again. If something happened, event listener would be // notified and the operation causing it would have failed - return Status::OK(); + return s; +} + +void DBImpl::WaitForBackgroundWork() { + // Wait for background work to finish + while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || + bg_flush_scheduled_) { + bg_cv_.Wait(); + } } // Will lock the mutex_, will wait for completion if wait is true @@ -313,14 +384,20 @@ if (!wait) { return; } - // Wait for background work to finish - while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || - bg_flush_scheduled_) { - bg_cv_.Wait(); - } + WaitForBackgroundWork(); } Status DBImpl::CloseHelper() { + // Guarantee that there is no background error recovery in progress before + // continuing with the shutdown + mutex_.Lock(); + shutdown_initiated_ = true; + error_handler_.CancelErrorRecovery(); + while (error_handler_.IsRecoveryInProgress()) { + bg_cv_.Wait(); + } + mutex_.Unlock(); + // CancelAllBackgroundWork called with false means we just set the shutdown // marker. After this we do a variant of the waiting and unschedule work // (to consider: moving all the waiting into CancelAllBackgroundWork(true)) @@ -338,7 +415,8 @@ // Wait for background work to finish while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || bg_flush_scheduled_ || bg_purge_scheduled_ || - pending_purge_obsolete_files_) { + pending_purge_obsolete_files_ || + error_handler_.IsRecoveryInProgress()) { TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob"); bg_cv_.Wait(); } @@ -348,9 +426,12 @@ flush_scheduler_.Clear(); while (!flush_queue_.empty()) { - auto cfd = PopFirstFromFlushQueue(); - if (cfd->Unref()) { - delete cfd; + const FlushRequest& flush_req = PopFirstFromFlushQueue(); + for (const auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + if (cfd->Unref()) { + delete cfd; + } } } while (!compaction_queue_.empty()) { @@ -440,6 +521,17 @@ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete"); LogFlush(immutable_db_options_.info_log); +#ifndef ROCKSDB_LITE + // If the sst_file_manager was allocated by us during DB::Open(), ccall + // Close() on it before closing the info_log. 
Otherwise, background thread + // in SstFileManagerImpl might try to log something + if (immutable_db_options_.sst_file_manager && own_sfm_) { + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + sfm->Close(); + } +#endif // ROCKSDB_LITE + if (immutable_db_options_.info_log && own_info_log_) { Status s = immutable_db_options_.info_log->Close(); if (ret.ok()) { @@ -1047,7 +1139,7 @@ } else { CleanupSuperVersion(super_version); } - return NewErrorInternalIterator(s, arena); + return NewErrorInternalIterator(s, arena); } ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { @@ -1071,6 +1163,15 @@ auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + } + } + // Acquire SuperVersion SuperVersion* sv = GetAndRefSuperVersion(cfd); @@ -1609,8 +1710,8 @@ result = NewDBIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, cfd->user_comparator(), iter, kMaxSequenceNumber, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - read_callback); + sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, + this, cfd); #endif } else { // Note: no need to consider the special case of @@ -1677,9 +1778,8 @@ ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, read_callback, - ((read_options.snapshot != nullptr) ? nullptr : this), cfd, allow_blob, - allow_refresh); + sv->version_number, read_callback, this, cfd, allow_blob, + ((read_options.snapshot != nullptr) ? 
false : allow_refresh)); InternalIterator* internal_iter = NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), @@ -1716,7 +1816,7 @@ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, cfd->user_comparator(), iter, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, - read_callback)); + read_callback, this, cfd)); } #endif } else { @@ -2227,9 +2327,9 @@ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_context, - *cfd->GetLatestMutableCFOptions(), - FlushReason::kDeleteFiles); + InstallSuperVersionAndScheduleWork( + cfd, &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions(), FlushReason::kDeleteFiles); } FindObsoleteFiles(&job_context, false); } // lock released here @@ -2312,9 +2412,9 @@ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_context, - *cfd->GetLatestMutableCFOptions(), - FlushReason::kDeleteFiles); + InstallSuperVersionAndScheduleWork( + cfd, &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions(), FlushReason::kDeleteFiles); } for (auto* deleted_file : deleted_files) { deleted_file->being_compacted = false; @@ -2402,7 +2502,7 @@ if (!s.ok()) { return s; } - char* buffer = reinterpret_cast(alloca(file_size)); + char* buffer = reinterpret_cast(alloca(static_cast(file_size))); Slice id; s = id_file_reader->Read(static_cast(file_size), &id, buffer); if (!s.ok()) { @@ -2879,6 +2979,10 @@ ColumnFamilyHandle* column_family, const std::vector& external_files, const IngestExternalFileOptions& ingestion_options) { + if (external_files.empty()) { + return Status::InvalidArgument("external_files is empty"); + } + Status status; auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); @@ -2896,6 +3000,9 @@ immutable_db_options_, env_options_, &snapshots_, ingestion_options); + SuperVersionContext dummy_sv_ctx(/* create_superversion */ true); + VersionEdit dummy_edit; + uint64_t next_file_number = 0; std::list::iterator pending_output_elem; { InstrumentedMutexLock l(&mutex_); @@ -2906,10 +3013,29 @@ // Make sure that bg cleanup won't delete the files that we are ingesting pending_output_elem = CaptureCurrentFileNumberInPendingOutputs(); + + // If a crash happens after a hard link is established, the Recover function may + // reuse the file number that has already been assigned to the internal file, + // and this will overwrite the external file. To protect the external + // file, we have to make sure the file number will never be reused.
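
For context, a minimal caller-side sketch of the ingestion API whose internals are reworked above; the helper name and file path are placeholders, not part of the patch.

#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status IngestOneFile(rocksdb::DB* db) {
  rocksdb::IngestExternalFileOptions ifo;
  ifo.move_files = false;  // copy (or hard-link) the file into the DB

  // Placeholder path; any SST produced by SstFileWriter would do here.
  std::vector<std::string> files = {"/path/to/file1.sst"};

  // With the check added above, an empty `files` vector now fails fast with
  // Status::InvalidArgument("external_files is empty") instead of starting
  // an ingestion job with nothing to ingest.
  return db->IngestExternalFile(files, ifo);
}
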
+ next_file_number = versions_->FetchAddFileNumber(external_files.size()); + auto cf_options = cfd->GetLatestMutableCFOptions(); + status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, + directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); + } + } + dummy_sv_ctx.Clean(); + if (!status.ok()) { + InstrumentedMutexLock l(&mutex_); + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + return status; } SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); - status = ingestion_job.Prepare(external_files, super_version); + status = + ingestion_job.Prepare(external_files, next_file_number, super_version); CleanupSuperVersion(super_version); if (!status.ok()) { InstrumentedMutexLock l(&mutex_); @@ -3060,7 +3186,6 @@ void DBImpl::NotifyOnExternalFileIngested( ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) { -#ifndef ROCKSDB_LITE if (immutable_db_options_.listeners.empty()) { return; } @@ -3076,8 +3201,6 @@ listener->OnExternalFileIngested(this, info); } } - -#endif } void DBImpl::WaitForIngestFile() { @@ -3087,5 +3210,43 @@ } } +Status DBImpl::StartTrace(const TraceOptions& /* options */, + std::unique_ptr&& trace_writer) { + InstrumentedMutexLock lock(&trace_mutex_); + tracer_.reset(new Tracer(env_, std::move(trace_writer))); + return Status::OK(); +} + +Status DBImpl::EndTrace() { + InstrumentedMutexLock lock(&trace_mutex_); + Status s = tracer_->Close(); + tracer_.reset(); + return s; +} + +Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) { + Status s; + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + s = tracer_->IteratorSeek(cf_id, key); + } + } + return s; +} + +Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, + const Slice& key) { + Status s; + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + s = tracer_->IteratorSeekForPrev(cf_id, key); + } + } + return s; +} + #endif // ROCKSDB_LITE + } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/db_impl_compaction_flush.cc rocksdb-5.17.2/db/db_impl_compaction_flush.cc --- rocksdb-5.15.10/db/db_impl_compaction_flush.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl_compaction_flush.cc 2018-11-12 19:57:32.000000000 +0000 @@ -26,7 +26,7 @@ namespace rocksdb { bool DBImpl::EnoughRoomForCompaction( - const std::vector& inputs, + ColumnFamilyData* cfd, const std::vector& inputs, bool* sfm_reserved_compact_space, LogBuffer* log_buffer) { // Check if we have enough room to do the compaction bool enough_room = true; @@ -34,12 +34,17 @@ auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); if (sfm) { - enough_room = sfm->EnoughRoomForCompaction(inputs); + // Pass the current bg_error_ to SFM so it can decide what checks to + // perform. 
If this DB instance hasn't seen any error yet, the SFM can be + // optimistic and not do disk space checks + enough_room = + sfm->EnoughRoomForCompaction(cfd, inputs, error_handler_.GetBGError()); if (enough_room) { *sfm_reserved_compact_space = true; } } #else + (void)cfd; (void)inputs; (void)sfm_reserved_compact_space; #endif // ROCKSDB_LITE @@ -104,7 +109,8 @@ Status DBImpl::FlushMemTableToOutputFile( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - bool* made_progress, JobContext* job_context, LogBuffer* log_buffer) { + bool* made_progress, JobContext* job_context, + SuperVersionContext* superversion_context, LogBuffer* log_buffer) { mutex_.AssertHeld(); assert(cfd->imm()->NumNotFlushed() != 0); assert(cfd->imm()->IsFlushPending()); @@ -160,7 +166,7 @@ } if (s.ok()) { - InstallSuperVersionAndScheduleWork(cfd, &job_context->superversion_context, + InstallSuperVersionAndScheduleWork(cfd, superversion_context, mutable_cf_options); if (made_progress) { *made_progress = 1; @@ -200,6 +206,25 @@ return s; } +Status DBImpl::FlushMemTablesToOutputFiles( + const autovector& bg_flush_args, bool* made_progress, + JobContext* job_context, LogBuffer* log_buffer) { + Status s; + for (auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + const MutableCFOptions& mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + SuperVersionContext* superversion_context = arg.superversion_context_; + s = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress, + job_context, superversion_context, + log_buffer); + if (!s.ok()) { + break; + } + } + return s; +} + void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, int job_id, TableProperties prop) { @@ -230,8 +255,8 @@ info.job_id = job_id; info.triggered_writes_slowdown = triggered_writes_slowdown; info.triggered_writes_stop = triggered_writes_stop; - info.smallest_seqno = file_meta->smallest_seqno; - info.largest_seqno = file_meta->largest_seqno; + info.smallest_seqno = file_meta->fd.smallest_seqno; + info.largest_seqno = file_meta->fd.largest_seqno; info.table_properties = prop; info.flush_reason = cfd->GetFlushReason(); for (auto listener : immutable_db_options_.listeners) { @@ -281,8 +306,8 @@ info.job_id = job_id; info.triggered_writes_slowdown = triggered_writes_slowdown; info.triggered_writes_stop = triggered_writes_stop; - info.smallest_seqno = file_meta->smallest_seqno; - info.largest_seqno = file_meta->largest_seqno; + info.smallest_seqno = file_meta->fd.smallest_seqno; + info.largest_seqno = file_meta->fd.largest_seqno; info.table_properties = prop; info.flush_reason = cfd->GetFlushReason(); for (auto listener : immutable_db_options_.listeners) { @@ -324,60 +349,12 @@ CleanupSuperVersion(super_version); } - if (!options.allow_write_stall && flush_needed) { - InstrumentedMutexLock l(&mutex_); - uint64_t orig_active_memtable_id = cfd->mem()->GetID(); - WriteStallCondition write_stall_condition = WriteStallCondition::kNormal; - do { - if (write_stall_condition != WriteStallCondition::kNormal) { - TEST_SYNC_POINT("DBImpl::CompactRange:StallWait"); - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "[%s] CompactRange waiting on stall conditions to clear", - cfd->GetName().c_str()); - bg_cv_.Wait(); - } - if (cfd->IsDropped() || shutting_down_.load(std::memory_order_acquire)) { - return Status::ShutdownInProgress(); - } - - uint64_t earliest_memtable_id = - std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID()); - if 
(earliest_memtable_id > orig_active_memtable_id) { - // We waited so long that the memtable we were originally waiting on was - // flushed. - flush_needed = false; - break; - } - - const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions(); - const auto* vstorage = cfd->current()->storage_info(); - - // Skip stalling check if we're below auto-flush and auto-compaction - // triggers. If it stalled in these conditions, that'd mean the stall - // triggers are so low that stalling is needed for any background work. In - // that case we shouldn't wait since background work won't be scheduled. - if (cfd->imm()->NumNotFlushed() < - cfd->ioptions()->min_write_buffer_number_to_merge && - vstorage->l0_delay_trigger_count() < - mutable_cf_options.level0_file_num_compaction_trigger) { - break; - } - - // check whether one extra immutable memtable or an extra L0 file would - // cause write stalling mode to be entered. It could still enter stall - // mode due to pending compaction bytes, but that's less common - write_stall_condition = - ColumnFamilyData::GetWriteStallConditionAndCause( - cfd->imm()->NumNotFlushed() + 1, - vstorage->l0_delay_trigger_count() + 1, - vstorage->estimated_compaction_needed_bytes(), mutable_cf_options) - .first; - } while (write_stall_condition != WriteStallCondition::kNormal); - } - TEST_SYNC_POINT("DBImpl::CompactRange:StallWaitDone"); Status s; if (flush_needed) { - s = FlushMemTable(cfd, FlushOptions(), FlushReason::kManualCompaction); + FlushOptions fo; + fo.allow_write_stall = options.allow_write_stall; + s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction, + false /* writes_stopped*/); if (!s.ok()) { LogFlush(immutable_db_options_.info_log); return s; @@ -612,7 +589,7 @@ bool sfm_reserved_compact_space = false; // First check if we have enough room to do the compaction bool enough_room = EnoughRoomForCompaction( - input_files, &sfm_reserved_compact_space, log_buffer); + cfd, input_files, &sfm_reserved_compact_space, log_buffer); if (!enough_room) { // m's vars will get set properly at the end of this function, @@ -691,7 +668,7 @@ Status status = compaction_job.Install(*c->mutable_cf_options()); if (status.ok()) { InstallSuperVersionAndScheduleWork( - c->column_family_data(), &job_context->superversion_context, + c->column_family_data(), &job_context->superversion_contexts[0], *c->mutable_cf_options(), FlushReason::kManualCompaction); } c->ReleaseCompactionFiles(s); @@ -885,7 +862,7 @@ edit.DeleteFile(level, f->fd.GetNumber()); edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno, + f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, @@ -942,6 +919,68 @@ return s; } + +Status DBImpl::FlushAllCFs(FlushReason flush_reason) { + Status s; + WriteContext context; + WriteThread::Writer w; + + mutex_.AssertHeld(); + write_thread_.EnterUnbatched(&w, &mutex_); + + FlushRequest flush_req; + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->imm()->NumNotFlushed() == 0 && cfd->mem()->IsEmpty() && + cached_recoverable_state_empty_.load()) { + // Nothing to flush + continue; + } + + // SwitchMemtable() will release and reacquire mutex during execution + s = SwitchMemtable(cfd, &context); + if (!s.ok()) { + break; + } + + cfd->imm()->FlushRequested(); + + flush_req.emplace_back(cfd, cfd->imm()->GetLatestMemTableID()); + } + + // schedule flush + if (s.ok() && !flush_req.empty()) { + 
SchedulePendingFlush(flush_req, flush_reason); + MaybeScheduleFlushOrCompaction(); + } + + write_thread_.ExitUnbatched(&w); + + if (s.ok()) { + for (auto& flush : flush_req) { + auto cfd = flush.first; + auto flush_memtable_id = flush.second; + while (cfd->imm()->NumNotFlushed() > 0 && + cfd->imm()->GetEarliestMemTableID() <= flush_memtable_id) { + if (!error_handler_.GetRecoveryError().ok()) { + break; + } + if (cfd->IsDropped()) { + // FlushJob cannot flush a dropped CF, if we did not break here + // we will loop forever since cfd->imm()->NumNotFlushed() will never + // drop to zero + continue; + } + cfd->Ref(); + bg_cv_.Wait(); + cfd->Unref(); + } + } + } + + flush_req.clear(); + return s; +} + Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, int output_level, uint32_t output_path_id, uint32_t max_subcompactions, @@ -1077,63 +1116,164 @@ FlushReason flush_reason, bool writes_stopped) { Status s; uint64_t flush_memtable_id = 0; + if (!flush_options.allow_write_stall) { + bool flush_needed = true; + s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed); + TEST_SYNC_POINT("DBImpl::FlushMemTable:StallWaitDone"); + if (!s.ok() || !flush_needed) { + return s; + } + } + FlushRequest flush_req; { WriteContext context; InstrumentedMutexLock guard_lock(&mutex_); - if (cfd->imm()->NumNotFlushed() == 0 && cfd->mem()->IsEmpty() && - cached_recoverable_state_empty_.load()) { - // Nothing to flush - return Status::OK(); - } - WriteThread::Writer w; if (!writes_stopped) { write_thread_.EnterUnbatched(&w, &mutex_); } - // SwitchMemtable() will release and reacquire mutex during execution - s = SwitchMemtable(cfd, &context); - flush_memtable_id = cfd->imm()->GetLatestMemTableID(); + if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || + !cached_recoverable_state_empty_.load()) { + s = SwitchMemtable(cfd, &context); + flush_memtable_id = cfd->imm()->GetLatestMemTableID(); + flush_req.emplace_back(cfd, flush_memtable_id); + } + + if (s.ok() && !flush_req.empty()) { + for (auto& elem : flush_req) { + ColumnFamilyData* loop_cfd = elem.first; + loop_cfd->imm()->FlushRequested(); + } + SchedulePendingFlush(flush_req, flush_reason); + MaybeScheduleFlushOrCompaction(); + } if (!writes_stopped) { write_thread_.ExitUnbatched(&w); } - - cfd->imm()->FlushRequested(); - - // schedule flush - SchedulePendingFlush(cfd, flush_reason); - MaybeScheduleFlushOrCompaction(); } if (s.ok() && flush_options.wait) { - // Wait until the compaction completes - s = WaitForFlushMemTable(cfd, &flush_memtable_id); + autovector cfds; + autovector flush_memtable_ids; + for (auto& iter : flush_req) { + cfds.push_back(iter.first); + flush_memtable_ids.push_back(&(iter.second)); + } + s = WaitForFlushMemTables(cfds, flush_memtable_ids); } TEST_SYNC_POINT("FlushMemTableFinished"); return s; } -Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd, - const uint64_t* flush_memtable_id) { - Status s; +// Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can +// cause write stall, for example if one memtable is being flushed already. +// This method tries to avoid write stall (similar to CompactRange() behavior) +// it emulates how the SuperVersion / LSM would change if flush happens, checks +// it against various constrains and delays flush if it'd cause write stall. +// Called should check status and flush_needed to see if flush already happened. 
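
A small usage sketch of the FlushOptions::allow_write_stall flag that selects this path (the helper names are illustrative). Leaving the flag false routes DB::Flush() through the stall check documented above; setting it true skips the wait entirely, as the updated tests earlier in this patch do.

#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status FlushWithoutStalling(rocksdb::DB* db) {
  rocksdb::FlushOptions fo;
  fo.wait = true;
  // Default: false. FlushMemTable() then waits via
  // WaitUntilFlushWouldNotStallWrites() before switching memtables.
  fo.allow_write_stall = false;
  return db->Flush(fo);
}

rocksdb::Status FlushAllowingStall(rocksdb::DB* db) {
  rocksdb::FlushOptions fo;
  fo.wait = true;
  // Skips the stall check; the flush may contribute to a write stall.
  fo.allow_write_stall = true;
  return db->Flush(fo);
}
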
+Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, + bool* flush_needed) { + { + *flush_needed = true; + InstrumentedMutexLock l(&mutex_); + uint64_t orig_active_memtable_id = cfd->mem()->GetID(); + WriteStallCondition write_stall_condition = WriteStallCondition::kNormal; + do { + if (write_stall_condition != WriteStallCondition::kNormal) { + TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] WaitUntilFlushWouldNotStallWrites" + " waiting on stall conditions to clear", + cfd->GetName().c_str()); + bg_cv_.Wait(); + } + if (cfd->IsDropped() || shutting_down_.load(std::memory_order_acquire)) { + return Status::ShutdownInProgress(); + } + + uint64_t earliest_memtable_id = + std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID()); + if (earliest_memtable_id > orig_active_memtable_id) { + // We waited so long that the memtable we were originally waiting on was + // flushed. + *flush_needed = false; + return Status::OK(); + } + + const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + const auto* vstorage = cfd->current()->storage_info(); + + // Skip stalling check if we're below auto-flush and auto-compaction + // triggers. If it stalled in these conditions, that'd mean the stall + // triggers are so low that stalling is needed for any background work. In + // that case we shouldn't wait since background work won't be scheduled. + if (cfd->imm()->NumNotFlushed() < + cfd->ioptions()->min_write_buffer_number_to_merge && + vstorage->l0_delay_trigger_count() < + mutable_cf_options.level0_file_num_compaction_trigger) { + break; + } + + // check whether one extra immutable memtable or an extra L0 file would + // cause write stalling mode to be entered. It could still enter stall + // mode due to pending compaction bytes, but that's less common + write_stall_condition = + ColumnFamilyData::GetWriteStallConditionAndCause( + cfd->imm()->NumNotFlushed() + 1, + vstorage->l0_delay_trigger_count() + 1, + vstorage->estimated_compaction_needed_bytes(), mutable_cf_options) + .first; + } while (write_stall_condition != WriteStallCondition::kNormal); + } + return Status::OK(); +} + +// Wait for memtables to be flushed for multiple column families. +// let N = cfds.size() +// for i in [0, N), +// 1) if flush_memtable_ids[i] is not null, then the memtables with lower IDs +// have to be flushed for THIS column family; +// 2) if flush_memtable_ids[i] is null, then all memtables in THIS column +// family have to be flushed. +// Finish waiting when ALL column families finish flushing memtables. +Status DBImpl::WaitForFlushMemTables( + const autovector& cfds, + const autovector& flush_memtable_ids) { + int num = static_cast(cfds.size()); // Wait until the compaction completes InstrumentedMutexLock l(&mutex_); - while (cfd->imm()->NumNotFlushed() > 0 && !error_handler_.IsDBStopped() && - (flush_memtable_id == nullptr || - cfd->imm()->GetEarliestMemTableID() <= *flush_memtable_id)) { + while (!error_handler_.IsDBStopped()) { if (shutting_down_.load(std::memory_order_acquire)) { return Status::ShutdownInProgress(); } - if (cfd->IsDropped()) { - // FlushJob cannot flush a dropped CF, if we did not break here - // we will loop forever since cfd->imm()->NumNotFlushed() will never - // drop to zero + // Number of column families that have been dropped. + int num_dropped = 0; + // Number of column families that have finished flush. 
+ int num_finished = 0; + for (int i = 0; i < num; ++i) { + if (cfds[i]->IsDropped()) { + ++num_dropped; + } else if (cfds[i]->imm()->NumNotFlushed() == 0 || + (flush_memtable_ids[i] != nullptr && + cfds[i]->imm()->GetEarliestMemTableID() > + *flush_memtable_ids[i])) { + ++num_finished; + } + } + if (1 == num_dropped && 1 == num) { return Status::InvalidArgument("Cannot flush a dropped CF"); } + // Column families involved in this flush request have either been dropped + // or finished flush. Then it's time to finish waiting. + if (num_dropped + num_finished == num) { + break; + } bg_cv_.Wait(); } + Status s; if (error_handler_.IsDBStopped()) { s = error_handler_.GetBGError(); } @@ -1163,6 +1303,12 @@ if (bg_work_paused_ > 0) { // we paused the background work return; + } else if (error_handler_.IsBGWorkStopped() && + !error_handler_.IsRecoveryInProgress()) { + // There has been a hard error and this call is not part of the recovery + // sequence. Bail out here so we don't get into an endless loop of + // scheduling BG work which will again call this function + return; } else if (shutting_down_.load(std::memory_order_acquire)) { // DB is being deleted; no more background compactions return; @@ -1172,7 +1318,6 @@ env_->GetBackgroundThreads(Env::Priority::HIGH) == 0; while (!is_flush_pool_empty && unscheduled_flushes_ > 0 && bg_flush_scheduled_ < bg_job_limits.max_flushes) { - unscheduled_flushes_--; bg_flush_scheduled_++; env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this); } @@ -1183,7 +1328,6 @@ while (unscheduled_flushes_ > 0 && bg_flush_scheduled_ + bg_compaction_scheduled_ < bg_job_limits.max_flushes) { - unscheduled_flushes_--; bg_flush_scheduled_++; env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::LOW, this); } @@ -1192,6 +1336,12 @@ if (bg_compaction_paused_ > 0) { // we paused the background compaction return; + } else if (error_handler_.IsBGWorkStopped()) { + // Compaction is not part of the recovery sequence from a hard error. We + // might get here because recovery might do a flush and install a new + // super version, which will try to schedule pending compactions. Bail + // out here and let the higher level recovery handle compactions + return; } if (HasExclusiveManualCompaction()) { @@ -1260,30 +1410,28 @@ return cfd; } -void DBImpl::AddToFlushQueue(ColumnFamilyData* cfd, FlushReason flush_reason) { - assert(!cfd->queued_for_flush()); - cfd->Ref(); - flush_queue_.push_back(cfd); - cfd->set_queued_for_flush(true); - cfd->SetFlushReason(flush_reason); -} - -ColumnFamilyData* DBImpl::PopFirstFromFlushQueue() { +DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() { assert(!flush_queue_.empty()); - auto cfd = *flush_queue_.begin(); + FlushRequest flush_req = flush_queue_.front(); + assert(unscheduled_flushes_ >= static_cast(flush_req.size())); + unscheduled_flushes_ -= static_cast(flush_req.size()); flush_queue_.pop_front(); - assert(cfd->queued_for_flush()); - cfd->set_queued_for_flush(false); // TODO: need to unset flush reason? 
- return cfd; + return flush_req; } -void DBImpl::SchedulePendingFlush(ColumnFamilyData* cfd, +void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req, FlushReason flush_reason) { - if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) { - AddToFlushQueue(cfd, flush_reason); - ++unscheduled_flushes_; + if (flush_req.empty()) { + return; } + for (auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + cfd->Ref(); + cfd->SetFlushReason(flush_reason); + } + unscheduled_flushes_ += static_cast(flush_req.size()); + flush_queue_.push_back(flush_req); } void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { @@ -1351,15 +1499,18 @@ } Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, - LogBuffer* log_buffer) { + LogBuffer* log_buffer, FlushReason* reason) { mutex_.AssertHeld(); Status status; + *reason = FlushReason::kOthers; + // If BG work is stopped due to an error, but a recovery is in progress, + // that means this flush is part of the recovery. So allow it to go through if (!error_handler_.IsBGWorkStopped()) { if (shutting_down_.load(std::memory_order_acquire)) { status = Status::ShutdownInProgress(); } - } else { + } else if (!error_handler_.IsRecoveryInProgress()) { status = error_handler_.GetBGError(); } @@ -1367,40 +1518,58 @@ return status; } - ColumnFamilyData* cfd = nullptr; + autovector bg_flush_args; + std::vector& superversion_contexts = + job_context->superversion_contexts; while (!flush_queue_.empty()) { // This cfd is already referenced - auto first_cfd = PopFirstFromFlushQueue(); - - if (first_cfd->IsDropped() || !first_cfd->imm()->IsFlushPending()) { - // can't flush this CF, try next one - if (first_cfd->Unref()) { - delete first_cfd; + const FlushRequest& flush_req = PopFirstFromFlushQueue(); + superversion_contexts.clear(); + superversion_contexts.reserve(flush_req.size()); + + for (const auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) { + // can't flush this CF, try next one + if (cfd->Unref()) { + delete cfd; + } + continue; } - continue; + superversion_contexts.emplace_back(SuperVersionContext(true)); + bg_flush_args.emplace_back(cfd, iter.second, + &(superversion_contexts.back())); + } + if (!bg_flush_args.empty()) { + break; } - - // found a flush! 
- cfd = first_cfd; - break; } - if (cfd != nullptr) { - const MutableCFOptions mutable_cf_options = - *cfd->GetLatestMutableCFOptions(); + if (!bg_flush_args.empty()) { auto bg_job_limits = GetBGJobLimits(); - ROCKS_LOG_BUFFER( - log_buffer, - "Calling FlushMemTableToOutputFile with column " - "family [%s], flush slots available %d, compaction slots available %d, " - "flush slots scheduled %d, compaction slots scheduled %d", - cfd->GetName().c_str(), bg_job_limits.max_flushes, - bg_job_limits.max_compactions, bg_flush_scheduled_, - bg_compaction_scheduled_); - status = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress, - job_context, log_buffer); - if (cfd->Unref()) { - delete cfd; + for (const auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + ROCKS_LOG_BUFFER( + log_buffer, + "Calling FlushMemTableToOutputFile with column " + "family [%s], flush slots available %d, compaction slots available " + "%d, " + "flush slots scheduled %d, compaction slots scheduled %d", + cfd->GetName().c_str(), bg_job_limits.max_flushes, + bg_job_limits.max_compactions, bg_flush_scheduled_, + bg_compaction_scheduled_); + } + status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress, + job_context, log_buffer); + // All the CFDs in the FlushReq must have the same flush reason, so just + // grab the first one + *reason = bg_flush_args[0].cfd_->GetFlushReason(); + for (auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + if (cfd->Unref()) { + delete cfd; + arg.cfd_ = nullptr; + } } } return status; @@ -1421,9 +1590,12 @@ auto pending_outputs_inserted_elem = CaptureCurrentFileNumberInPendingOutputs(); + FlushReason reason; - Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer); - if (!s.ok() && !s.IsShutdownInProgress()) { + Status s = + BackgroundFlush(&made_progress, &job_context, &log_buffer, &reason); + if (!s.ok() && !s.IsShutdownInProgress() && + reason != FlushReason::kErrorRecovery) { // Wait a little bit before retrying background flush in // case this is an environmental problem and we do not want to // chew up resources for failed flushes for the duration of @@ -1613,6 +1785,11 @@ } } else { status = error_handler_.GetBGError(); + // If we get here, it means a hard error happened after this compaction + // was scheduled by MaybeScheduleFlushOrCompaction(), but before it got + // a chance to execute. 
Since we didn't pop a cfd from the compaction + // queue, increment unscheduled_compactions_ + unscheduled_compactions_++; } if (!status.ok()) { @@ -1648,7 +1825,7 @@ } else { // First check if we have enough room to do the compaction bool enough_room = EnoughRoomForCompaction( - *(c->inputs()), &sfm_reserved_compact_space, log_buffer); + m->cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer); if (!enough_room) { // Then don't do the compaction @@ -1711,7 +1888,7 @@ if (c != nullptr) { bool enough_room = EnoughRoomForCompaction( - *(c->inputs()), &sfm_reserved_compact_space, log_buffer); + cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer); if (!enough_room) { // Then don't do the compaction @@ -1775,7 +1952,7 @@ *c->mutable_cf_options(), c->edit(), &mutex_, directories_.GetDbDir()); InstallSuperVersionAndScheduleWork( - c->column_family_data(), &job_context->superversion_context, + c->column_family_data(), &job_context->superversion_contexts[0], *c->mutable_cf_options(), FlushReason::kAutoCompaction); ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n", c->column_family_data()->GetName().c_str(), @@ -1804,8 +1981,8 @@ c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, - f->largest, f->smallest_seqno, f->largest_seqno, - f->marked_for_compaction); + f->largest, f->fd.smallest_seqno, + f->fd.largest_seqno, f->marked_for_compaction); ROCKS_LOG_BUFFER( log_buffer, @@ -1822,7 +1999,7 @@ &mutex_, directories_.GetDbDir()); // Use latest MutableCFOptions InstallSuperVersionAndScheduleWork( - c->column_family_data(), &job_context->superversion_context, + c->column_family_data(), &job_context->superversion_contexts[0], *c->mutable_cf_options(), FlushReason::kAutoCompaction); VersionStorageInfo::LevelSummaryStorage tmp; @@ -1899,7 +2076,7 @@ status = compaction_job.Install(*c->mutable_cf_options()); if (status.ok()) { InstallSuperVersionAndScheduleWork( - c->column_family_data(), &job_context->superversion_context, + c->column_family_data(), &job_context->superversion_contexts[0], *c->mutable_cf_options(), FlushReason::kAutoCompaction); } *made_progress = true; @@ -1920,8 +2097,6 @@ NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); } - // this will unref its input_version and column_family_data - c.reset(); if (status.ok() || status.IsCompactionTooLarge()) { // Done @@ -1931,7 +2106,26 @@ ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", status.ToString().c_str()); error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) { + // Put this cfd back in the compaction queue so we can retry after some + // time + auto cfd = c->column_family_data(); + assert(cfd != nullptr); + // Since this compaction failed, we need to recompute the score so it + // takes the original input files into account + c->column_family_data() + ->current() + ->storage_info() + ->ComputeCompactionScore(*(c->immutable_cf_options()), + *(c->mutable_cf_options())); + if (!cfd->queued_for_compaction()) { + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + } + } } + // this will unref its input_version and column_family_data + c.reset(); if (is_manual) { ManualCompactionState* m = manual_compaction; @@ -2080,7 +2274,10 @@ void DBImpl::InstallSuperVersionAndScheduleWork( ColumnFamilyData* cfd, SuperVersionContext* sv_context, 
- const MutableCFOptions& mutable_cf_options, FlushReason flush_reason) { + const MutableCFOptions& mutable_cf_options, + FlushReason /* flush_reason */) { + // TODO(yanqin) investigate if 'flush_reason' can be removed since it's not + // used. mutex_.AssertHeld(); // Update max_total_in_memory_state_ @@ -2091,14 +2288,14 @@ old_sv->mutable_cf_options.max_write_buffer_number; } - if (sv_context->new_superversion == nullptr) { + // this branch is unlikely to step in + if (UNLIKELY(sv_context->new_superversion == nullptr)) { sv_context->NewSuperVersion(); } cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options); // Whenever we install new SuperVersion, we might need to issue new flushes or // compactions. - SchedulePendingFlush(cfd, flush_reason); SchedulePendingCompaction(cfd); MaybeScheduleFlushOrCompaction(); diff -Nru rocksdb-5.15.10/db/db_impl_debug.cc rocksdb-5.17.2/db/db_impl_debug.cc --- rocksdb-5.15.10/db/db_impl_debug.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl_debug.cc 2018-11-12 19:57:32.000000000 +0000 @@ -100,9 +100,11 @@ return SwitchMemtable(cfd, &write_context); } -Status DBImpl::TEST_FlushMemTable(bool wait, ColumnFamilyHandle* cfh) { +Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall, + ColumnFamilyHandle* cfh) { FlushOptions fo; fo.wait = wait; + fo.allow_write_stall = allow_write_stall; ColumnFamilyData* cfd; if (cfh == nullptr) { cfd = default_cf_handle_->cfd(); @@ -135,7 +137,7 @@ while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || bg_flush_scheduled_ || (wait_unscheduled && unscheduled_compactions_)) && - !error_handler_.IsDBStopped()) { + (error_handler_.GetBGError() == Status::OK())) { bg_cv_.Wait(); } return error_handler_.GetBGError(); @@ -235,5 +237,11 @@ } } +size_t DBImpl::TEST_GetWalPreallocateBlockSize( + uint64_t write_buffer_size) const { + InstrumentedMutexLock l(&mutex_); + return GetWalPreallocateBlockSize(write_buffer_size); +} + } // namespace rocksdb #endif // NDEBUG diff -Nru rocksdb-5.15.10/db/db_impl_experimental.cc rocksdb-5.17.2/db/db_impl_experimental.cc --- rocksdb-5.15.10/db/db_impl_experimental.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl_experimental.cc 2018-11-12 19:57:32.000000000 +0000 @@ -131,16 +131,16 @@ edit.DeleteFile(0, f->fd.GetNumber()); edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno, + f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWork( - cfd, &job_context.superversion_context, - *cfd->GetLatestMutableCFOptions()); + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); } } // lock released here LogFlush(immutable_db_options_.info_log); diff -Nru rocksdb-5.15.10/db/db_impl.h rocksdb-5.17.2/db/db_impl.h --- rocksdb-5.15.10/db/db_impl.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl.h 2018-11-12 19:57:32.000000000 +0000 @@ -22,9 +22,9 @@ #include "db/column_family.h" #include "db/compaction_job.h" #include "db/dbformat.h" -#include "db/external_sst_file_ingestion_job.h" #include "db/error_handler.h" #include "db/event_helpers.h" +#include "db/external_sst_file_ingestion_job.h" #include "db/flush_job.h" #include "db/flush_scheduler.h" #include "db/internal_stats.h" @@ 
-46,6 +46,7 @@ #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/status.h" +#include "rocksdb/trace_reader_writer.h" #include "rocksdb/transaction_log.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" @@ -54,6 +55,7 @@ #include "util/hash.h" #include "util/stop_watch.h" #include "util/thread_local.h" +#include "util/trace_replay.h" namespace rocksdb { @@ -333,6 +335,15 @@ virtual Status VerifyChecksum() override; + using DB::StartTrace; + virtual Status StartTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndTrace; + virtual Status EndTrace() override; + Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key); + Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); #endif // ROCKSDB_LITE // Similar to GetSnapshot(), but also lets the db know that this snapshot @@ -385,7 +396,7 @@ Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); // Force current memtable contents to be flushed. - Status TEST_FlushMemTable(bool wait = true, + Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, ColumnFamilyHandle* cfh = nullptr); // Wait for memtable compaction @@ -453,6 +464,7 @@ int TEST_BGCompactionsAllowed() const; int TEST_BGFlushesAllowed() const; + size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; #endif // NDEBUG @@ -697,6 +709,8 @@ Statistics* stats_; std::unordered_map recovered_transactions_; + std::unique_ptr tracer_; + InstrumentedMutex trace_mutex_; // Except in DB::Open(), WriteOptionsFile can only be called when: // Persist options to options file. @@ -782,6 +796,7 @@ private: friend class DB; + friend class ErrorHandler; friend class InternalStats; friend class PessimisticTransaction; friend class TransactionBaseImpl; @@ -790,18 +805,21 @@ friend class WritePreparedTxnDB; friend class WriteBatchWithIndex; friend class WriteUnpreparedTxnDB; + friend class WriteUnpreparedTxn; + #ifndef ROCKSDB_LITE friend class ForwardIterator; #endif friend struct SuperVersion; friend class CompactedDBImpl; friend class DBTest_ConcurrentFlushWAL_Test; + friend class DBTest_MixedSlowdownOptionsStop_Test; #ifndef NDEBUG friend class DBTest2_ReadCallbackTest_Test; friend class WriteCallbackTest_WriteWithCallbackTest_Test; friend class XFTransactionWriteHandler; friend class DBBlobIndexTest; - friend class WriteUnpreparedTransactionTest_RecoveryRollbackUnprepared_Test; + friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; #endif struct CompactionState; @@ -830,6 +848,8 @@ bool read_only = false, bool error_if_log_file_exist = false, bool error_if_data_exists_in_logs = false); + Status ResumeImpl(); + void MaybeIgnoreError(Status* s) const; const Status CreateArchivalDirectory(); @@ -869,12 +889,41 @@ Status SyncClosedLogs(JobContext* job_context); // Flush the in-memory write buffer to storage. Switches to a new - // log-file/memtable and writes a new descriptor iff successful. + // log-file/memtable and writes a new descriptor iff successful. Then + // installs a new super version for the column family. Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, bool* madeProgress, JobContext* job_context, + SuperVersionContext* superversion_context, LogBuffer* log_buffer); + // Argument required by background flush thread. 
+ struct BGFlushArg { + BGFlushArg() + : cfd_(nullptr), memtable_id_(0), superversion_context_(nullptr) {} + BGFlushArg(ColumnFamilyData* cfd, uint64_t memtable_id, + SuperVersionContext* superversion_context) + : cfd_(cfd), + memtable_id_(memtable_id), + superversion_context_(superversion_context) {} + + // Column family to flush. + ColumnFamilyData* cfd_; + // Maximum ID of memtable to flush. In this column family, memtables with + // IDs smaller than this value must be flushed before this flush completes. + uint64_t memtable_id_; + // Pointer to a SuperVersionContext object. After flush completes, RocksDB + // installs a new superversion for the column family. This operation + // requires a SuperVersionContext object (currently embedded in JobContext). + SuperVersionContext* superversion_context_; + }; + + // Flush the memtables of (multiple) column families to multiple files on + // persistent storage. + Status FlushMemTablesToOutputFiles( + const autovector& bg_flush_args, bool* made_progress, + JobContext* job_context, LogBuffer* log_buffer); + // REQUIRES: log_numbers are sorted in ascending order Status RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, bool read_only); @@ -887,6 +936,12 @@ Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); + // Restore alive_log_files_ and total_log_size_ after recovery. + // It needs to run only when there's no flush during recovery + // (e.g. avoid_flush_during_recovery=true). May also trigger flush + // in case total_log_size > max_total_wal_size. + Status RestoreAliveLogFiles(const std::vector& log_numbers); + // num_bytes: for slowdown case, delay time is calculated based on // `num_bytes` going through. Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options); @@ -896,19 +951,28 @@ Status ScheduleFlushes(WriteContext* context); - Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, - FlushReason flush_reason = FlushReason::kOthers); + Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context); // Force current memtable contents to be flushed. Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options, FlushReason flush_reason, bool writes_stopped = false); + // Wait until flushing this column family won't stall writes + Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, + bool* flush_needed); + // Wait for memtable flushed. // If flush_memtable_id is non-null, wait until the memtable with the ID // gets flush. Otherwise, wait until the column family don't have any // memtable pending flush. Status WaitForFlushMemTable(ColumnFamilyData* cfd, - const uint64_t* flush_memtable_id = nullptr); + const uint64_t* flush_memtable_id = nullptr) { + return WaitForFlushMemTables({cfd}, {flush_memtable_id}); + } + // Wait for memtables to be flushed for multiple column families. + Status WaitForFlushMemTables( + const autovector& cfds, + const autovector& flush_memtable_ids); // REQUIRES: mutex locked Status SwitchWAL(WriteContext* write_context); @@ -964,7 +1028,17 @@ ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name); void MaybeScheduleFlushOrCompaction(); - void SchedulePendingFlush(ColumnFamilyData* cfd, FlushReason flush_reason); + + // A flush request specifies the column families to flush as well as the + // largest memtable id to persist for each column family. 
Once all the + // memtables whose IDs are smaller than or equal to this per-column-family + // specified value, this flush request is considered to have completed its + // work of flushing this column family. After completing the work for all + // column families in this request, this flush is considered complete. + typedef std::vector> FlushRequest; + + void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason); + void SchedulePendingCompaction(ColumnFamilyData* cfd); void SchedulePendingPurge(std::string fname, std::string dir_to_sync, FileType type, uint64_t number, int job_id); @@ -983,9 +1057,10 @@ LogBuffer* log_buffer, PrepickedCompaction* prepicked_compaction); Status BackgroundFlush(bool* madeProgress, JobContext* job_context, - LogBuffer* log_buffer); + LogBuffer* log_buffer, FlushReason* reason); - bool EnoughRoomForCompaction(const std::vector& inputs, + bool EnoughRoomForCompaction(ColumnFamilyData* cfd, + const std::vector& inputs, bool* sfm_bookkeeping, LogBuffer* log_buffer); void PrintStatistics(); @@ -1006,8 +1081,7 @@ // helper functions for adding and removing from flush & compaction queues void AddToCompactionQueue(ColumnFamilyData* cfd); ColumnFamilyData* PopFirstFromCompactionQueue(); - void AddToFlushQueue(ColumnFamilyData* cfd, FlushReason flush_reason); - ColumnFamilyData* PopFirstFromFlushQueue(); + FlushRequest PopFirstFromFlushQueue(); // helper function to call after some of the logs_ were synced void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status); @@ -1020,6 +1094,10 @@ Status CloseHelper(); + Status FlushAllCFs(FlushReason flush_reason); + + void WaitForBackgroundWork(); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; @@ -1240,7 +1318,7 @@ // in MaybeScheduleFlushOrCompaction() // invariant(column family present in flush_queue_ <==> // ColumnFamilyData::pending_flush_ == true) - std::deque flush_queue_; + std::deque flush_queue_; // invariant(column family present in compaction_queue_ <==> // ColumnFamilyData::pending_compaction_ == true) std::deque compaction_queue_; @@ -1464,6 +1542,16 @@ // flush/compaction and if it is not provided vis SnapshotChecker, we should // disable gc to be safe. const bool use_custom_gc_; + // Flag to indicate that the DB instance shutdown has been initiated. This + // different from shutting_down_ atomic in that it is set at the beginning + // of shutdown sequence, specifically in order to prevent any background + // error recovery from going on in parallel. The latter, shutting_down_, + // is set a little later during the shutdown after scheduling memtable + // flushes + bool shutdown_initiated_; + // Flag to indicate whether sst_file_manager object was allocated in + // DB::Open() or passed to us + bool own_sfm_; // Clients must periodically call SetPreserveDeletesSequenceNumber() // to advance this seqnum. Default value is 0 which means ALL deletes are diff -Nru rocksdb-5.15.10/db/db_impl_open.cc rocksdb-5.17.2/db/db_impl_open.cc --- rocksdb-5.15.10/db/db_impl_open.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl_open.cc 2018-11-12 19:57:32.000000000 +0000 @@ -134,8 +134,15 @@ for (size_t i = 0; i < result.db_paths.size(); i++) { DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path); } -#endif + // Create a default SstFileManager for purposes of tracking compaction size + // and facilitating recovery from out of space errors. 
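
A short sketch of supplying an SstFileManager explicitly through DBOptions; when the option is left unset, the code that follows now creates a default one during DB::Open() (tracked by own_sfm_). The function name here is illustrative.

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

rocksdb::Options MakeOptionsWithExplicitSfm() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Explicitly provided SFM; it will be used for compaction-size tracking
  // and out-of-space recovery instead of the internally created default.
  options.sst_file_manager.reset(
      rocksdb::NewSstFileManager(rocksdb::Env::Default(), options.info_log));
  return options;
}
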
+ if (result.sst_file_manager.get() == nullptr) { + std::shared_ptr sst_file_manager( + NewSstFileManager(result.env, result.info_log)); + result.sst_file_manager = sst_file_manager; + } +#endif return result; } @@ -232,7 +239,7 @@ file->SetPreallocationBlockSize( immutable_db_options_.manifest_preallocation_size); unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options)); + new WritableFileWriter(std::move(file), manifest, env_options)); log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); @@ -361,7 +368,7 @@ s = env_->NewRandomAccessFile(IdentityFileName(dbname_), &idfile, customized_env); if (!s.ok()) { - const char* error_msg = s.ToString().c_str(); + std::string error_str = s.ToString(); // Check if unsupported Direct I/O is the root cause customized_env.use_direct_reads = false; s = env_->NewRandomAccessFile(IdentityFileName(dbname_), &idfile, @@ -371,7 +378,7 @@ "Direct I/O is not supported by the specified DB."); } else { return Status::InvalidArgument( - "Found options incompatible with filesystem", error_msg); + "Found options incompatible with filesystem", error_str.c_str()); } } } @@ -389,6 +396,16 @@ } } } + + // Initial max_total_in_memory_state_ before recovery logs. Log recovery + // may check this value to decide whether to flush. + max_total_in_memory_state_ = 0; + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + } + if (s.ok()) { SequenceNumber next_sequence(kMaxSequenceNumber); default_cf_handle_ = new ColumnFamilyHandleImpl( @@ -461,14 +478,6 @@ } } - // Initial value - max_total_in_memory_state_ = 0; - for (auto cfd : *versions_->GetColumnFamilySet()) { - auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); - max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * - mutable_cf_options->max_write_buffer_number; - } - return s; } @@ -598,8 +607,7 @@ // to be skipped instead of propagating bad information (like overly // large sequence numbers). 
log::Reader reader(immutable_db_options_.info_log, std::move(file_reader), - &reporter, true /*checksum*/, 0 /*initial_offset*/, - log_number); + &reporter, true /*checksum*/, log_number); // Determine if we should tolerate incomplete records at the tail end of the // Read all the records and add to a memtable @@ -879,18 +887,8 @@ } } - if (data_seen && !flushed) { - // Mark these as alive so they'll be considered for deletion later by - // FindObsoleteFiles() - if (two_write_queues_) { - log_write_mutex_.Lock(); - } - for (auto log_number : log_numbers) { - alive_log_files_.push_back(LogFileNumberSize(log_number)); - } - if (two_write_queues_) { - log_write_mutex_.Unlock(); - } + if (status.ok() && data_seen && !flushed) { + status = RestoreAliveLogFiles(log_numbers); } event_logger_.Log() << "job" << job_id << "event" @@ -899,6 +897,60 @@ return status; } +Status DBImpl::RestoreAliveLogFiles(const std::vector& log_numbers) { + if (log_numbers.empty()) { + return Status::OK(); + } + Status s; + mutex_.AssertHeld(); + assert(immutable_db_options_.avoid_flush_during_recovery); + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + // Mark these as alive so they'll be considered for deletion later by + // FindObsoleteFiles() + total_log_size_ = 0; + log_empty_ = false; + for (auto log_number : log_numbers) { + LogFileNumberSize log(log_number); + std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + // This gets the appear size of the logs, not including preallocated space. + s = env_->GetFileSize(fname, &log.size); + if (!s.ok()) { + break; + } + total_log_size_ += log.size; + alive_log_files_.push_back(log); + // We preallocate space for logs, but then after a crash and restart, those + // preallocated space are not needed anymore. It is likely only the last + // log has such preallocated space, so we only truncate for the last log. + if (log_number == log_numbers.back()) { + std::unique_ptr last_log; + Status truncate_status = env_->ReopenWritableFile( + fname, &last_log, + env_->OptimizeForLogWrite( + env_options_, + BuildDBOptions(immutable_db_options_, mutable_db_options_))); + if (truncate_status.ok()) { + truncate_status = last_log->Truncate(log.size); + } + if (truncate_status.ok()) { + truncate_status = last_log->Close(); + } + // Not a critical error if fail to truncate. 
+ if (!truncate_status.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to truncate log #%" PRIu64 ": %s", log_number, + truncate_status.ToString().c_str()); + } + } + } + if (two_write_queues_) { + log_write_mutex_.Unlock(); + } + return s; +} + Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit) { mutex_.AssertHeld(); @@ -969,7 +1021,7 @@ if (s.ok() && meta.fd.GetFileSize() > 0) { edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), meta.smallest, meta.largest, - meta.smallest_seqno, meta.largest_seqno, + meta.fd.smallest_seqno, meta.fd.largest_seqno, meta.marked_for_compaction); } @@ -1051,6 +1103,12 @@ break; } } + + // For recovery from NoSpace() error, we can only handle + // the case where the database is stored in a single path + if (paths.size() <= 1) { + impl->error_handler_.EnableAutoRecovery(); + } } if (!s.ok()) { @@ -1075,10 +1133,10 @@ impl->immutable_db_options_.env->OptimizeForLogWrite( soptions, BuildDBOptions(impl->immutable_db_options_, impl->mutable_db_options_)); - s = NewWritableFile( - impl->immutable_db_options_.env, - LogFileName(impl->immutable_db_options_.wal_dir, new_log_number), - &lfile, opt_env_options); + std::string log_fname = + LogFileName(impl->immutable_db_options_.wal_dir, new_log_number); + s = NewWritableFile(impl->immutable_db_options_.env, log_fname, &lfile, + opt_env_options); if (s.ok()) { lfile->SetWriteLifeTimeHint(write_hint); lfile->SetPreallocationBlockSize( @@ -1086,8 +1144,8 @@ { InstrumentedMutexLock wl(&impl->log_write_mutex_); impl->logfile_number_ = new_log_number; - unique_ptr file_writer( - new WritableFileWriter(std::move(lfile), opt_env_options)); + unique_ptr file_writer(new WritableFileWriter( + std::move(lfile), log_fname, opt_env_options)); impl->logs_.emplace_back( new_log_number, new log::Writer( @@ -1214,6 +1272,14 @@ } } } + + // Reserve some disk buffer space. This is a heuristic - when we run out + // of disk space, this ensures that there is atleast write_buffer_size + // amount of free space before we resume DB writes. In low disk space + // conditions, we want to avoid a lot of small L0 files due to frequent + // WAL write failures and resultant forced flushes + sfm->ReserveDiskBuffer(max_write_buffer_size, + impl->immutable_db_options_.db_paths[0].path); } #endif // !ROCKSDB_LITE diff -Nru rocksdb-5.15.10/db/db_impl_readonly.cc rocksdb-5.17.2/db/db_impl_readonly.cc --- rocksdb-5.15.10/db/db_impl_readonly.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl_readonly.cc 2018-11-12 19:57:32.000000000 +0000 @@ -31,22 +31,38 @@ ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { assert(pinnable_val != nullptr); + // TODO: stopwatch DB_GET needed?, perf timer needed? 
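
A small sketch of how the statistics tickers added to the read-only Get() path below surface to callers; the database path, key, and function name are placeholders.

#include <cstdint>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

void ReadOnlyGetWithStats(const std::string& db_path) {
  rocksdb::Options options;
  options.statistics = rocksdb::CreateDBStatistics();

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::OpenForReadOnly(options, db_path, &db);
  if (!s.ok()) {
    return;
  }

  std::string value;
  db->Get(rocksdb::ReadOptions(), "some_key", &value);  // placeholder key

  // NUMBER_KEYS_READ / BYTES_READ / MEMTABLE_HIT are now recorded for
  // read-only instances as well, per the hunk below.
  uint64_t keys_read =
      options.statistics->getTickerCount(rocksdb::NUMBER_KEYS_READ);
  (void)keys_read;

  delete db;
}
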
+ PERF_TIMER_GUARD(get_snapshot_time); Status s; SequenceNumber snapshot = versions_->LastSequence(); auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + } + } SuperVersion* super_version = cfd->GetSuperVersion(); MergeContext merge_context; RangeDelAggregator range_del_agg(cfd->internal_comparator(), snapshot); LookupKey lkey(key, snapshot); + PERF_TIMER_STOP(get_snapshot_time); if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, &range_del_agg, read_options)) { pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); } else { PERF_TIMER_GUARD(get_from_output_files_time); super_version->current->Get(read_options, lkey, pinnable_val, &s, &merge_context, &range_del_agg); + RecordTick(stats_, MEMTABLE_MISS); } + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + MeasureTime(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); return s; } diff -Nru rocksdb-5.15.10/db/db_impl_write.cc rocksdb-5.17.2/db/db_impl_write.cc --- rocksdb-5.15.10/db/db_impl_write.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl_write.cc 2018-11-12 19:57:32.000000000 +0000 @@ -76,6 +76,12 @@ if (my_batch == nullptr) { return Status::Corruption("Batch is nullptr!"); } + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Write(my_batch); + } + } if (write_options.sync && write_options.disableWAL) { return Status::InvalidArgument("Sync writes has to enable WAL."); } @@ -311,7 +317,8 @@ w.status = WriteBatchInternal::InsertInto( write_group, current_sequence, column_family_memtables_.get(), &flush_scheduler_, write_options.ignore_missing_column_families, - 0 /*recovery_log_number*/, this, parallel, seq_per_batch_); + 0 /*recovery_log_number*/, this, parallel, seq_per_batch_, + batch_per_txn_); } else { SequenceNumber next_sequence = current_sequence; // Note: the logic for advancing seq here must be consistent with the @@ -346,7 +353,7 @@ &w, w.sequence, &column_family_memtables, &flush_scheduler_, write_options.ignore_missing_column_families, 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, seq_per_batch_, - w.batch_cnt); + w.batch_cnt, batch_per_txn_); } } if (seq_used != nullptr) { @@ -508,7 +515,8 @@ memtable_write_group.status = WriteBatchInternal::InsertInto( memtable_write_group, w.sequence, column_family_memtables_.get(), &flush_scheduler_, write_options.ignore_missing_column_families, - 0 /*log_number*/, this, seq_per_batch_); + 0 /*log_number*/, this, false /*concurrent_memtable_writes*/, + seq_per_batch_, batch_per_txn_); versions_->SetLastSequence(memtable_write_group.last_sequence); write_thread_.ExitAsMemTableWriter(&w, memtable_write_group); } @@ -565,7 +573,6 @@ } // else we are the leader of the write batch group assert(w.state == WriteThread::STATE_GROUP_LEADER); - WriteContext write_context; WriteThread::WriteGroup write_group; uint64_t last_sequence; nonmem_write_thread_.EnterAsBatchGroupLeader(&w, &write_group); @@ -703,6 +710,10 @@ assert(write_context != nullptr && need_log_sync != nullptr); Status status; + if (error_handler_.IsDBStopped()) { + status = error_handler_.GetBGError(); + } + PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time); assert(!single_column_family_mode_ || @@ -721,10 +732,6 @@ status = HandleWriteBufferFull(write_context); } - if (UNLIKELY(status.ok())) { - 
status = error_handler_.GetBGError(); - } - if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { status = ScheduleFlushes(write_context); } @@ -1057,6 +1064,7 @@ oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize()); // no need to refcount because drop is happening in write thread, so can't // happen while we're in the write thread + FlushRequest flush_req; for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->IsDropped()) { continue; @@ -1066,11 +1074,14 @@ if (!status.ok()) { break; } + flush_req.emplace_back(cfd, cfd->imm()->GetLatestMemTableID()); cfd->imm()->FlushRequested(); - SchedulePendingFlush(cfd, FlushReason::kWriteBufferManager); } } - MaybeScheduleFlushOrCompaction(); + if (status.ok()) { + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); + MaybeScheduleFlushOrCompaction(); + } return status; } @@ -1109,14 +1120,26 @@ } } } + + autovector cfds; if (cfd_picked != nullptr) { - status = SwitchMemtable(cfd_picked, write_context, - FlushReason::kWriteBufferFull); - if (status.ok()) { - cfd_picked->imm()->FlushRequested(); - SchedulePendingFlush(cfd_picked, FlushReason::kWriteBufferFull); - MaybeScheduleFlushOrCompaction(); + cfds.push_back(cfd_picked); + } + FlushRequest flush_req; + for (const auto cfd : cfds) { + cfd->Ref(); + status = SwitchMemtable(cfd, write_context); + cfd->Unref(); + if (!status.ok()) { + break; } + uint64_t flush_memtable_id = cfd->imm()->GetLatestMemTableID(); + cfd->imm()->FlushRequested(); + flush_req.emplace_back(cfd, flush_memtable_id); + } + if (status.ok()) { + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + MaybeScheduleFlushOrCompaction(); } return status; } @@ -1139,10 +1162,14 @@ uint64_t delay = write_controller_.GetDelay(env_, num_bytes); if (delay > 0) { if (write_options.no_slowdown) { - return Status::Incomplete(); + return Status::Incomplete("Write stall"); } TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep"); + // Notify write_thread_ about the stall so it can setup a barrier and + // fail any pending writers with no_slowdown + write_thread_.BeginWriteStall(); + TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone"); mutex_.Unlock(); // We will delay the write until we have slept for delay ms or // we don't need a delay anymore @@ -1159,15 +1186,25 @@ env_->SleepForMicroseconds(kDelayInterval); } mutex_.Lock(); + write_thread_.EndWriteStall(); } - while (!error_handler_.IsDBStopped() && write_controller_.IsStopped()) { + // Don't wait if there's a background error, even if its a soft error. 
We + // might wait here indefinitely as the background compaction may never + // finish successfully, resulting in the stall condition lasting + // indefinitely + while (error_handler_.GetBGError().ok() && write_controller_.IsStopped()) { if (write_options.no_slowdown) { - return Status::Incomplete(); + return Status::Incomplete("Write stall"); } delayed = true; + + // Notify write_thread_ about the stall so it can setup a barrier and + // fail any pending writers with no_slowdown + write_thread_.BeginWriteStall(); TEST_SYNC_POINT("DBImpl::DelayWrite:Wait"); bg_cv_.Wait(); + write_thread_.EndWriteStall(); } } assert(!delayed || !write_options.no_slowdown); @@ -1177,7 +1214,19 @@ RecordTick(stats_, STALL_MICROS, time_delayed); } - return error_handler_.GetBGError(); + // If DB is not in read-only mode and write_controller is not stopping + // writes, we can ignore any background errors and allow the write to + // proceed + Status s; + if (write_controller_.IsStopped()) { + // If writes are still stopped, it means we bailed due to a background + // error + s = Status::Incomplete(error_handler_.GetBGError().ToString()); + } + if (error_handler_.IsDBStopped()) { + s = error_handler_.GetBGError(); + } + return s; } Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, @@ -1212,16 +1261,28 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { ColumnFamilyData* cfd; + FlushRequest flush_req; + Status status; while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { - auto status = SwitchMemtable(cfd, context, FlushReason::kWriteBufferFull); + status = SwitchMemtable(cfd, context); + bool should_schedule = true; if (cfd->Unref()) { delete cfd; + should_schedule = false; } if (!status.ok()) { - return status; + break; + } + if (should_schedule) { + uint64_t flush_memtable_id = cfd->imm()->GetLatestMemTableID(); + flush_req.emplace_back(cfd, flush_memtable_id); } } - return Status::OK(); + if (status.ok()) { + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + MaybeScheduleFlushOrCompaction(); + } + return status; } #ifndef ROCKSDB_LITE @@ -1242,8 +1303,7 @@ // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, - FlushReason flush_reason) { +Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { mutex_.AssertHeld(); WriteThread::Writer nonmem_w; if (two_write_queues_) { @@ -1312,6 +1372,8 @@ auto write_hint = CalculateWALWriteHint(); mutex_.Unlock(); { + std::string log_fname = + LogFileName(immutable_db_options_.wal_dir, new_log_number); if (creating_new_log) { EnvOptions opt_env_opt = env_->OptimizeForLogWrite(env_options_, db_options); @@ -1319,14 +1381,12 @@ ROCKS_LOG_INFO(immutable_db_options_.info_log, "reusing log %" PRIu64 " from recycle list\n", recycle_log_number); - s = env_->ReuseWritableFile( - LogFileName(immutable_db_options_.wal_dir, new_log_number), - LogFileName(immutable_db_options_.wal_dir, recycle_log_number), - &lfile, opt_env_opt); + std::string old_log_fname = + LogFileName(immutable_db_options_.wal_dir, recycle_log_number); + s = env_->ReuseWritableFile(log_fname, old_log_fname, &lfile, + opt_env_opt); } else { - s = NewWritableFile( - env_, LogFileName(immutable_db_options_.wal_dir, new_log_number), - &lfile, opt_env_opt); + s = NewWritableFile(env_, log_fname, &lfile, opt_env_opt); } if (s.ok()) { // Our final size should be less than write_buffer_size @@ -1337,7 +1397,7 @@ 
lfile->SetPreallocationBlockSize(preallocate_block_size); lfile->SetWriteLifeTimeHint(write_hint); unique_ptr file_writer( - new WritableFileWriter(std::move(lfile), opt_env_opt)); + new WritableFileWriter(std::move(lfile), log_fname, opt_env_opt)); new_log = new log::Writer( std::move(file_writer), new_log_number, immutable_db_options_.recycle_log_file_num > 0, manual_wal_flush_); @@ -1415,7 +1475,7 @@ new_mem->Ref(); cfd->SetMemtable(new_mem); InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context, - mutable_cf_options, flush_reason); + mutable_cf_options); if (two_write_queues_) { nonmem_write_thread_.ExitUnbatched(&nonmem_w); } diff -Nru rocksdb-5.15.10/db/db_iterator_test.cc rocksdb-5.17.2/db/db_iterator_test.cc --- rocksdb-5.15.10/db/db_iterator_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_iterator_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -2093,6 +2093,34 @@ iter.reset(); } +TEST_P(DBIteratorTest, RefreshWithSnapshot) { + ASSERT_OK(Put("x", "y")); + const Snapshot* snapshot = db_->GetSnapshot(); + ReadOptions options; + options.snapshot = snapshot; + Iterator* iter = NewIterator(options); + + iter->Seek(Slice("a")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("x")), 0); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + ASSERT_OK(Put("c", "d")); + + iter->Seek(Slice("a")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("x")), 0); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + Status s; + s = iter->Refresh(); + ASSERT_TRUE(s.IsNotSupported()); + db_->ReleaseSnapshot(snapshot); + delete iter; +} + TEST_P(DBIteratorTest, CreationFailure) { SyncPoint::GetInstance()->SetCallBack( "DBImpl::NewInternalIterator:StatusCallback", [](void* arg) { diff -Nru rocksdb-5.15.10/db/db_iter.cc rocksdb-5.17.2/db/db_iter.cc --- rocksdb-5.15.10/db/db_iter.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_iter.cc 2018-11-12 19:57:32.000000000 +0000 @@ -27,6 +27,7 @@ #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" +#include "util/trace_replay.h" namespace rocksdb { @@ -114,7 +115,8 @@ const MutableCFOptions& mutable_cf_options, const Comparator* cmp, InternalIterator* iter, SequenceNumber s, bool arena_mode, uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, bool allow_blob) + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob) : arena_mode_(arena_mode), env_(_env), logger_(cf_options.info_log), @@ -135,6 +137,8 @@ range_del_agg_(cf_options.internal_comparator, s, true /* collapse_deletions */), read_callback_(read_callback), + db_impl_(db_impl), + cfd_(cfd), allow_blob_(allow_blob), is_blob_(false), start_seqnum_(read_options.iter_start_seqnum) { @@ -344,6 +348,8 @@ LocalStatistics local_stats_; PinnedIteratorsManager pinned_iters_mgr_; ReadCallback* read_callback_; + DBImpl* db_impl_; + ColumnFamilyData* cfd_; bool allow_blob_; bool is_blob_; // for diff snapshots we want the lower bound on the seqnum; @@ -1267,6 +1273,12 @@ saved_key_.Clear(); saved_key_.SetInternalKey(target, seq); +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + db_impl_->TraceIteratorSeek(cfd_->GetID(), target); + } +#endif // ROCKSDB_LITE + if (iterate_lower_bound_ != nullptr && user_comparator_->Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < 0) { @@ -1331,6 +1343,12 @@ range_del_agg_.InvalidateRangeDelMapPositions(); } +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + 
db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target); + } +#endif // ROCKSDB_LITE + RecordTick(statistics_, NUMBER_DB_SEEK); if (iter_->Valid()) { if (prefix_extractor_ && prefix_same_as_start_) { @@ -1453,11 +1471,12 @@ InternalIterator* internal_iter, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, bool allow_blob) { - DBIter* db_iter = - new DBIter(env, read_options, cf_options, mutable_cf_options, - user_key_comparator, internal_iter, sequence, false, - max_sequential_skip_in_iterations, read_callback, allow_blob); + ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob) { + DBIter* db_iter = new DBIter( + env, read_options, cf_options, mutable_cf_options, user_key_comparator, + internal_iter, sequence, false, max_sequential_skip_in_iterations, + read_callback, db_impl, cfd, allow_blob); return db_iter; } @@ -1504,13 +1523,14 @@ const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration, uint64_t version_number, - ReadCallback* read_callback, bool allow_blob, + ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob, bool allow_refresh) { auto mem = arena_.AllocateAligned(sizeof(DBIter)); - db_iter_ = new (mem) - DBIter(env, read_options, cf_options, mutable_cf_options, - cf_options.user_comparator, nullptr, sequence, true, - max_sequential_skip_in_iteration, read_callback, allow_blob); + db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options, + cf_options.user_comparator, nullptr, sequence, + true, max_sequential_skip_in_iteration, + read_callback, db_impl, cfd, allow_blob); sv_number_ = version_number; allow_refresh_ = allow_refresh; } @@ -1534,7 +1554,8 @@ SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_->mutex()); Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, - cur_sv_number, read_callback_, allow_blob_, allow_refresh_); + cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_, + allow_refresh_); InternalIterator* internal_iter = db_impl_->NewInternalIterator( read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator()); @@ -1556,7 +1577,7 @@ ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); iter->Init(env, read_options, cf_options, mutable_cf_options, sequence, max_sequential_skip_in_iterations, version_number, read_callback, - allow_blob, allow_refresh); + db_impl, cfd, allow_blob, allow_refresh); if (db_impl != nullptr && cfd != nullptr && allow_refresh) { iter->StoreRefreshInfo(read_options, db_impl, cfd, read_callback, allow_blob); diff -Nru rocksdb-5.15.10/db/db_iter.h rocksdb-5.17.2/db/db_iter.h --- rocksdb-5.15.10/db/db_iter.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_iter.h 2018-11-12 19:57:32.000000000 +0000 @@ -23,20 +23,18 @@ class Arena; class DBIter; -class InternalIterator; // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified "sequence" number // into appropriate user keys. 
-extern Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, - const Comparator* user_key_comparator, - InternalIterator* internal_iter, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, - bool allow_blob = false); +extern Iterator* NewDBIterator( + Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, + const Comparator* user_key_comparator, InternalIterator* internal_iter, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl = nullptr, + ColumnFamilyData* cfd = nullptr, bool allow_blob = false); // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed be allocated. This class is used as an entry point of @@ -75,7 +73,8 @@ const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, bool allow_blob, bool allow_refresh); + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob, bool allow_refresh); void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl, ColumnFamilyData* cfd, ReadCallback* read_callback, diff -Nru rocksdb-5.15.10/db/db_properties_test.cc rocksdb-5.17.2/db/db_properties_test.cc --- rocksdb-5.15.10/db/db_properties_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_properties_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -180,17 +180,16 @@ ResetTableProperties(tp); sscanf(tp_string.c_str(), "# data blocks %" SCNu64 " # entries %" SCNu64 - " # range deletions %" SCNu64 - " raw key size %" SCNu64 + " # range deletions %" SCNu64 " raw key size %" SCNu64 " raw average key size %lf " " raw value size %" SCNu64 " raw average value size %lf " " data block size %" SCNu64 " index block size (user-key? %" SCNu64 - ") %" SCNu64 " filter block size %" SCNu64, + ", delta-value? 
%" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, &tp->num_data_blocks, &tp->num_entries, &tp->num_range_deletions, &tp->raw_key_size, &dummy_double, &tp->raw_value_size, &dummy_double, - &tp->data_size, &tp->index_key_is_user_key, &tp->index_size, - &tp->filter_size); + &tp->data_size, &tp->index_key_is_user_key, + &tp->index_value_is_delta_encoded, &tp->index_size, &tp->filter_size); } void VerifySimilar(uint64_t a, uint64_t b, double bias) { @@ -224,14 +223,11 @@ ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions); } -void GetExpectedTableProperties(TableProperties* expected_tp, - const int kKeySize, const int kValueSize, - const int kKeysPerTable, - const int kRangeDeletionsPerTable, - const int kTableCount, - const int kBloomBitsPerKey, - const size_t kBlockSize, - const bool index_key_is_user_key) { +void GetExpectedTableProperties( + TableProperties* expected_tp, const int kKeySize, const int kValueSize, + const int kKeysPerTable, const int kRangeDeletionsPerTable, + const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize, + const bool index_key_is_user_key, const bool value_delta_encoding) { const int kKeyCount = kTableCount * kKeysPerTable; const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable; const int kAvgSuccessorSize = kKeySize / 5; @@ -248,7 +244,9 @@ kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); expected_tp->index_size = expected_tp->num_data_blocks * - (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8)); + (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) - + // discount 1 byte as value size is not encoded in value delta encoding + (value_delta_encoding ? 1 : 0)); expected_tp->filter_size = kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8); } @@ -342,12 +340,14 @@ TableProperties output_tp; ParseTablePropertiesString(property, &output_tp); bool index_key_is_user_key = output_tp.index_key_is_user_key > 0; + bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0; TableProperties expected_tp; GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, kKeysPerTable, kRangeDeletionsPerTable, kTableCount, kBloomBitsPerKey, - table_options.block_size, index_key_is_user_key); + table_options.block_size, index_key_is_user_key, + value_is_delta_encoded); VerifyTableProperties(expected_tp, output_tp); } @@ -533,6 +533,7 @@ db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); ParseTablePropertiesString(tp_string, &tp); bool index_key_is_user_key = tp.index_key_is_user_key > 0; + bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0; ASSERT_EQ(sum_tp.data_size, tp.data_size); ASSERT_EQ(sum_tp.index_size, tp.index_size); ASSERT_EQ(sum_tp.filter_size, tp.filter_size); @@ -542,10 +543,10 @@ ASSERT_EQ(sum_tp.num_entries, tp.num_entries); ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions); if (table > 3) { - GetExpectedTableProperties( - &expected_tp, kKeySize, kValueSize, kKeysPerTable, - kRangeDeletionsPerTable, table, kBloomBitsPerKey, - table_options.block_size, index_key_is_user_key); + GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, + kKeysPerTable, kRangeDeletionsPerTable, table, + kBloomBitsPerKey, table_options.block_size, + index_key_is_user_key, value_is_delta_encoded); // Gives larger bias here as index block size, filter block size, // and data block size become much harder to estimate in this test. 
VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25); diff -Nru rocksdb-5.15.10/db/db_range_del_test.cc rocksdb-5.17.2/db/db_range_del_test.cc --- rocksdb-5.15.10/db/db_range_del_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_range_del_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -191,7 +191,7 @@ std::vector> files; dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); - ASSERT_GT(files[0][0].smallest_seqno, 0); + ASSERT_GT(files[0][0].fd.smallest_seqno, 0); db_->ReleaseSnapshot(snapshot); } diff -Nru rocksdb-5.15.10/db/db_sst_test.cc rocksdb-5.17.2/db/db_sst_test.cc --- rocksdb-5.15.10/db/db_sst_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_sst_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -436,7 +436,7 @@ // deleted from first db_path were deleted using DeleteScheduler and // files in the second path were not. TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) { - int bg_delete_file = 0; + std::atomic bg_delete_file(0); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DeleteScheduler::DeleteTrashFile:DeleteFile", [&](void* /*arg*/) { bg_delete_file++; }); @@ -703,26 +703,19 @@ // When bg_error_ is set we will verify that the DB size is greater // than the limit. - std::vector max_space_limits_mbs = {1, 2, 4, 8, 10}; - decltype(max_space_limits_mbs)::value_type limit_mb_cb; - bool bg_error_set = false; - uint64_t total_sst_files_size = 0; - - std::atomic estimate_multiplier(1); - int reached_max_space_on_flush = 0; - int reached_max_space_on_compaction = 0; + std::vector max_space_limits_mbs = {1, 10}; + std::atomic bg_error_set(false); + + std::atomic reached_max_space_on_flush(0); + std::atomic reached_max_space_on_compaction(0); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached", [&](void* arg) { Status* bg_error = static_cast(arg); bg_error_set = true; - GetAllSSTFiles(&total_sst_files_size); reached_max_space_on_flush++; - // low limit for size calculated using sst files - ASSERT_GE(total_sst_files_size, limit_mb_cb * 1024 * 1024); // clear error to ensure compaction callback is called *bg_error = Status::OK(); - estimate_multiplier++; // used in the main loop assert }); rocksdb::SyncPoint::GetInstance()->SetCallBack( @@ -735,15 +728,11 @@ "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached", [&](void* /*arg*/) { bg_error_set = true; - GetAllSSTFiles(&total_sst_files_size); reached_max_space_on_compaction++; }); for (auto limit_mb : max_space_limits_mbs) { bg_error_set = false; - total_sst_files_size = 0; - estimate_multiplier = 1; - limit_mb_cb = limit_mb; rocksdb::SyncPoint::GetInstance()->ClearTrace(); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); std::shared_ptr sst_file_manager(NewSstFileManager(env_)); @@ -757,21 +746,17 @@ sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024); - int keys_written = 0; - uint64_t estimated_db_size = 0; + // It is easy to detect if the test is stuck in a loop. No need for + // complex termination logic. 
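
One small but easy-to-miss fix in DeleteSchedulerMultipleDBPaths above is making the bg_delete_file counter std::atomic: the DeleteScheduler sync-point callback increments it from a background thread while the test thread reads it, so a plain int would be a data race. A minimal standalone sketch of the same pattern, with ordinary std::thread workers standing in for the background delete scheduler:

    #include <atomic>
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main() {
      // Counter bumped from "background" threads, read afterwards by the main thread.
      std::atomic<int> bg_delete_file(0);

      std::vector<std::thread> workers;
      for (int i = 0; i < 4; ++i) {
        workers.emplace_back([&bg_delete_file]() {
          // Stand-in for the DeleteScheduler::DeleteTrashFile:DeleteFile callback.
          bg_delete_file.fetch_add(1, std::memory_order_relaxed);
        });
      }
      for (auto& t : workers) {
        t.join();  // joining synchronizes, so the final load is well defined
      }
      std::printf("trash files deleted: %d\n", bg_delete_file.load());
      return 0;
    }
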
while (true) { auto s = Put(RandomString(&rnd, 10), RandomString(&rnd, 50)); if (!s.ok()) { break; } - keys_written++; - // Check the estimated db size vs the db limit just to make sure we - // dont run into an infinite loop - estimated_db_size = keys_written * 60; // ~60 bytes per key - ASSERT_LT(estimated_db_size, - estimate_multiplier * limit_mb * 1024 * 1024 * 2); } ASSERT_TRUE(bg_error_set); + uint64_t total_sst_files_size = 0; + GetAllSSTFiles(&total_sst_files_size); ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } diff -Nru rocksdb-5.15.10/db/db_table_properties_test.cc rocksdb-5.17.2/db/db_table_properties_test.cc --- rocksdb-5.15.10/db/db_table_properties_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_table_properties_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -252,13 +252,14 @@ } TEST_F(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { - const int kNumKeys = 1000; - const int kWindowSize = 100; - const int kNumDelsTrigger = 90; + int kNumKeys = 1000; + int kWindowSize = 100; + int kNumDelsTrigger = 90; + std::shared_ptr compact_on_del = + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger); Options opts = CurrentOptions(); - opts.table_properties_collector_factories.emplace_back( - NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); + opts.table_properties_collector_factories.emplace_back(compact_on_del); Reopen(opts); // add an L1 file to prevent tombstones from dropping due to obsolescence @@ -280,6 +281,48 @@ dbfull()->TEST_WaitForCompact(); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); + + // Change the window size and deletion trigger and ensure new values take + // effect + kWindowSize = 50; + kNumDelsTrigger = 40; + static_cast + (compact_on_del.get())->SetWindowSize(kWindowSize); + static_cast + (compact_on_del.get())->SetDeletionTrigger(kNumDelsTrigger); + for (int i = 0; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + Delete(Key(i)); + } else { + Put(Key(i), "val"); + } + } + Flush(); + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + + // Change the window size to disable delete triggered compaction + kWindowSize = 0; + static_cast + (compact_on_del.get())->SetWindowSize(kWindowSize); + static_cast + (compact_on_del.get())->SetDeletionTrigger(kNumDelsTrigger); + for (int i = 0; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + Delete(Key(i)); + } else { + Put(Key(i), "val"); + } + } + Flush(); + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + } } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/db_test2.cc rocksdb-5.17.2/db/db_test2.cc --- rocksdb-5.15.10/db/db_test2.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_test2.cc 2018-11-12 19:57:32.000000000 +0000 @@ -2321,9 +2321,9 @@ options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW); // Include the explicit prefetch of the footer in direct I/O case. size_t direct_io_extra = use_direct_io ? 
512 * 1024 : 0; - ASSERT_GE(rate_limited_bytes, - static_cast(kNumKeysPerFile * kBytesPerKey * kNumL0Files + - direct_io_extra)); + ASSERT_GE( + rate_limited_bytes, + static_cast(kNumKeysPerFile * kBytesPerKey * kNumL0Files)); ASSERT_LT( rate_limited_bytes, static_cast(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files + @@ -2500,6 +2500,108 @@ rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBTest2, TraceAndReplay) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreatePutOperator(); + ReadOptions ro; + WriteOptions wo; + TraceOptions trace_opts; + EnvOptions env_opts; + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + Iterator* single_iter = nullptr; + + std::string trace_filename = dbname_ + "/rocksdb.trace"; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); + ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); + + ASSERT_OK(Put(0, "a", "1")); + ASSERT_OK(Merge(0, "b", "2")); + ASSERT_OK(Delete(0, "c")); + ASSERT_OK(SingleDelete(0, "d")); + ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f")); + + WriteBatch batch; + ASSERT_OK(batch.Put("f", "11")); + ASSERT_OK(batch.Merge("g", "12")); + ASSERT_OK(batch.Delete("h")); + ASSERT_OK(batch.SingleDelete("i")); + ASSERT_OK(batch.DeleteRange("j", "k")); + ASSERT_OK(db_->Write(wo, &batch)); + + single_iter = db_->NewIterator(ro); + single_iter->Seek("f"); + single_iter->SeekForPrev("g"); + delete single_iter; + + ASSERT_EQ("1", Get(0, "a")); + ASSERT_EQ("12", Get(0, "g")); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "rocksdb", "rocks")); + ASSERT_EQ("NOT_FOUND", Get(1, "leveldb")); + + ASSERT_OK(db_->EndTrace()); + // These should not get into the trace file as it is after EndTrace. + Put("hello", "world"); + Merge("foo", "bar"); + + // Open another db, replay, and verify the data + std::string value; + std::string dbname2 = test::TmpDir(env_) + "/db_replay"; + ASSERT_OK(DestroyDB(dbname2, options)); + + // Using a different name than db2, to pacify infer's use-after-lifetime + // warnings (http://fbinfer.com). 
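
Condensed from the TraceAndReplay test above, the tracing workflow is: attach a file-backed TraceWriter with DB::StartTrace, run the workload, call DB::EndTrace, then later hand a TraceReader to a Replayer that targets a second database with matching column families. The sketch below only rearranges calls that already appear in the test; the header locations are assumptions based on this source tree (Replayer in particular is an internal class here), and error handling between steps is abbreviated:

    #include <memory>
    #include <string>
    #include <vector>

    #include "rocksdb/db.h"
    #include "rocksdb/trace_reader_writer.h"  // assumed location of NewFileTrace{Writer,Reader}
    #include "util/trace_replay.h"            // assumed location of Replayer in this version

    rocksdb::Status TraceThenReplay(
        rocksdb::DB* source_db, rocksdb::DB* target_db,
        const std::vector<rocksdb::ColumnFamilyHandle*>& target_handles,
        rocksdb::Env* env, const std::string& trace_filename) {
      rocksdb::EnvOptions env_opts;
      rocksdb::TraceOptions trace_opts;

      // 1. Record: operations issued through source_db land in the trace file.
      std::unique_ptr<rocksdb::TraceWriter> trace_writer;
      rocksdb::Status s = rocksdb::NewFileTraceWriter(env, env_opts,
                                                      trace_filename, &trace_writer);
      if (s.ok()) s = source_db->StartTrace(trace_opts, std::move(trace_writer));

      // ... run the workload against source_db here ...

      if (s.ok()) s = source_db->EndTrace();

      // 2. Replay the recorded operations against target_db.
      std::unique_ptr<rocksdb::TraceReader> trace_reader;
      if (s.ok()) {
        s = rocksdb::NewFileTraceReader(env, env_opts, trace_filename, &trace_reader);
      }
      if (s.ok()) {
        rocksdb::Replayer replayer(target_db, target_handles, std::move(trace_reader));
        s = replayer.Replay();
      }
      return s;
    }
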
+ DB* db2_init = nullptr; + options.create_if_missing = true; + ASSERT_OK(DB::Open(options, dbname2, &db2_init)); + ColumnFamilyHandle* cf; + ASSERT_OK( + db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); + delete cf; + delete db2_init; + + DB* db2 = nullptr; + std::vector column_families; + ColumnFamilyOptions cf_options; + cf_options.merge_operator = MergeOperators::CreatePutOperator(); + column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); + column_families.push_back( + ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + std::vector handles; + ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + + env_->SleepForMicroseconds(100); + // Verify that the keys don't already exist + ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound()); + + std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); + Replayer replayer(db2, handles_, std::move(trace_reader)); + ASSERT_OK(replayer.Replay()); + + ASSERT_OK(db2->Get(ro, handles[0], "a", &value)); + ASSERT_EQ("1", value); + ASSERT_OK(db2->Get(ro, handles[0], "g", &value)); + ASSERT_EQ("12", value); + ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound()); + + ASSERT_OK(db2->Get(ro, handles[1], "foo", &value)); + ASSERT_EQ("bar", value); + ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value)); + ASSERT_EQ("rocks", value); + + for (auto handle : handles) { + delete handle; + } + delete db2; + ASSERT_OK(DestroyDB(dbname2, options)); +} + #endif // ROCKSDB_LITE TEST_F(DBTest2, PinnableSliceAndMmapReads) { @@ -2547,6 +2649,192 @@ #endif } +TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.no_block_cache = false; + bbto.cache_index_and_filter_blocks = false; + bbto.block_cache = NewLRUCache(100000); + bbto.block_size = 400; // small block size + options.table_factory.reset(new BlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + std::string v = RandomString(&rnd, 400); + + // Since v is the size of a block, each key should take a block + // of 400+ bytes. + Put("1", v); + Put("3", v); + Put("5", v); + Put("7", v); + ASSERT_OK(Flush()); + + ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); + + // Verify that iterators don't pin more than one data block in block cache + // at each time. + { + unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->SeekToFirst(); + + for (int i = 0; i < 4; i++) { + ASSERT_TRUE(iter->Valid()); + // Block cache should contain exactly one block. 
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + + iter->Seek("4"); + ASSERT_TRUE(iter->Valid()); + + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + + iter->Seek("3"); + ASSERT_TRUE(iter->Valid()); + + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + } + ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); + + // Test compaction case + Put("2", v); + Put("5", v); + Put("6", v); + Put("8", v); + ASSERT_OK(Flush()); + + // Clear existing data in block cache + bbto.block_cache->SetCapacity(0); + bbto.block_cache->SetCapacity(100000); + + // Verify compaction input iterators don't hold more than one data blocks at + // one time. + std::atomic finished(false); + std::atomic block_newed(0); + std::atomic block_destroyed(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "Block::Block:0", [&](void* /*arg*/) { + if (finished) { + return; + } + // Two iterators. At most 2 outstanding blocks. + EXPECT_GE(block_newed.load(), block_destroyed.load()); + EXPECT_LE(block_newed.load(), block_destroyed.load() + 1); + block_newed.fetch_add(1); + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "Block::~Block", [&](void* /*arg*/) { + if (finished) { + return; + } + // Two iterators. At most 2 outstanding blocks. + EXPECT_GE(block_newed.load(), block_destroyed.load() + 1); + EXPECT_LE(block_newed.load(), block_destroyed.load() + 2); + block_destroyed.fetch_add(1); + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run:BeforeVerify", + [&](void* /*arg*/) { finished = true; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Two input files. Each of them has 4 data blocks. + ASSERT_EQ(8, block_newed.load()); + ASSERT_EQ(8, block_destroyed.load()); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, TestBBTTailPrefetch) { + std::atomic called(false); + size_t expected_lower_bound = 512 * 1024; + size_t expected_higher_bound = 512 * 1024; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { + size_t* prefetch_size = static_cast(arg); + EXPECT_LE(expected_lower_bound, *prefetch_size); + EXPECT_GE(expected_higher_bound, *prefetch_size); + called = true; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + expected_lower_bound = 0; + expected_higher_bound = 8 * 1024; + + Put("1", "1"); + Put("9", "1"); + Flush(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + // Full compaction to make sure there is no L0 file after the open. 
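
Both IteratorPinnedMemory and TestBBTTailPrefetch lean on the same sync-point idiom: register a callback on a named point with SetCallBack, turn processing on, run the workload, then disable processing and clear the callbacks so later tests are unaffected. A compact sketch of that idiom, assuming a debug build (sync points compile away in release builds); the point name is the one used above:

    #include <atomic>
    #include <cstddef>

    #include "util/sync_point.h"  // header path inside this source tree

    void ObserveTailPrefetchLen() {
      std::atomic<bool> called(false);
      rocksdb::SyncPoint::GetInstance()->SetCallBack(
          "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
            size_t* prefetch_size = static_cast<size_t*>(arg);
            (void)prefetch_size;  // inspect or assert on the chosen prefetch length
            called = true;
          });
      rocksdb::SyncPoint::GetInstance()->EnableProcessing();

      // ... open the DB / flush / compact so BlockBasedTable::Open runs ...

      rocksdb::SyncPoint::GetInstance()->DisableProcessing();
      rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
    }
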
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_TRUE(called.load()); + called = false; + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + + std::atomic first_call(true); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { + size_t* prefetch_size = static_cast(arg); + if (first_call) { + EXPECT_EQ(4 * 1024, *prefetch_size); + first_call = false; + } else { + EXPECT_GE(4 * 1024, *prefetch_size); + } + called = true; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.max_file_opening_threads = 1; // one thread + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.max_open_files = -1; + Reopen(options); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + ASSERT_TRUE(called.load()); + called = false; + + // Parallel loading SST files + options.max_file_opening_threads = 16; + Reopen(options); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_TRUE(called.load()); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + } // namespace rocksdb int main(int argc, char** argv) { diff -Nru rocksdb-5.15.10/db/db_test.cc rocksdb-5.17.2/db/db_test.cc --- rocksdb-5.15.10/db/db_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -262,6 +262,196 @@ } } +TEST_F(DBTest, MixedSlowdownOptions) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; + std::atomic thread_num(0); + + std::function write_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = false; + ASSERT_OK(dbfull()->Put(wo, key, "bar")); + }; + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + ASSERT_NOK(dbfull()->Put(wo, key, "bar")); + }; + // Use a small number to ensure a large delay that is still effective + // when we do Put + // TODO(myabandeh): this is time dependent and could potentially make + // the test flaky + auto token = dbfull()->TEST_write_controler().GetDelayToken(1); + std::atomic sleep_count(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:BeginWriteStallDone", + [&](void* /*arg*/) { + sleep_count.fetch_add(1); + if (threads.empty()) { + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_slowdown_func); + } + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_no_slowdown_func); + } + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = false; + wo.disableWAL = false; + wo.no_slowdown = false; + dbfull()->Put(wo, "foo", "bar"); + // We need the 2nd write to trigger delay. This is because delay is + // estimated based on the last write size which is 0 for the first write. 
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); + token.reset(); + + for (auto& t : threads) { + t.join(); + } + ASSERT_GE(sleep_count.load(), 1); + + wo.no_slowdown = true; + ASSERT_OK(dbfull()->Put(wo, "foo3", "bar")); +} + +TEST_F(DBTest, MixedSlowdownOptionsInQueue) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; + std::atomic thread_num(0); + + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + ASSERT_NOK(dbfull()->Put(wo, key, "bar")); + }; + // Use a small number to ensure a large delay that is still effective + // when we do Put + // TODO(myabandeh): this is time dependent and could potentially make + // the test flaky + auto token = dbfull()->TEST_write_controler().GetDelayToken(1); + std::atomic sleep_count(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Sleep", + [&](void* /*arg*/) { + sleep_count.fetch_add(1); + if (threads.empty()) { + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_no_slowdown_func); + } + // Sleep for 2s to allow the threads to insert themselves into the + // write queue + env_->SleepForMicroseconds(3000000ULL); + } + }); + std::atomic wait_count(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Wait", + [&](void* /*arg*/) { wait_count.fetch_add(1); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = false; + wo.disableWAL = false; + wo.no_slowdown = false; + dbfull()->Put(wo, "foo", "bar"); + // We need the 2nd write to trigger delay. This is because delay is + // estimated based on the last write size which is 0 for the first write. 
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); + token.reset(); + + for (auto& t : threads) { + t.join(); + } + ASSERT_EQ(sleep_count.load(), 1); + ASSERT_GE(wait_count.load(), 0); +} + +TEST_F(DBTest, MixedSlowdownOptionsStop) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; + std::atomic thread_num(0); + + std::function write_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = false; + ASSERT_OK(dbfull()->Put(wo, key, "bar")); + }; + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + ASSERT_NOK(dbfull()->Put(wo, key, "bar")); + }; + std::function wakeup_writer = [&]() { + dbfull()->mutex_.Lock(); + dbfull()->bg_cv_.SignalAll(); + dbfull()->mutex_.Unlock(); + }; + // Use a small number to ensure a large delay that is still effective + // when we do Put + // TODO(myabandeh): this is time dependent and could potentially make + // the test flaky + auto token = dbfull()->TEST_write_controler().GetStopToken(); + std::atomic wait_count(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Wait", + [&](void* /*arg*/) { + wait_count.fetch_add(1); + if (threads.empty()) { + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_slowdown_func); + } + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_no_slowdown_func); + } + // Sleep for 2s to allow the threads to insert themselves into the + // write queue + env_->SleepForMicroseconds(3000000ULL); + } + token.reset(); + threads.emplace_back(wakeup_writer); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = false; + wo.disableWAL = false; + wo.no_slowdown = false; + dbfull()->Put(wo, "foo", "bar"); + // We need the 2nd write to trigger delay. This is because delay is + // estimated based on the last write size which is 0 for the first write. + ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); + token.reset(); + + for (auto& t : threads) { + t.join(); + } + ASSERT_GE(wait_count.load(), 1); + + wo.no_slowdown = true; + ASSERT_OK(dbfull()->Put(wo, "foo3", "bar")); +} #ifndef ROCKSDB_LITE TEST_F(DBTest, LevelLimitReopen) { @@ -2149,6 +2339,9 @@ } // namespace +#ifndef TRAVIS +// Disable this test temporarily on Travis as it fails intermittently. +// Github issue: #4151 TEST_F(DBTest, GroupCommitTest) { do { Options options = CurrentOptions(); @@ -2195,6 +2388,7 @@ ASSERT_GT(hist_data.average, 0.0); } while (ChangeOptions(kSkipNoSeekToLast)); } +#endif // TRAVIS namespace { typedef std::map KVMap; @@ -4327,7 +4521,7 @@ // Clean up memtable and L0. Block compaction threads. If continue to write // and flush memtables. 
We should see put stop after 8 memtable flushes // since level0_stop_writes_trigger = 8 - dbfull()->TEST_FlushMemTable(true); + dbfull()->TEST_FlushMemTable(true, true); dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); // Block compaction test::SleepingBackgroundTask sleeping_task_low; @@ -4340,7 +4534,7 @@ WriteOptions wo; while (count < 64) { ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - dbfull()->TEST_FlushMemTable(true); + dbfull()->TEST_FlushMemTable(true, true); count++; if (dbfull()->TEST_write_controler().IsStopped()) { sleeping_task_low.WakeUp(); @@ -4368,7 +4562,7 @@ count = 0; while (count < 64) { ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - dbfull()->TEST_FlushMemTable(true); + dbfull()->TEST_FlushMemTable(true, true); count++; if (dbfull()->TEST_write_controler().IsStopped()) { sleeping_task_low.WakeUp(); @@ -5508,7 +5702,7 @@ for (int i = 0; i < 72; i++) { Put(Key(i), std::string(5000, 'x')); if (i % 10 == 0) { - Flush(); + dbfull()->TEST_FlushMemTable(true, true); } } dbfull()->TEST_WaitForCompact(); @@ -5518,7 +5712,7 @@ for (int i = 0; i < 72; i++) { Put(Key(i), std::string(5000, 'x')); if (i % 10 == 0) { - Flush(); + dbfull()->TEST_FlushMemTable(true, true); } } dbfull()->TEST_WaitForCompact(); @@ -5537,7 +5731,7 @@ Put(Key(i), std::string(5000, 'x')); Put(Key(100 - i), std::string(5000, 'x')); // Flush the file. File size is around 30KB. - Flush(); + dbfull()->TEST_FlushMemTable(true, true); } ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed)); @@ -5572,7 +5766,7 @@ Put(Key(10 + i), std::string(5000, 'x')); Put(Key(90 - i), std::string(5000, 'x')); // Flush the file. File size is around 30KB. - Flush(); + dbfull()->TEST_FlushMemTable(true, true); } // Wake up sleep task to enable compaction to run and waits @@ -5593,7 +5787,7 @@ Put(Key(20 + i), std::string(5000, 'x')); Put(Key(80 - i), std::string(5000, 'x')); // Flush the file. File size is around 30KB. - Flush(); + dbfull()->TEST_FlushMemTable(true, true); } // Wake up sleep task to enable compaction to run and waits // for it to go to sleep state again to make sure one compaction diff -Nru rocksdb-5.15.10/db/db_test_util.cc rocksdb-5.17.2/db/db_test_util.cc --- rocksdb-5.15.10/db/db_test_util.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_test_util.cc 2018-11-12 19:57:32.000000000 +0000 @@ -449,15 +449,16 @@ options.prefix_extractor.reset(NewNoopTransform()); break; } - case kBlockBasedTableWithPartitionedIndexFormat3: { - table_options.format_version = 3; - // Format 3 changes the binary index format. Since partitioned index is a + case kBlockBasedTableWithPartitionedIndexFormat4: { + table_options.format_version = 4; + // Format 4 changes the binary index format. Since partitioned index is a // super-set of simple indexes, we are also using kTwoLevelIndexSearch to // test this format. table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; - // The top-level index in partition filters are also affected by format 3. + // The top-level index in partition filters are also affected by format 4. 
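
The renamed kBlockBasedTableWithPartitionedIndexFormat4 case bundles the options that exercise the new index layout: format_version 4 (delta-encoded index values), a two-level index, partitioned filters, and a non-default index_block_restart_interval. A hedged sketch of building those options directly, assembled from the exact fields this case sets:

    #include "rocksdb/filter_policy.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    rocksdb::Options MakeFormat4Options() {
      rocksdb::BlockBasedTableOptions table_options;
      table_options.format_version = 4;  // new binary index format
      table_options.index_type =
          rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
      table_options.partition_filters = true;
      table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
      table_options.index_block_restart_interval = 8;

      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }
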
table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); table_options.partition_filters = true; + table_options.index_block_restart_interval = 8; break; } case kBlockBasedTableWithIndexRestartInterval: { diff -Nru rocksdb-5.15.10/db/db_test_util.h rocksdb-5.17.2/db/db_test_util.h --- rocksdb-5.15.10/db/db_test_util.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_test_util.h 2018-11-12 19:57:32.000000000 +0000 @@ -109,8 +109,6 @@ // These will be used only if filter_policy is set bool partition_filters = false; uint64_t metadata_block_size = 1024; - BlockBasedTableOptions::IndexType index_type = - BlockBasedTableOptions::IndexType::kBinarySearch; // Used as a bit mask of individual enums in which to skip an XF test point int skip_policy = 0; @@ -317,6 +315,9 @@ } } uint64_t GetFileSize() override { return base_->GetFileSize(); } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } private: SpecialEnv* env_; @@ -370,6 +371,9 @@ bool IsSyncThreadSafe() const override { return env_->is_wal_sync_thread_safe_.load(); } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } private: SpecialEnv* env_; @@ -575,7 +579,7 @@ std::atomic is_wal_sync_thread_safe_{true}; - std::atomic compaction_readahead_size_; + std::atomic compaction_readahead_size_{}; }; class MockTimeEnv : public EnvWrapper { @@ -698,7 +702,7 @@ kLevelSubcompactions, kBlockBasedTableWithIndexRestartInterval, kBlockBasedTableWithPartitionedIndex, - kBlockBasedTableWithPartitionedIndexFormat3, + kBlockBasedTableWithPartitionedIndexFormat4, kPartitionedFilterWithNewTableReaderForCompactions, kUniversalSubcompactions, // This must be the last line diff -Nru rocksdb-5.15.10/db/db_universal_compaction_test.cc rocksdb-5.17.2/db/db_universal_compaction_test.cc --- rocksdb-5.15.10/db/db_universal_compaction_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_universal_compaction_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -1824,7 +1824,7 @@ port::Thread compact_files_thread([&]() { ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), default_cfh, - {first_sst_filename}, num_levels_ - 1)); + {first_sst_filename}, num_levels_ - 1)); }); TEST_SYNC_POINT( diff -Nru rocksdb-5.15.10/db/db_wal_test.cc rocksdb-5.17.2/db/db_wal_test.cc --- rocksdb-5.15.10/db/db_wal_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_wal_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -18,6 +18,15 @@ class DBWALTest : public DBTestBase { public: DBWALTest() : DBTestBase("/db_wal_test") {} + +#if defined(ROCKSDB_PLATFORM_POSIX) + uint64_t GetAllocatedFileSize(std::string file_name) { + struct stat sbuf; + int err = stat(file_name.c_str(), &sbuf); + assert(err == 0); + return sbuf.st_blocks * 512; + } +#endif }; // A SpecialEnv enriched to give more insight about deleted files @@ -815,7 +824,7 @@ unique_ptr file; ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options)); unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options)); + new WritableFileWriter(std::move(file), fname, env_options)); current_log_writer.reset( new log::Writer(std::move(file_writer), current_log_number, db_options.recycle_log_file_num > 0)); @@ -1329,6 +1338,99 @@ } } +// Tests that total log size is recovered if we set +// avoid_flush_during_recovery=true. +// Flush should trigger if max_total_wal_size is reached. 
+TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) { + class TestFlushListener : public EventListener { + public: + std::atomic count{0}; + + TestFlushListener() = default; + + void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override { + count++; + assert(FlushReason::kWriteBufferManager == flush_job_info.flush_reason); + } + }; + std::shared_ptr test_listener = + std::make_shared(); + + constexpr size_t kKB = 1024; + constexpr size_t kMB = 1024 * 1024; + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + options.max_total_wal_size = 1 * kMB; + options.listeners.push_back(test_listener); + // Have to open DB in multi-CF mode to trigger flush when + // max_total_wal_size is reached. + CreateAndReopenWithCF({"one"}, options); + // Write some keys and we will end up with one log file which is slightly + // smaller than 1MB. + std::string value_100k(100 * kKB, 'v'); + std::string value_300k(300 * kKB, 'v'); + ASSERT_OK(Put(0, "foo", "v1")); + for (int i = 0; i < 9; i++) { + ASSERT_OK(Put(1, "key" + ToString(i), value_100k)); + } + // Get log files before reopen. + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + uint64_t log_size_before = log_files_before[0]->SizeFileBytes(); + ASSERT_GT(log_size_before, 900 * kKB); + ASSERT_LT(log_size_before, 1 * kMB); + ReopenWithColumnFamilies({"default", "one"}, options); + // Write one more value to make log larger than 1MB. + ASSERT_OK(Put(1, "bar", value_300k)); + // Get log files again. A new log file will be opened. + VectorLogPtr log_files_after_reopen; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after_reopen)); + ASSERT_EQ(2, log_files_after_reopen.size()); + ASSERT_EQ(log_files_before[0]->LogNumber(), + log_files_after_reopen[0]->LogNumber()); + ASSERT_GT(log_files_after_reopen[0]->SizeFileBytes() + + log_files_after_reopen[1]->SizeFileBytes(), + 1 * kMB); + // Write one more key to trigger flush. + ASSERT_OK(Put(0, "foo", "v2")); + dbfull()->TEST_WaitForFlushMemTable(); + // Flushed two column families. + ASSERT_EQ(2, test_listener->count.load()); +} + +#if defined(ROCKSDB_PLATFORM_POSIX) +#if defined(ROCKSDB_FALLOCATE_PRESENT) +// Tests that we will truncate the preallocated space of the last log from +// previous. +TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + // The log file has preallocated space. + ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + Reopen(options); + VectorLogPtr log_files_after; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after)); + ASSERT_EQ(1, log_files_after.size()); + ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB); + // The preallocated space should be truncated. 
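
The two WAL tests above distinguish a log file's logical length from its allocated size: GetSortedWalFiles() reports the logical bytes written, while the GetAllocatedFileSize() helper uses stat() and st_blocks (always counted in 512-byte units) to observe the preallocated space that recovery is expected to truncate away. A POSIX-only sketch of that measurement, mirroring the helper:

    #include <sys/stat.h>

    #include <cstdint>
    #include <string>

    // Returns false if the file cannot be stat'ed. *allocated_size includes
    // preallocated blocks; *logical_size is the length readers observe.
    bool GetFileSizes(const std::string& file_name, uint64_t* logical_size,
                      uint64_t* allocated_size) {
      struct stat sbuf;
      if (stat(file_name.c_str(), &sbuf) != 0) {
        return false;
      }
      *logical_size = static_cast<uint64_t>(sbuf.st_size);
      *allocated_size = static_cast<uint64_t>(sbuf.st_blocks) * 512;
      return true;
    }
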
+ ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); +} +#endif // ROCKSDB_FALLOCATE_PRESENT +#endif // ROCKSDB_PLATFORM_POSIX + #endif // ROCKSDB_LITE TEST_F(DBWALTest, WalTermTest) { diff -Nru rocksdb-5.15.10/db/error_handler.cc rocksdb-5.17.2/db/error_handler.cc --- rocksdb-5.15.10/db/error_handler.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/error_handler.cc 2018-11-12 19:57:32.000000000 +0000 @@ -4,7 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). // #include "db/error_handler.h" +#include "db/db_impl.h" #include "db/event_helpers.h" +#include "util/sst_file_manager_impl.h" namespace rocksdb { @@ -33,7 +35,7 @@ // Errors during BG flush {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, Status::SubCode::kNoSpace, true), - Status::Severity::kSoftError}, + Status::Severity::kHardError}, {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, Status::SubCode::kNoSpace, false), Status::Severity::kNoError}, @@ -44,11 +46,11 @@ {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kIOError, Status::SubCode::kNoSpace, true), - Status::Severity::kFatalError}, + Status::Severity::kHardError}, {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kIOError, Status::SubCode::kNoSpace, false), - Status::Severity::kFatalError}, + Status::Severity::kHardError}, }; std::map, Status::Severity> @@ -118,6 +120,45 @@ Status::Severity::kFatalError}, }; +void ErrorHandler::CancelErrorRecovery() { +#ifndef ROCKSDB_LITE + db_mutex_->AssertHeld(); + + // We'll release the lock before calling sfm, so make sure no new + // recovery gets scheduled at that point + auto_recovery_ = false; + SstFileManagerImpl* sfm = reinterpret_cast( + db_options_.sst_file_manager.get()); + if (sfm) { + // This may or may not cancel a pending recovery + db_mutex_->Unlock(); + bool cancelled = sfm->CancelErrorRecovery(this); + db_mutex_->Lock(); + if (cancelled) { + recovery_in_prog_ = false; + } + } +#endif +} + +// This is the main function for looking at an error during a background +// operation and deciding the severity, and error recovery strategy. The high +// level algorithm is as follows - +// 1. Classify the severity of the error based on the ErrorSeverityMap, +// DefaultErrorSeverityMap and DefaultReasonMap defined earlier +// 2. Call a Status code specific override function to adjust the severity +// if needed. The reason for this is our ability to recover may depend on +// the exact options enabled in DBOptions +// 3. Determine if auto recovery is possible. A listener notification callback +// is called, which can disable the auto recovery even if we decide its +// feasible +// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control +// the actual recovery. If no sst file manager is specified in DBOptions, +// a default one is allocated during DB::Open(), so there will always be +// one. +// This can also get called as part of a recovery operation. In that case, we +// also track the error seperately in recovery_error_ so we can tell in the +// end whether recovery succeeded or not Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reason) { db_mutex_->AssertHeld(); @@ -125,6 +166,12 @@ return Status::OK(); } + // Check if recovery is currently in progress. 
If it is, we will save this + // error so we can check it at the end to see if recovery succeeded or not + if (recovery_in_prog_ && recovery_error_.ok()) { + recovery_error_ = bg_err; + } + bool paranoid = db_options_.paranoid_checks; Status::Severity sev = Status::Severity::kFatalError; Status new_bg_err; @@ -156,15 +203,143 @@ } new_bg_err = Status(bg_err, sev); + + bool auto_recovery = auto_recovery_; + if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) { + auto_recovery = false; + ; + } + + // Allow some error specific overrides + if (new_bg_err == Status::NoSpace()) { + new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery); + } + if (!new_bg_err.ok()) { Status s = new_bg_err; - EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s, db_mutex_); + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s, + db_mutex_, &auto_recovery); if (!s.ok() && (s.severity() > bg_error_.severity())) { bg_error_ = s; + } else { + // This error is less severe than previously encountered error. Don't + // take any further action + return bg_error_; } } + if (auto_recovery) { + recovery_in_prog_ = true; + + // Kick-off error specific recovery + if (bg_error_ == Status::NoSpace()) { + RecoverFromNoSpace(); + } + } return bg_error_; } +Status ErrorHandler::OverrideNoSpaceError(Status bg_error, + bool* auto_recovery) { +#ifndef ROCKSDB_LITE + if (bg_error.severity() >= Status::Severity::kFatalError) { + return bg_error; + } + + if (db_options_.sst_file_manager.get() == nullptr) { + // We rely on SFM to poll for enough disk space and recover + *auto_recovery = false; + return bg_error; + } + + if (db_options_.allow_2pc && + (bg_error.severity() <= Status::Severity::kSoftError)) { + // Don't know how to recover, as the contents of the current WAL file may + // be inconsistent, and it may be needed for 2PC. 
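// Illustrative aside (editor's sketch, not part of the diff): when automatic
// recovery is disabled or not possible, the background error keeps the DB in a
// stopped state until the application intervenes. A caller-side reaction could
// look roughly like this; the function name and the space-freeing step are
// hypothetical, while DB::Resume() is the manual counterpart of the automatic
// path exercised by the tests later in this patch.
#include "rocksdb/db.h"

rocksdb::Status RetryAfterFreeingSpace(rocksdb::DB* db) {
  // ... application frees disk space, fixes permissions, etc. ...
  // Resume() re-runs the pending flush/compaction work and clears the
  // background error on success; on failure the DB stays stopped.
  return db->Resume();
}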
If 2PC is not enabled, + // we can just flush the memtable and discard the log + *auto_recovery = false; + return Status(bg_error, Status::Severity::kFatalError); + } + + { + uint64_t free_space; + if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path, + &free_space) == Status::NotSupported()) { + *auto_recovery = false; + } + } + + return bg_error; +#else + (void)auto_recovery; + return Status(bg_error, Status::Severity::kFatalError); +#endif +} + +void ErrorHandler::RecoverFromNoSpace() { +#ifndef ROCKSDB_LITE + SstFileManagerImpl* sfm = + reinterpret_cast(db_options_.sst_file_manager.get()); + + // Inform SFM of the error, so it can kick-off the recovery + if (sfm) { + sfm->StartErrorRecovery(this, bg_error_); + } +#endif +} + +Status ErrorHandler::ClearBGError() { +#ifndef ROCKSDB_LITE + db_mutex_->AssertHeld(); + + // Signal that recovery succeeded + if (recovery_error_.ok()) { + Status old_bg_error = bg_error_; + bg_error_ = Status::OK(); + recovery_in_prog_ = false; + EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners, + old_bg_error, db_mutex_); + } + return recovery_error_; +#else + return bg_error_; +#endif +} + +Status ErrorHandler::RecoverFromBGError(bool is_manual) { +#ifndef ROCKSDB_LITE + InstrumentedMutexLock l(db_mutex_); + if (is_manual) { + // If its a manual recovery and there's a background recovery in progress + // return busy status + if (recovery_in_prog_) { + return Status::Busy(); + } + recovery_in_prog_ = true; + } + + if (bg_error_.severity() == Status::Severity::kSoftError) { + // Simply clear the background error and return + recovery_error_ = Status::OK(); + return ClearBGError(); + } + + // Reset recovery_error_. We will use this to record any errors that happen + // during the recovery process. While recovering, the only operations that + // can generate background errors should be the flush operations + recovery_error_ = Status::OK(); + Status s = db_->ResumeImpl(); + // For manual recover, shutdown, and fatal error cases, set + // recovery_in_prog_ to false. 
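// Illustrative aside (editor's sketch, not part of the diff): OverrideNoSpaceError
// above turns auto recovery off when the Env cannot report free space, since the
// SstFileManager then has no way to poll for space coming back. Probing that
// capability in isolation (the helper name is hypothetical):
#include <cstdint>
#include <string>
#include "rocksdb/env.h"

bool CanPollFreeSpace(rocksdb::Env* env, const std::string& path) {
  uint64_t free_bytes = 0;
  rocksdb::Status s = env->GetFreeSpace(path, &free_bytes);
  return !s.IsNotSupported();  // NotSupported => manual recovery only
}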
For automatic background recovery, leave it + // as is regardless of success or failure as it will be retried + if (is_manual || s.IsShutdownInProgress() || + bg_error_.severity() >= Status::Severity::kFatalError) { + recovery_in_prog_ = false; + } + return s; +#else + (void)is_manual; + return bg_error_; +#endif +} } diff -Nru rocksdb-5.15.10/db/error_handler.h rocksdb-5.17.2/db/error_handler.h --- rocksdb-5.15.10/db/error_handler.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/error_handler.h 2018-11-12 19:57:32.000000000 +0000 @@ -11,42 +11,65 @@ namespace rocksdb { +class DBImpl; + class ErrorHandler { public: - ErrorHandler(const ImmutableDBOptions& db_options, - InstrumentedMutex* db_mutex) - : db_options_(db_options), - bg_error_(Status::OK()), - db_mutex_(db_mutex) - {} - ~ErrorHandler() {} - - Status::Severity GetErrorSeverity(BackgroundErrorReason reason, - Status::Code code, Status::SubCode subcode); - - Status SetBGError(const Status& bg_err, BackgroundErrorReason reason); - - Status GetBGError() - { - return bg_error_; + ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options, + InstrumentedMutex* db_mutex) + : db_(db), + db_options_(db_options), + bg_error_(Status::OK()), + recovery_error_(Status::OK()), + db_mutex_(db_mutex), + auto_recovery_(false), + recovery_in_prog_(false) {} + ~ErrorHandler() {} + + void EnableAutoRecovery() { auto_recovery_ = true; } + + Status::Severity GetErrorSeverity(BackgroundErrorReason reason, + Status::Code code, + Status::SubCode subcode); + + Status SetBGError(const Status& bg_err, BackgroundErrorReason reason); + + Status GetBGError() { return bg_error_; } + + Status GetRecoveryError() { return recovery_error_; } + + Status ClearBGError(); + + bool IsDBStopped() { + return !bg_error_.ok() && + bg_error_.severity() >= Status::Severity::kHardError; } - void ClearBGError() { - bg_error_ = Status::OK(); + bool IsBGWorkStopped() { + return !bg_error_.ok() && + (bg_error_.severity() >= Status::Severity::kHardError || + !auto_recovery_); } - bool IsDBStopped() { - return !bg_error_.ok(); - } + bool IsRecoveryInProgress() { return recovery_in_prog_; } - bool IsBGWorkStopped() { - return !bg_error_.ok(); - } + Status RecoverFromBGError(bool is_manual = false); + void CancelErrorRecovery(); - private: + private: + DBImpl* db_; const ImmutableDBOptions& db_options_; Status bg_error_; + // A seperate Status variable used to record any errors during the + // recovery process from hard errors + Status recovery_error_; InstrumentedMutex* db_mutex_; + // A flag indicating whether automatic recovery from errors is enabled + bool auto_recovery_; + bool recovery_in_prog_; + + Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery); + void RecoverFromNoSpace(); }; } diff -Nru rocksdb-5.15.10/db/error_handler_test.cc rocksdb-5.17.2/db/error_handler_test.cc --- rocksdb-5.15.10/db/error_handler_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/error_handler_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -6,9 +6,12 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#ifndef ROCKSDB_LITE + #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" +#include "rocksdb/sst_file_manager.h" #include "util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) #include "util/sync_point.h" @@ -33,36 +36,139 @@ bool trig_io_error; }; +class ErrorHandlerListener : public EventListener { + public: + ErrorHandlerListener() + : mutex_(), + cv_(&mutex_), + no_auto_recovery_(false), + recovery_complete_(false), + file_creation_started_(false), + override_bg_error_(false), + file_count_(0), + fault_env_(nullptr) {} + + void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& /*ti*/) override { + InstrumentedMutexLock l(&mutex_); + file_creation_started_ = true; + if (file_count_ > 0) { + if (--file_count_ == 0) { + fault_env_->SetFilesystemActive(false, file_creation_error_); + file_creation_error_ = Status::OK(); + } + } + cv_.SignalAll(); + } + + void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/, + Status /*bg_error*/, + bool* auto_recovery) override { + if (*auto_recovery && no_auto_recovery_) { + *auto_recovery = false; + } + } + + void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override { + InstrumentedMutexLock l(&mutex_); + recovery_complete_ = true; + cv_.SignalAll(); + } + + bool WaitForRecovery(uint64_t /*abs_time_us*/) { + InstrumentedMutexLock l(&mutex_); + while (!recovery_complete_) { + cv_.Wait(/*abs_time_us*/); + } + if (recovery_complete_) { + recovery_complete_ = false; + return true; + } + return false; + } + + void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) { + InstrumentedMutexLock l(&mutex_); + while (!file_creation_started_) { + cv_.Wait(/*abs_time_us*/); + } + file_creation_started_ = false; + } + + void OnBackgroundError(BackgroundErrorReason /*reason*/, + Status* bg_error) override { + if (override_bg_error_) { + *bg_error = bg_error_; + override_bg_error_ = false; + } + } + + void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; } + + void OverrideBGError(Status bg_err) { + bg_error_ = bg_err; + override_bg_error_ = true; + } + + void InjectFileCreationError(FaultInjectionTestEnv* env, int file_count, + Status s) { + fault_env_ = env; + file_count_ = file_count; + file_creation_error_ = s; + } + + private: + InstrumentedMutex mutex_; + InstrumentedCondVar cv_; + bool no_auto_recovery_; + bool recovery_complete_; + bool file_creation_started_; + bool override_bg_error_; + int file_count_; + Status file_creation_error_; + Status bg_error_; + FaultInjectionTestEnv* fault_env_; +}; + TEST_F(DBErrorHandlingTest, FLushWriteError) { std::unique_ptr fault_env( new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); Options options = GetDefaultOptions(); options.create_if_missing = true; options.env = fault_env.get(); + options.listeners.emplace_back(listener); Status s; + + listener->EnableAutoRecovery(false); DestroyAndReopen(options); - Put(Key(0), "va;"); + Put(Key(0), "val"); SyncPoint::GetInstance()->SetCallBack( "FlushJob::Start", [&](void *) { fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kSoftError); + ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); fault_env->SetFilesystemActive(true); s = dbfull()->Resume(); ASSERT_EQ(s, Status::OK()); + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); 
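// Illustrative aside (editor's sketch, not part of the diff): outside the test
// harness, an application listener can use the two new callbacks the same way,
// for example to veto automatic recovery or to log its completion. The class
// name and the policy shown are hypothetical; the callback signatures follow
// the ones used by the test listener above.
#include <cstdio>
#include "rocksdb/listener.h"

class MyErrorListener : public rocksdb::EventListener {
 public:
  void OnErrorRecoveryBegin(rocksdb::BackgroundErrorReason /*reason*/,
                            rocksdb::Status bg_error,
                            bool* auto_recovery) override {
    // Leaving *auto_recovery false keeps the DB stopped until the
    // application calls DB::Resume() itself.
    if (bg_error.IsNoSpace()) {
      *auto_recovery = false;
    }
  }
  void OnErrorRecoveryCompleted(rocksdb::Status old_bg_error) override {
    fprintf(stderr, "recovered from: %s\n", old_bg_error.ToString().c_str());
  }
};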
Destroy(options); } TEST_F(DBErrorHandlingTest, CompactionWriteError) { std::unique_ptr fault_env( new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); Options options = GetDefaultOptions(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); options.env = fault_env.get(); Status s; DestroyAndReopen(options); @@ -72,6 +178,10 @@ s = Flush(); ASSERT_EQ(s, Status::OK()); + listener->OverrideBGError( + Status(Status::NoSpace(), Status::Severity::kHardError) + ); + listener->EnableAutoRecovery(false); rocksdb::SyncPoint::GetInstance()->LoadDependency( {{"FlushMemTableFinished", "BackgroundCallCompaction:0"}}); rocksdb::SyncPoint::GetInstance()->SetCallBack( @@ -85,7 +195,7 @@ ASSERT_EQ(s, Status::OK()); s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kSoftError); + ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError); fault_env->SetFilesystemActive(true); s = dbfull()->Resume(); @@ -129,6 +239,439 @@ Destroy(options); } +TEST_F(DBErrorHandlingTest, AutoRecoverFlushError) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + Put(Key(0), "val"); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_env->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + s = Put(Key(1), "val"); + ASSERT_EQ(s, Status::OK()); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Destroy(options); +} + +TEST_F(DBErrorHandlingTest, FailRecoverFlushError) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + Put(Key(0), "val"); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError); + // We should be able to shutdown the database while auto recovery is going + // on in the background + Close(); + DestroyDB(dbname_, options); +} + +TEST_F(DBErrorHandlingTest, WALWriteError) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + Random rnd(301); + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + { + WriteBatch batch; + + for (auto i = 0; i<100; ++i) { + 
batch.Put(Key(i), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + }; + + { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i<199; ++i) { + batch.Put(Key(i), RandomString(&rnd, 1024)); + } + + SyncPoint::GetInstance()->SetCallBack("WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(s, s.NoSpace()); + } + SyncPoint::GetInstance()->DisableProcessing(); + fault_env->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + for (auto i=0; i<199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Reopen(options); + for (auto i=0; i<199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +TEST_F(DBErrorHandlingTest, MultiCFWALWriteError) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + Random rnd(301); + + listener->EnableAutoRecovery(); + CreateAndReopenWithCF({"one", "two", "three"}, options); + + { + WriteBatch batch; + + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 100; ++j) { + batch.Put(handles_[i], Key(j), RandomString(&rnd, 1024)); + } + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + }; + + { + WriteBatch batch; + int write_error = 0; + + // Write to one CF + for (auto i = 100; i < 199; ++i) { + batch.Put(handles_[2], Key(i), RandomString(&rnd, 1024)); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_env->SetFilesystemActive(false, + Status::NoSpace("Out of space")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(s, s.NoSpace()); + } + SyncPoint::GetInstance()->DisableProcessing(); + fault_env->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + for (auto i = 1; i < 4; ++i) { + // Every CF should have been flushed + ASSERT_EQ(NumTableFilesAtLevel(0, i), 1); + } + + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 199; ++j) { + if (j < 100) { + ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); + } + } + } + ReopenWithColumnFamilies({"default", "one", "two", "three"}, options); + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 199; ++j) { + if (j < 100) { + ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); + } + } + } + Close(); +} + +TEST_F(DBErrorHandlingTest, MultiDBCompactionError) { + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); + std::vector> fault_env; + std::vector options; + std::vector> listener; + std::vector db; + std::shared_ptr sfm(NewSstFileManager(def_env)); + int kNumDbInstances = 
3; + Random rnd(301); + + for (auto i = 0; i < kNumDbInstances; ++i) { + listener.emplace_back(new ErrorHandlerListener()); + options.emplace_back(GetDefaultOptions()); + fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default())); + options[i].create_if_missing = true; + options[i].level0_file_num_compaction_trigger = 2; + options[i].writable_file_max_buffer_size = 32768; + options[i].env = fault_env[i].get(); + options[i].listeners.emplace_back(listener[i]); + options[i].sst_file_manager = sfm; + DB* dbptr; + char buf[16]; + + listener[i]->EnableAutoRecovery(); + // Setup for returning error for the 3rd SST, which would be level 1 + listener[i]->InjectFileCreationError(fault_env[i].get(), 3, + Status::NoSpace("Out of space")); + snprintf(buf, sizeof(buf), "_%d", i); + DestroyDB(dbname_ + std::string(buf), options[i]); + ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), + Status::OK()); + db.emplace_back(dbptr); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + for (auto j = 0; j <= 100; ++j) { + batch.Put(Key(j), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + } + + def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + // Write to one CF + for (auto j = 100; j < 199; ++j) { + batch.Put(Key(j), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + Status s = static_cast(db[i])->TEST_WaitForCompact(true); + ASSERT_EQ(s.severity(), Status::Severity::kSoftError); + fault_env[i]->SetFilesystemActive(true); + } + + def_env->SetFilesystemActive(true); + for (auto i = 0; i < kNumDbInstances; ++i) { + std::string prop; + ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); + EXPECT_TRUE(db[i]->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(0), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 0); + EXPECT_TRUE(db[i]->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(1), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 1); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + char buf[16]; + snprintf(buf, sizeof(buf), "_%d", i); + delete db[i]; + fault_env[i]->SetFilesystemActive(true); + if (getenv("KEEP_DB")) { + printf("DB is still at %s%s\n", dbname_.c_str(), buf); + } else { + Status s = DestroyDB(dbname_ + std::string(buf), options[i]); + } + } + options.clear(); + sfm.reset(); + delete def_env; +} + +TEST_F(DBErrorHandlingTest, MultiDBVariousErrors) { + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); + std::vector> fault_env; + std::vector options; + std::vector> listener; + std::vector db; + std::shared_ptr sfm(NewSstFileManager(def_env)); + int kNumDbInstances = 3; + Random rnd(301); + + for (auto i = 0; i < kNumDbInstances; ++i) { + listener.emplace_back(new ErrorHandlerListener()); + options.emplace_back(GetDefaultOptions()); + fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default())); + options[i].create_if_missing = true; + options[i].level0_file_num_compaction_trigger = 2; + options[i].writable_file_max_buffer_size = 32768; + options[i].env = fault_env[i].get(); + options[i].listeners.emplace_back(listener[i]); + options[i].sst_file_manager = sfm; + DB* dbptr; + char 
buf[16]; + + listener[i]->EnableAutoRecovery(); + switch (i) { + case 0: + // Setup for returning error for the 3rd SST, which would be level 1 + listener[i]->InjectFileCreationError(fault_env[i].get(), 3, + Status::NoSpace("Out of space")); + break; + case 1: + // Setup for returning error after the 1st SST, which would result + // in a hard error + listener[i]->InjectFileCreationError(fault_env[i].get(), 2, + Status::NoSpace("Out of space")); + break; + default: + break; + } + snprintf(buf, sizeof(buf), "_%d", i); + DestroyDB(dbname_ + std::string(buf), options[i]); + ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), + Status::OK()); + db.emplace_back(dbptr); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + for (auto j = 0; j <= 100; ++j) { + batch.Put(Key(j), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + } + + def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + // Write to one CF + for (auto j = 100; j < 199; ++j) { + batch.Put(Key(j), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + if (i != 1) { + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + } else { + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::NoSpace()); + } + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + Status s = static_cast(db[i])->TEST_WaitForCompact(true); + switch (i) { + case 0: + ASSERT_EQ(s.severity(), Status::Severity::kSoftError); + break; + case 1: + ASSERT_EQ(s.severity(), Status::Severity::kHardError); + break; + case 2: + ASSERT_EQ(s, Status::OK()); + break; + } + fault_env[i]->SetFilesystemActive(true); + } + + def_env->SetFilesystemActive(true); + for (auto i = 0; i < kNumDbInstances; ++i) { + std::string prop; + if (i < 2) { + ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); + } + if (i == 1) { + ASSERT_EQ(static_cast(db[i])->TEST_WaitForCompact(true), + Status::OK()); + } + EXPECT_TRUE(db[i]->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(0), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 0); + EXPECT_TRUE(db[i]->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(1), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 1); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + char buf[16]; + snprintf(buf, sizeof(buf), "_%d", i); + fault_env[i]->SetFilesystemActive(true); + delete db[i]; + if (getenv("KEEP_DB")) { + printf("DB is still at %s%s\n", dbname_.c_str(), buf); + } else { + DestroyDB(dbname_ + std::string(buf), options[i]); + } + } + options.clear(); + delete def_env; +} + } // namespace rocksdb int main(int argc, char** argv) { @@ -136,3 +679,13 @@ ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff -Nru rocksdb-5.15.10/db/event_helpers.cc rocksdb-5.17.2/db/event_helpers.cc --- rocksdb-5.15.10/db/event_helpers.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/event_helpers.cc 2018-11-12 19:57:32.000000000 +0000 @@ -40,8 +40,8 @@ void EventHelpers::NotifyOnBackgroundError( const std::vector>& listeners, - BackgroundErrorReason reason, Status* bg_error, - InstrumentedMutex* db_mutex) { + 
BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex, + bool* auto_recovery) { #ifndef ROCKSDB_LITE if (listeners.size() == 0U) { return; @@ -51,6 +51,9 @@ db_mutex->Unlock(); for (auto& listener : listeners) { listener->OnBackgroundError(reason, bg_error); + if (*auto_recovery) { + listener->OnErrorRecoveryBegin(reason, *bg_error, auto_recovery); + } } db_mutex->Lock(); #else @@ -58,6 +61,7 @@ (void)reason; (void)bg_error; (void)db_mutex; + (void)auto_recovery; #endif // ROCKSDB_LITE } @@ -167,4 +171,25 @@ #endif // !ROCKSDB_LITE } +void EventHelpers::NotifyOnErrorRecoveryCompleted( + const std::vector>& listeners, + Status old_bg_error, InstrumentedMutex* db_mutex) { +#ifndef ROCKSDB_LITE + if (listeners.size() == 0U) { + return; + } + db_mutex->AssertHeld(); + // release lock while notifying events + db_mutex->Unlock(); + for (auto& listener : listeners) { + listener->OnErrorRecoveryCompleted(old_bg_error); + } + db_mutex->Lock(); +#else + (void)listeners; + (void)old_bg_error; + (void)db_mutex; +#endif // ROCKSDB_LITE +} + } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/event_helpers.h rocksdb-5.17.2/db/event_helpers.h --- rocksdb-5.15.10/db/event_helpers.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/event_helpers.h 2018-11-12 19:57:32.000000000 +0000 @@ -28,7 +28,7 @@ static void NotifyOnBackgroundError( const std::vector>& listeners, BackgroundErrorReason reason, Status* bg_error, - InstrumentedMutex* db_mutex); + InstrumentedMutex* db_mutex, bool* auto_recovery); static void LogAndNotifyTableFileCreationFinished( EventLogger* event_logger, const std::vector>& listeners, @@ -41,6 +41,9 @@ uint64_t file_number, const std::string& file_path, const Status& status, const std::string& db_name, const std::vector>& listeners); + static void NotifyOnErrorRecoveryCompleted( + const std::vector>& listeners, + Status bg_error, InstrumentedMutex* db_mutex); private: static void LogAndNotifyTableFileCreation( diff -Nru rocksdb-5.15.10/db/external_sst_file_ingestion_job.cc rocksdb-5.17.2/db/external_sst_file_ingestion_job.cc --- rocksdb-5.15.10/db/external_sst_file_ingestion_job.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/external_sst_file_ingestion_job.cc 2018-11-12 19:57:32.000000000 +0000 @@ -29,7 +29,8 @@ namespace rocksdb { Status ExternalSstFileIngestionJob::Prepare( - const std::vector& external_files_paths, SuperVersion* sv) { + const std::vector& external_files_paths, + uint64_t next_file_number, SuperVersion* sv) { Status status; // Read the information of files we are ingesting @@ -90,7 +91,7 @@ // Copy/Move external files into DB for (IngestedFileInfo& f : files_to_ingest_) { - f.fd = FileDescriptor(versions_->NewFileNumber(), 0, f.file_size); + f.fd = FileDescriptor(next_file_number++, 0, f.file_size); const std::string path_outside_db = f.external_file_path; const std::string path_inside_db = @@ -343,7 +344,7 @@ file_to_ingest->global_seqno_offset = 0; return Status::Corruption("Was not able to find file global seqno field"); } - file_to_ingest->global_seqno_offset = offsets_iter->second; + file_to_ingest->global_seqno_offset = static_cast(offsets_iter->second); } else if (file_to_ingest->version == 1) { // SST file V1 should not have global seqno field assert(seqno_iter == uprops.end()); @@ -475,9 +476,9 @@ const SequenceNumber level_largest_seqno = (*max_element(level_files.begin(), level_files.end(), [](FileMetaData* f1, FileMetaData* f2) { - return f1->largest_seqno < f2->largest_seqno; + return f1->fd.largest_seqno < 
f2->fd.largest_seqno; })) - ->largest_seqno; + ->fd.largest_seqno; // should only assign seqno to current level's largest seqno when // the file fits if (level_largest_seqno != 0 && @@ -522,7 +523,7 @@ // at some upper level for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) { for (auto file : vstorage->LevelFiles(lvl)) { - if (file->smallest_seqno == 0) { + if (file->fd.smallest_seqno == 0) { return Status::InvalidArgument( "Can't ingest_behind file as despite allow_ingest_behind=true " "there are files with 0 seqno in database at upper levels!"); @@ -547,24 +548,27 @@ "field"); } - std::unique_ptr rwfile; - Status status = env_->NewRandomRWFile(file_to_ingest->internal_file_path, - &rwfile, env_options_); - if (!status.ok()) { - return status; + if (ingestion_options_.write_global_seqno) { + // Determine if we can write global_seqno to a given offset of file. + // If the file system does not support random write, then we should not. + // Otherwise we should. + std::unique_ptr rwfile; + Status status = env_->NewRandomRWFile(file_to_ingest->internal_file_path, + &rwfile, env_options_); + if (status.ok()) { + std::string seqno_val; + PutFixed64(&seqno_val, seqno); + status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val); + if (!status.ok()) { + return status; + } + } else if (!status.IsNotSupported()) { + return status; + } } - // Write the new seqno in the global sequence number field in the file - std::string seqno_val; - PutFixed64(&seqno_val, seqno); - status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val); - if (status.ok()) { - status = rwfile->Fsync(); - } - if (status.ok()) { - file_to_ingest->assigned_seqno = seqno; - } - return status; + file_to_ingest->assigned_seqno = seqno; + return Status::OK(); } bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( diff -Nru rocksdb-5.15.10/db/external_sst_file_ingestion_job.h rocksdb-5.17.2/db/external_sst_file_ingestion_job.h --- rocksdb-5.15.10/db/external_sst_file_ingestion_job.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/external_sst_file_ingestion_job.h 2018-11-12 19:57:32.000000000 +0000 @@ -89,7 +89,7 @@ // Prepare the job by copying external files into the DB. Status Prepare(const std::vector& external_files_paths, - SuperVersion* sv); + uint64_t next_file_number, SuperVersion* sv); // Check if we need to flush the memtable before running the ingestion job // This will be true if the files we are ingesting are overlapping with any diff -Nru rocksdb-5.15.10/db/flush_job.cc rocksdb-5.17.2/db/flush_job.cc --- rocksdb-5.15.10/db/flush_job.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/flush_job.cc 2018-11-12 19:57:32.000000000 +0000 @@ -78,6 +78,8 @@ return "Auto Compaction"; case FlushReason::kManualFlush: return "Manual Flush"; + case FlushReason::kErrorRecovery: + return "Error Recovery"; default: return "Invalid"; } @@ -371,8 +373,8 @@ s.ToString().c_str(), meta_.marked_for_compaction ? 
" (needs compaction)" : ""); - if (output_file_directory_ != nullptr) { - output_file_directory_->Fsync(); + if (s.ok() && output_file_directory_ != nullptr) { + s = output_file_directory_->Fsync(); } TEST_SYNC_POINT("FlushJob::WriteLevel0Table"); db_mutex_->Lock(); @@ -389,7 +391,7 @@ // Add file to L0 edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(), meta_.fd.GetFileSize(), meta_.smallest, meta_.largest, - meta_.smallest_seqno, meta_.largest_seqno, + meta_.fd.smallest_seqno, meta_.fd.largest_seqno, meta_.marked_for_compaction); } diff -Nru rocksdb-5.15.10/db/flush_job_test.cc rocksdb-5.17.2/db/flush_job_test.cc --- rocksdb-5.15.10/db/flush_job_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/flush_job_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -62,7 +62,7 @@ manifest, &file, env_->OptimizeForManifestWrite(env_options_)); ASSERT_OK(s); unique_ptr file_writer( - new WritableFileWriter(std::move(file), EnvOptions())); + new WritableFileWriter(std::move(file), manifest, EnvOptions())); { log::Writer log(std::move(file_writer), 0, false); std::string record; @@ -147,19 +147,20 @@ db_options_.statistics.get(), &event_logger, true); HistogramData hist; - FileMetaData fd; + FileMetaData file_meta; mutex_.Lock(); flush_job.PickMemTable(); - ASSERT_OK(flush_job.Run(nullptr, &fd)); + ASSERT_OK(flush_job.Run(nullptr, &file_meta)); mutex_.Unlock(); db_options_.statistics->histogramData(FLUSH_TIME, &hist); ASSERT_GT(hist.average, 0.0); - ASSERT_EQ(ToString(0), fd.smallest.user_key().ToString()); - ASSERT_EQ("9999a", - fd.largest.user_key().ToString()); // range tombstone end key - ASSERT_EQ(1, fd.smallest_seqno); - ASSERT_EQ(10000, fd.largest_seqno); // range tombstone seqnum 10000 + ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString()); + ASSERT_EQ( + "9999a", + file_meta.largest.user_key().ToString()); // range tombstone end key + ASSERT_EQ(1, file_meta.fd.smallest_seqno); + ASSERT_EQ(10000, file_meta.fd.largest_seqno); // range tombstone seqnum 10000 mock_table_factory_->AssertSingleFile(inserted_keys); job_context.Clean(); } diff -Nru rocksdb-5.15.10/db/forward_iterator.cc rocksdb-5.17.2/db/forward_iterator.cc --- rocksdb-5.15.10/db/forward_iterator.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/forward_iterator.cc 2018-11-12 19:57:32.000000000 +0000 @@ -916,21 +916,13 @@ uint32_t ForwardIterator::FindFileInRange( const std::vector& files, const Slice& internal_key, uint32_t left, uint32_t right) { - while (left < right) { - uint32_t mid = (left + right) / 2; - const FileMetaData* f = files[mid]; - if (cfd_->internal_comparator().InternalKeyComparator::Compare( - f->largest.Encode(), internal_key) < 0) { - // Key at "mid.largest" is < "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. 
- right = mid; - } - } - return right; + auto cmp = [&](const FileMetaData* f, const Slice& key) -> bool { + return cfd_->internal_comparator().InternalKeyComparator::Compare( + f->largest.Encode(), key) < 0; + }; + const auto &b = files.begin(); + return static_cast(std::lower_bound(b + left, + b + right, internal_key, cmp) - b); } void ForwardIterator::DeleteIterator(InternalIterator* iter, bool is_arena) { diff -Nru rocksdb-5.15.10/db/job_context.h rocksdb-5.17.2/db/job_context.h --- rocksdb-5.15.10/db/job_context.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/job_context.h 2018-11-12 19:57:32.000000000 +0000 @@ -35,6 +35,14 @@ explicit SuperVersionContext(bool create_superversion = false) : new_superversion(create_superversion ? new SuperVersion() : nullptr) {} + explicit SuperVersionContext(SuperVersionContext&& other) + : superversions_to_free(std::move(other.superversions_to_free)), +#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION + write_stall_notifications(std::move(other.write_stall_notifications)), +#endif + new_superversion(std::move(other.new_superversion)) { + } + void NewSuperVersion() { new_superversion = unique_ptr(new SuperVersion()); } @@ -98,8 +106,15 @@ } inline bool HaveSomethingToClean() const { + bool sv_have_sth = false; + for (const auto& sv_ctx : superversion_contexts) { + if (sv_ctx.HaveSomethingToDelete()) { + sv_have_sth = true; + break; + } + } return memtables_to_free.size() > 0 || logs_to_free.size() > 0 || - superversion_context.HaveSomethingToDelete(); + sv_have_sth; } // Structure to store information for candidate files to delete. @@ -142,7 +157,8 @@ // a list of memtables to be free autovector memtables_to_free; - SuperVersionContext superversion_context; + // contexts for installing superversions for multiple column families + std::vector superversion_contexts; autovector logs_to_free; @@ -158,13 +174,14 @@ size_t num_alive_log_files = 0; uint64_t size_log_to_delete = 0; - explicit JobContext(int _job_id, bool create_superversion = false) - : superversion_context(create_superversion) { + explicit JobContext(int _job_id, bool create_superversion = false) { job_id = _job_id; manifest_file_number = 0; pending_manifest_file_number = 0; log_number = 0; prev_log_number = 0; + superversion_contexts.emplace_back( + SuperVersionContext(create_superversion)); } // For non-empty JobContext Clean() has to be called at least once before @@ -173,7 +190,9 @@ // doing potentially slow Clean() with locked DB mutex. 
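// Illustrative aside (editor's sketch, not part of the diff): the
// FindFileInRange change above replaces a hand-rolled binary search with
// std::lower_bound over the [left, right) index range. The same pattern in
// isolation, with a hypothetical File type and plain string comparison
// standing in for FileMetaData and the InternalKeyComparator:
#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

struct File { std::string largest; };

uint32_t FindFileInRangeSketch(const std::vector<const File*>& files,
                               const std::string& key, uint32_t left,
                               uint32_t right) {
  auto cmp = [](const File* f, const std::string& k) {
    return f->largest < k;  // stand-in for Compare(f->largest, key) < 0
  };
  const auto b = files.begin();
  return static_cast<uint32_t>(
      std::lower_bound(b + left, b + right, key, cmp) - b);
}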
void Clean() { // free superversions - superversion_context.Clean(); + for (auto& sv_context : superversion_contexts) { + sv_context.Clean(); + } // free pending memtables for (auto m : memtables_to_free) { delete m; diff -Nru rocksdb-5.15.10/db/listener_test.cc rocksdb-5.17.2/db/listener_test.cc --- rocksdb-5.15.10/db/listener_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/listener_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -417,7 +417,9 @@ for (int i = 0; static_cast(cf_meta.file_count) < kSlowdownTrigger * 10; ++i) { Put(1, ToString(i), std::string(10000, 'x'), WriteOptions()); - db_->Flush(FlushOptions(), handles_[1]); + FlushOptions fo; + fo.allow_write_stall = true; + db_->Flush(fo, handles_[1]); db_->GetColumnFamilyMetaData(handles_[1], &cf_meta); } ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9); @@ -880,10 +882,13 @@ ASSERT_EQ(1, listener->counter()); // trigger flush so compaction is triggered again; this time it succeeds + // The previous failed compaction may get retried automatically, so we may + // be left with 0 or 1 files in level 1, depending on when the retry gets + // scheduled ASSERT_OK(Put("key0", "val")); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_LE(1, NumTableFilesAtLevel(0)); } } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/log_format.h rocksdb-5.17.2/db/log_format.h --- rocksdb-5.15.10/db/log_format.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/log_format.h 2018-11-12 19:57:32.000000000 +0000 @@ -37,9 +37,9 @@ // Header is checksum (4 bytes), length (2 bytes), type (1 byte) static const int kHeaderSize = 4 + 2 + 1; -// Recyclable header is checksum (4 bytes), type (1 byte), log number -// (4 bytes), length (2 bytes). -static const int kRecyclableHeaderSize = 4 + 1 + 4 + 2; +// Recyclable header is checksum (4 bytes), length (2 bytes), type (1 byte), +// log number (4 bytes). +static const int kRecyclableHeaderSize = 4 + 2 + 1 + 4; } // namespace log } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/log_reader.cc rocksdb-5.17.2/db/log_reader.cc --- rocksdb-5.15.10/db/log_reader.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/log_reader.cc 2018-11-12 19:57:32.000000000 +0000 @@ -24,7 +24,7 @@ Reader::Reader(std::shared_ptr info_log, unique_ptr&& _file, Reporter* reporter, - bool checksum, uint64_t initial_offset, uint64_t log_num) + bool checksum, uint64_t log_num) : info_log_(info_log), file_(std::move(_file)), reporter_(reporter), @@ -36,7 +36,6 @@ eof_offset_(0), last_record_offset_(0), end_of_buffer_offset_(0), - initial_offset_(initial_offset), log_number_(log_num), recycled_(false) {} @@ -44,29 +43,6 @@ delete[] backing_store_; } -bool Reader::SkipToInitialBlock() { - size_t initial_offset_in_block = initial_offset_ % kBlockSize; - uint64_t block_start_location = initial_offset_ - initial_offset_in_block; - - // Don't search a block if we'd be in the trailer - if (initial_offset_in_block > kBlockSize - 6) { - block_start_location += kBlockSize; - } - - end_of_buffer_offset_ = block_start_location; - - // Skip to start of first block that can contain the initial record - if (block_start_location > 0) { - Status skip_status = file_->Skip(block_start_location); - if (!skip_status.ok()) { - ReportDrop(static_cast(block_start_location), skip_status); - return false; - } - } - - return true; -} - // For kAbsoluteConsistency, on clean shutdown we don't expect any error // in the log files. 
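// Illustrative aside (editor's sketch, not part of the diff): the log_format.h
// change above reorders the documented recyclable header to match what the
// writer actually emits: checksum (4B), length (2B), type (1B), log number (4B),
// i.e. 11 bytes. Laying that out by hand, assuming a little-endian host so that
// memcpy matches the fixed-width little-endian encoding used by util/coding.h:
#include <cstdint>
#include <cstring>
#include <string>

std::string EncodeRecyclableHeaderSketch(uint32_t crc, uint16_t length,
                                         uint8_t type, uint32_t log_number) {
  std::string h(11, '\0');
  std::memcpy(&h[0], &crc, 4);         // checksum
  std::memcpy(&h[4], &length, 2);      // payload length
  h[6] = static_cast<char>(type);      // record type
  std::memcpy(&h[7], &log_number, 4);  // owning log number
  return h;  // 4 + 2 + 1 + 4 == kRecyclableHeaderSize
}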
For other modes, we can ignore only incomplete records // in the last log file, which are presumably due to a write in progress @@ -76,12 +52,6 @@ // restrict the inconsistency to only the last log bool Reader::ReadRecord(Slice* record, std::string* scratch, WALRecoveryMode wal_recovery_mode) { - if (last_record_offset_ < initial_offset_) { - if (!SkipToInitialBlock()) { - return false; - } - } - scratch->clear(); record->clear(); bool in_fragmented_record = false; @@ -299,8 +269,7 @@ } void Reader::ReportDrop(size_t bytes, const Status& reason) { - if (reporter_ != nullptr && - end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) { + if (reporter_ != nullptr) { reporter_->Corruption(bytes, reason); } } @@ -317,7 +286,7 @@ read_error_ = true; *error = kEof; return false; - } else if (buffer_.size() < (size_t)kBlockSize) { + } else if (buffer_.size() < static_cast(kBlockSize)) { eof_ = true; eof_offset_ = buffer_.size(); } @@ -342,7 +311,7 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) { while (true) { // We need at least the minimum header size - if (buffer_.size() < (size_t)kHeaderSize) { + if (buffer_.size() < static_cast(kHeaderSize)) { int r; if (!ReadMore(drop_size, &r)) { return r; @@ -363,7 +332,7 @@ } header_size = kRecyclableHeaderSize; // We need enough for the larger header - if (buffer_.size() < (size_t)kRecyclableHeaderSize) { + if (buffer_.size() < static_cast(kRecyclableHeaderSize)) { int r; if (!ReadMore(drop_size, &r)) { return r; @@ -417,13 +386,6 @@ buffer_.remove_prefix(header_size + length); - // Skip physical record that started before initial_offset_ - if (end_of_buffer_offset_ - buffer_.size() - header_size - length < - initial_offset_) { - result->clear(); - return kBadRecord; - } - *result = Slice(header + header_size, length); return type; } diff -Nru rocksdb-5.15.10/db/log_reader.h rocksdb-5.17.2/db/log_reader.h --- rocksdb-5.15.10/db/log_reader.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/log_reader.h 2018-11-12 19:57:32.000000000 +0000 @@ -50,14 +50,10 @@ // live while this Reader is in use. // // If "checksum" is true, verify checksums if available. - // - // The Reader will start reading at the first record located at physical - // position >= initial_offset within the file. Reader(std::shared_ptr info_log, - // @lint-ignore TXT2 T25377293 Grandfathered in - unique_ptr&& file, - Reporter* reporter, bool checksum, uint64_t initial_offset, - uint64_t log_num); + // @lint-ignore TXT2 T25377293 Grandfathered in + unique_ptr&& file, Reporter* reporter, + bool checksum, uint64_t log_num); ~Reader(); @@ -108,9 +104,6 @@ // Offset of the first location past the end of buffer_. uint64_t end_of_buffer_offset_; - // Offset at which to start looking for the first record to return - uint64_t const initial_offset_; - // which log number this is uint64_t const log_number_; @@ -124,7 +117,6 @@ // Currently there are three situations in which this happens: // * The record has an invalid CRC (ReadPhysicalRecord reports a drop) // * The record is a 0-length record (No drop is reported) - // * The record is below constructor's initial_offset (No drop is reported) kBadRecord = kMaxRecordType + 2, // Returned when we fail to read a valid header. kBadHeader = kMaxRecordType + 3, @@ -136,11 +128,6 @@ kBadRecordChecksum = kMaxRecordType + 6, }; - // Skips all blocks that are completely before "initial_offset_". - // - // Returns true on success. Handles reporting. 
- bool SkipToInitialBlock(); - // Return type, or one of the preceding special values unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size); diff -Nru rocksdb-5.15.10/db/log_test.cc rocksdb-5.17.2/db/log_test.cc --- rocksdb-5.15.10/db/log_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/log_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -159,12 +159,12 @@ LogTest() : reader_contents_(), dest_holder_(test::GetWritableFileWriter( - new test::StringSink(&reader_contents_))), + new test::StringSink(&reader_contents_), "" /* don't care */)), source_holder_(test::GetSequentialFileReader( new StringSource(reader_contents_), "" /* file name */)), writer_(std::move(dest_holder_), 123, GetParam()), - reader_(nullptr, std::move(source_holder_), &report_, true /*checksum*/, - 0 /*initial_offset*/, 123) { + reader_(nullptr, std::move(source_holder_), &report_, + true /* checksum */, 123 /* log_number */) { int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; initial_offset_last_record_offsets_[0] = 0; initial_offset_last_record_offsets_[1] = header_size + 10000; @@ -266,36 +266,6 @@ } } - void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { - WriteInitialOffsetLog(); - unique_ptr file_reader(test::GetSequentialFileReader( - new StringSource(reader_contents_), "" /* fname */)); - unique_ptr offset_reader( - new Reader(nullptr, std::move(file_reader), &report_, - true /*checksum*/, WrittenBytes() + offset_past_end, 123)); - Slice record; - std::string scratch; - ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch)); - } - - void CheckInitialOffsetRecord(uint64_t initial_offset, - int expected_record_offset) { - WriteInitialOffsetLog(); - unique_ptr file_reader(test::GetSequentialFileReader( - new StringSource(reader_contents_), "" /* fname */)); - unique_ptr offset_reader( - new Reader(nullptr, std::move(file_reader), &report_, - true /*checksum*/, initial_offset, 123)); - Slice record; - std::string scratch; - ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); - ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset], - record.size()); - ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], - offset_reader->LastRecordOffset()); - ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); - } - }; size_t LogTest::initial_offset_record_sizes_[] = @@ -590,55 +560,6 @@ } } -TEST_P(LogTest, ReadStart) { CheckInitialOffsetRecord(0, 0); } - -TEST_P(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); } - -TEST_P(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); } - -TEST_P(LogTest, ReadSecondStart) { - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; - CheckInitialOffsetRecord(10000 + header_size, 1); -} - -TEST_P(LogTest, ReadThirdOneOff) { - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; - CheckInitialOffsetRecord(10000 + header_size + 1, 2); -} - -TEST_P(LogTest, ReadThirdStart) { - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; - CheckInitialOffsetRecord(20000 + 2 * header_size, 2); -} - -TEST_P(LogTest, ReadFourthOneOff) { - int header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; - CheckInitialOffsetRecord(20000 + 2 * header_size + 1, 3); -} - -TEST_P(LogTest, ReadFourthFirstBlockTrailer) { - CheckInitialOffsetRecord(log::kBlockSize - 4, 3); -} - -TEST_P(LogTest, ReadFourthMiddleBlock) { - CheckInitialOffsetRecord(log::kBlockSize + 1, 3); -} - -TEST_P(LogTest, ReadFourthLastBlock) { - CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3); -} - -TEST_P(LogTest, ReadFourthStart) { - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; - CheckInitialOffsetRecord( - 2 * (header_size + 1000) + (2 * log::kBlockSize - 1000) + 3 * header_size, - 3); -} - -TEST_P(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); } - -TEST_P(LogTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); } - TEST_P(LogTest, ClearEofSingleBlock) { Write("foo"); Write("bar"); @@ -718,7 +639,8 @@ Write("xxxxxxxxxxxxxxxx"); } unique_ptr dest_holder(test::GetWritableFileWriter( - new test::OverwritingStringSink(get_reader_contents()))); + new test::OverwritingStringSink(get_reader_contents()), + "" /* don't care */)); Writer recycle_writer(std::move(dest_holder), 123, true); recycle_writer.AddRecord(Slice("foooo")); recycle_writer.AddRecord(Slice("bar")); diff -Nru rocksdb-5.15.10/db/malloc_stats.cc rocksdb-5.17.2/db/malloc_stats.cc --- rocksdb-5.15.10/db/malloc_stats.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/malloc_stats.cc 2018-11-12 19:57:32.000000000 +0000 @@ -18,9 +18,11 @@ #ifdef ROCKSDB_JEMALLOC #ifdef __FreeBSD__ #include -#define je_malloc_stats_print malloc_stats_print #else #include "jemalloc/jemalloc.h" +#ifdef JEMALLOC_NO_RENAME +#define malloc_stats_print je_malloc_stats_print +#endif #endif typedef struct { @@ -48,7 +50,7 @@ std::unique_ptr buf{new char[kMallocStatusLen + 1]}; mstat.cur = buf.get(); mstat.end = buf.get() + kMallocStatusLen; - je_malloc_stats_print(GetJemallocStatus, &mstat, ""); + malloc_stats_print(GetJemallocStatus, &mstat, ""); stats->append(buf.get()); } #else diff -Nru rocksdb-5.15.10/db/memtable.h rocksdb-5.17.2/db/memtable.h --- rocksdb-5.15.10/db/memtable.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/memtable.h 2018-11-12 19:57:32.000000000 +0000 @@ -34,7 +34,6 @@ class Mutex; class MemTableIterator; class MergeContext; -class InternalIterator; struct ImmutableMemTableOptions { explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions, @@ -337,6 +336,14 @@ mem_tracker_.DoneAllocating(); } + // Notify the underlying storage that all data it contained has been + // persisted. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. + void MarkFlushed() { + table_->MarkFlushed(); + } + // return true if the current MemTableRep supports merge operator. 
bool IsMergeOperatorSupported() const { return table_->IsMergeOperatorSupported(); diff -Nru rocksdb-5.15.10/db/memtable_list.cc rocksdb-5.17.2/db/memtable_list.cc --- rocksdb-5.15.10/db/memtable_list.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/memtable_list.cc 2018-11-12 19:57:32.000000000 +0000 @@ -248,6 +248,7 @@ assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable memlist_.remove(m); + m->MarkFlushed(); if (max_write_buffer_number_to_maintain_ > 0) { memlist_history_.push_front(m); TrimHistory(to_delete); diff -Nru rocksdb-5.15.10/db/merge_helper.h rocksdb-5.17.2/db/merge_helper.h --- rocksdb-5.15.10/db/merge_helper.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/merge_helper.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef MERGE_HELPER_H -#define MERGE_HELPER_H +#pragma once #include #include @@ -26,7 +25,6 @@ class Logger; class MergeOperator; class Statistics; -class InternalIterator; class MergeHelper { public: @@ -194,5 +192,3 @@ }; } // namespace rocksdb - -#endif diff -Nru rocksdb-5.15.10/db/obsolete_files_test.cc rocksdb-5.17.2/db/obsolete_files_test.cc --- rocksdb-5.15.10/db/obsolete_files_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/obsolete_files_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -227,16 +227,24 @@ } ASSERT_OK(dbi->EnableFileDeletions(true /* force */)); ASSERT_EQ(optsfiles_nums.size(), optsfiles_keep.size()); - int size = static_cast(optsfiles_nums.size()); - int kept_opts_files_count = 0; - for (int i = 0; i != size; ++i) { - if (optsfiles_keep[i]) { - ++kept_opts_files_count; - } - } - ASSERT_EQ(2, kept_opts_files_count); CloseDB(); + + std::vector files; + int opts_file_count = 0; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& file : files) { + uint64_t file_num; + Slice dummy_info_log_name_prefix; + FileType type; + WalFileType log_type; + if (ParseFileName(file, &file_num, dummy_info_log_name_prefix, &type, + &log_type) && + type == kOptionsFile) { + opts_file_count++; + } + } + ASSERT_EQ(2, opts_file_count); } } //namespace rocksdb diff -Nru rocksdb-5.15.10/db/perf_context_test.cc rocksdb-5.17.2/db/perf_context_test.cc --- rocksdb-5.15.10/db/perf_context_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/perf_context_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -469,7 +469,7 @@ ASSERT_GT(hist_num_memtable_checked.Average(), 0); // In read-only mode Get(), no super version operation is needed ASSERT_EQ(hist_get_post_process.Average(), 0); - ASSERT_EQ(hist_get_snapshot.Average(), 0); + ASSERT_GT(hist_get_snapshot.Average(), 0); ASSERT_GT(hist_mget.Average(), 0); ASSERT_GT(hist_mget_snapshot.Average(), 0); diff -Nru rocksdb-5.15.10/db/range_del_aggregator_bench.cc rocksdb-5.17.2/db/range_del_aggregator_bench.cc --- rocksdb-5.15.10/db/range_del_aggregator_bench.cc 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/db/range_del_aggregator_bench.cc 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,244 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include +#include +#include +#include +#include +#include +#include + +#include "db/range_del_aggregator.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/random.h" +#include "util/stop_watch.h" +#include "util/testutil.h" + +#include "util/gflags_compat.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_int32(num_range_tombstones, 1000, "number of range tombstones created"); + +DEFINE_int32(num_runs, 10000, "number of test runs"); + +DEFINE_int32(tombstone_start_upper_bound, 1000, + "exclusive upper bound on range tombstone start keys"); + +DEFINE_int32(should_delete_upper_bound, 1000, + "exclusive upper bound on keys passed to ShouldDelete"); + +DEFINE_double(tombstone_width_mean, 100.0, "average range tombstone width"); + +DEFINE_double(tombstone_width_stddev, 0.0, + "standard deviation of range tombstone width"); + +DEFINE_bool(use_collapsed, true, "use the collapsed range tombstone map"); + +DEFINE_int32(seed, 0, "random number generator seed"); + +DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run"); + +DEFINE_int32(add_tombstones_per_run, 1, + "number of AddTombstones calls per run"); + +namespace { + +struct Stats { + uint64_t time_add_tombstones = 0; + uint64_t time_first_should_delete = 0; + uint64_t time_rest_should_delete = 0; +}; + +std::ostream& operator<<(std::ostream& os, const Stats& s) { + std::ios fmt_holder(nullptr); + fmt_holder.copyfmt(os); + + os << std::left; + os << std::setw(25) << "AddTombstones: " + << s.time_add_tombstones / + (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3) + << " us\n"; + os << std::setw(25) << "ShouldDelete (first): " + << s.time_first_should_delete / (FLAGS_num_runs * 1.0e3) << " us\n"; + if (FLAGS_should_deletes_per_run > 1) { + os << std::setw(25) << "ShouldDelete (rest): " + << s.time_rest_should_delete / + ((FLAGS_should_deletes_per_run - 1) * FLAGS_num_runs * 1.0e3) + << " us\n"; + } + + os.copyfmt(fmt_holder); + return os; +} + +} // anonymous namespace + +namespace rocksdb { + +namespace { + +// A wrapper around RangeTombstones and the underlying data of its start and end +// keys. 
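// Illustrative aside (editor's sketch, not part of the diff): the Stats printer
// above saves and restores the stream's formatting state so that std::left and
// std::setw do not leak to the caller. The same idiom in isolation, with a
// hypothetical function name:
#include <iomanip>
#include <iostream>

void PrintAligned(std::ostream& os, double micros) {
  std::ios saved(nullptr);
  saved.copyfmt(os);  // snapshot flags, width, fill, precision
  os << std::left << std::setw(25) << "ShouldDelete:" << micros << " us\n";
  os.copyfmt(saved);  // restore the caller's formatting state
}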
+struct PersistentRangeTombstone { + std::string start_key; + std::string end_key; + RangeTombstone tombstone; + + PersistentRangeTombstone(std::string start, std::string end, + SequenceNumber seq) + : start_key(std::move(start)), end_key(std::move(end)) { + tombstone = RangeTombstone(start_key, end_key, seq); + } + + PersistentRangeTombstone() = default; + + PersistentRangeTombstone(const PersistentRangeTombstone& t) { *this = t; } + + PersistentRangeTombstone& operator=(const PersistentRangeTombstone& t) { + start_key = t.start_key; + end_key = t.end_key; + tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_); + + return *this; + } + + PersistentRangeTombstone(PersistentRangeTombstone&& t) noexcept { *this = t; } + + PersistentRangeTombstone& operator=(PersistentRangeTombstone&& t) { + start_key = std::move(t.start_key); + end_key = std::move(t.end_key); + tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_); + + return *this; + } +}; + +struct TombstoneStartKeyComparator { + explicit TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {} + + bool operator()(const RangeTombstone& a, const RangeTombstone& b) const { + return cmp->Compare(a.start_key_, b.start_key_) < 0; + } + + const Comparator* cmp; +}; + +std::unique_ptr MakeRangeDelIterator( + const std::vector& range_dels) { + std::vector keys, values; + for (const auto& range_del : range_dels) { + auto key_and_value = range_del.tombstone.Serialize(); + keys.push_back(key_and_value.first.Encode().ToString()); + values.push_back(key_and_value.second.ToString()); + } + return std::unique_ptr( + new test::VectorIterator(keys, values)); +} + +// convert long to a big-endian slice key +static std::string Key(int64_t val) { + std::string little_endian_key; + std::string big_endian_key; + PutFixed64(&little_endian_key, val); + assert(little_endian_key.size() == sizeof(val)); + big_endian_key.resize(sizeof(val)); + for (size_t i = 0; i < sizeof(val); ++i) { + big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i]; + } + return big_endian_key; +} + +} // anonymous namespace + +} // namespace rocksdb + +int main(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + + Stats stats; + rocksdb::Random64 rnd(FLAGS_seed); + std::default_random_engine random_gen(FLAGS_seed); + std::normal_distribution normal_dist(FLAGS_tombstone_width_mean, + FLAGS_tombstone_width_stddev); + std::vector > + all_persistent_range_tombstones(FLAGS_add_tombstones_per_run); + for (int i = 0; i < FLAGS_add_tombstones_per_run; i++) { + all_persistent_range_tombstones[i] = + std::vector( + FLAGS_num_range_tombstones); + } + auto mode = FLAGS_use_collapsed + ? rocksdb::RangeDelPositioningMode::kForwardTraversal + : rocksdb::RangeDelPositioningMode::kFullScan; + + for (int i = 0; i < FLAGS_num_runs; i++) { + auto icmp = rocksdb::InternalKeyComparator(rocksdb::BytewiseComparator()); + rocksdb::RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, + FLAGS_use_collapsed); + + for (auto& persistent_range_tombstones : all_persistent_range_tombstones) { + // TODO(abhimadan): consider whether creating the range tombstones right + // before AddTombstones is artificially warming the cache compared to + // real workloads. 
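// Illustrative aside (editor's sketch, not part of the diff): the benchmark's
// Key() helper above reverses a little-endian fixed-64 encoding so that the
// BytewiseComparator orders keys numerically. The same idea written directly,
// without relying on util/coding.h (function name is hypothetical):
#include <cstdint>
#include <string>

std::string BigEndianKey(uint64_t v) {
  std::string key(8, '\0');
  for (int i = 7; i >= 0; --i) {  // most significant byte first
    key[7 - i] = static_cast<char>((v >> (8 * i)) & 0xff);
  }
  return key;
}
// BigEndianKey(1) < BigEndianKey(2) < ... under memcmp/bytewise comparison.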
+ for (int j = 0; j < FLAGS_num_range_tombstones; j++) { + uint64_t start = rnd.Uniform(FLAGS_tombstone_start_upper_bound); + uint64_t end = start + std::max(1.0, normal_dist(random_gen)); + persistent_range_tombstones[j] = rocksdb::PersistentRangeTombstone( + rocksdb::Key(start), rocksdb::Key(end), j); + } + + auto range_del_iter = + rocksdb::MakeRangeDelIterator(persistent_range_tombstones); + rocksdb::StopWatchNano stop_watch_add_tombstones(rocksdb::Env::Default(), + true /* auto_start */); + range_del_agg.AddTombstones(std::move(range_del_iter)); + stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos(); + } + + rocksdb::ParsedInternalKey parsed_key; + parsed_key.sequence = FLAGS_num_range_tombstones / 2; + parsed_key.type = rocksdb::kTypeValue; + + uint64_t first_key = rnd.Uniform(FLAGS_should_delete_upper_bound - + FLAGS_should_deletes_per_run + 1); + + for (int j = 0; j < FLAGS_should_deletes_per_run; j++) { + std::string key_string = rocksdb::Key(first_key + j); + parsed_key.user_key = key_string; + + rocksdb::StopWatchNano stop_watch_should_delete(rocksdb::Env::Default(), + true /* auto_start */); + range_del_agg.ShouldDelete(parsed_key, mode); + uint64_t call_time = stop_watch_should_delete.ElapsedNanos(); + + if (j == 0) { + stats.time_first_should_delete += call_time; + } else { + stats.time_rest_should_delete += call_time; + } + } + } + + std::cout << "=========================\n" + << "Results:\n" + << "=========================\n" + << stats; + + return 0; +} + +#endif // GFLAGS diff -Nru rocksdb-5.15.10/db/range_del_aggregator.cc rocksdb-5.17.2/db/range_del_aggregator.cc --- rocksdb-5.15.10/db/range_del_aggregator.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/range_del_aggregator.cc 2018-11-12 19:57:32.000000000 +0000 @@ -76,7 +76,9 @@ return false; } - void AddTombstone(RangeTombstone tombstone) override { rep_.emplace(tombstone); } + void AddTombstone(RangeTombstone tombstone) override { + rep_.emplace(tombstone); + } size_t Size() const override { return rep_.size(); } @@ -171,7 +173,9 @@ const Comparator* ucmp_; public: - CollapsedRangeDelMap(const Comparator* ucmp) : ucmp_(ucmp) { + explicit CollapsedRangeDelMap(const Comparator* ucmp) + : rep_(stl_wrappers::LessOfComparator(ucmp)), + ucmp_(ucmp) { InvalidatePosition(); } @@ -265,22 +269,36 @@ // 2: c--- OR 2: c--- OR 2: c--- OR 2: c------ // 1: A--C 1: 1: A------ 1: C------ // ^ ^ ^ ^ - // Insert a new transition at the new tombstone's start point, or raise - // the existing transition at that point to the new tombstone's seqno. end_seq = prev_seq(); - rep_[t.start_key_] = t.seq_; // operator[] will overwrite existing entry + Rep::iterator pit; + if (it != rep_.begin() && (pit = std::prev(it)) != rep_.begin() && + ucmp_->Compare(pit->first, t.start_key_) == 0 && std::prev(pit)->second == t.seq_) { + // The new tombstone starts at the end of an existing tombstone with an + // identical seqno: + // + // 3: + // 2: A--C--- + // 1: + // ^ + // Merge the tombstones by removing the existing tombstone's end key. + it = rep_.erase(std::prev(it)); + } else { + // Insert a new transition at the new tombstone's start point, or raise + // the existing transition at that point to the new tombstone's seqno. 
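The ASCII diagrams in the surrounding comments describe the collapsed map as a sorted set of transition points: each entry maps a user key to the seqno of the tombstone in effect from that key up to the next entry, with seqno 0 meaning "not covered". A toy model of that representation and of the lookup ShouldDelete performs is sketched here; it is illustrative only, the real CollapsedRangeDelMap handles many more cases and uses the column family's comparator:

#include <cstdint>
#include <iostream>
#include <iterator>
#include <map>
#include <string>

// Transition map: key -> seqno in effect starting at that key (0 = none).
using TransitionMap = std::map<std::string, uint64_t>;

// Seqno of the tombstone covering `key`, or 0 if no tombstone covers it.
uint64_t CoveringSeqno(const TransitionMap& m, const std::string& key) {
  auto it = m.upper_bound(key);      // first transition strictly after key
  if (it == m.begin()) return 0;     // key sorts before every tombstone
  return std::prev(it)->second;      // transition that applies at key
}

int main() {
  // Tombstones [b, d) @ seqno 10 and [c, f) @ seqno 5 collapse to:
  TransitionMap m{{"b", 10}, {"d", 5}, {"f", 0}};
  std::cout << CoveringSeqno(m, "a") << "\n";  // 0  (before all tombstones)
  std::cout << CoveringSeqno(m, "c") << "\n";  // 10
  std::cout << CoveringSeqno(m, "e") << "\n";  // 5
  std::cout << CoveringSeqno(m, "f") << "\n";  // 0  (end keys are exclusive)
}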
+ rep_[t.start_key_] = t.seq_; // operator[] will overwrite existing entry + } } else { // The new tombstone's start point is covered by an existing tombstone: // - // 3: A----- OR 3: C------ - // 2: c--- 2: c------ - // ^ ^ + // 3: A----- OR 3: C------ OR + // 2: c--- 2: c------ 2: C------ + // ^ ^ ^ // Do nothing. } // Look at all the existing transitions that overlap the new tombstone. while (it != rep_.end() && ucmp_->Compare(it->first, t.end_key_) < 0) { - if (t.seq_ > it->second) { + if (t.seq_ >= it->second) { // The transition is to an existing tombstone that the new tombstone // covers. Save the covered tombstone's seqno. We'll need to return to // it if the new tombstone ends before the existing tombstone. @@ -324,15 +342,29 @@ } if (t.seq_ == prev_seq()) { - // The new tombstone is unterminated in the map: - // - // 3: OR 3: --G OR 3: --G K-- - // 2: C-------k 2: G---k 2: G---k - // ^ ^ ^ - // End it now, returning to the last seqno we covered. Because end keys - // are exclusive, if there's an existing transition at t.end_key_, it - // takes precedence over the transition that we install here. - rep_.emplace(t.end_key_, end_seq); // emplace is a noop if existing entry + // The new tombstone is unterminated in the map. + if (it != rep_.end() && t.seq_ == it->second && ucmp_->Compare(it->first, t.end_key_) == 0) { + // The new tombstone ends at the start of another tombstone with an + // identical seqno. Merge the tombstones by removing the existing + // tombstone's start key. + rep_.erase(it); + } else if (end_seq == prev_seq() || (it != rep_.end() && end_seq == it->second)) { + // The new tombstone is implicitly ended because its end point is + // contained within an existing tombstone with the same seqno: + // + // 2: ---k--N + // ^ + } else { + // The new tombstone needs an explicit end point. + // + // 3: OR 3: --G OR 3: --G K-- + // 2: C-------k 2: G---k 2: G---k + // ^ ^ ^ + // Install one that returns to the last seqno we covered. Because end + // keys are exclusive, if there's an existing transition at t.end_key_, + // it takes precedence over the transition that we install here. + rep_.emplace(t.end_key_, end_seq); // emplace is a noop if existing entry + } } else { // The new tombstone is implicitly ended because its end point is covered // by an existing tombstone with a higher seqno. @@ -478,22 +510,22 @@ } } if (largest != nullptr) { - // This is subtly correct despite the discrepancy between - // FileMetaData::largest being inclusive while RangeTombstone::end_key_ - // is exclusive. A tombstone will only extend past the bounds of an - // sstable if its end-key is the largest key in the table. If that - // occurs, the largest key for the table is set based on the smallest - // key in the next table in the level. In that case, largest->user_key() - // is not actually a key in the current table and thus we can use it as - // the exclusive end-key for the tombstone. - if (icmp_.user_comparator()->Compare( - tombstone.end_key_, largest->user_key()) > 0) { - // The largest key should be a tombstone sentinel key. - assert(GetInternalKeySeqno(largest->Encode()) == kMaxSequenceNumber); + // To safely truncate the range tombstone's end key, it must extend past + // the largest key in the sstable (which may have been extended to the + // smallest key in the next sstable), and largest must be a tombstone + // sentinel key. 
A range tombstone may straddle two sstables and not be + // the tombstone sentinel key in the first sstable if a user-key also + // straddles the sstables (possible if there is a snapshot between the + // two versions of the user-key), in which case we cannot truncate the + // range tombstone. + if (icmp_.user_comparator()->Compare(tombstone.end_key_, + largest->user_key()) > 0 && + GetInternalKeySeqno(largest->Encode()) == kMaxSequenceNumber) { tombstone.end_key_ = largest->user_key(); } } - GetRangeDelMap(tombstone.seq_).AddTombstone(std::move(tombstone)); + auto seq = tombstone.seq_; + GetRangeDelMap(seq).AddTombstone(std::move(tombstone)); input->Next(); } if (!first_iter) { diff -Nru rocksdb-5.15.10/db/range_del_aggregator_test.cc rocksdb-5.17.2/db/range_del_aggregator_test.cc --- rocksdb-5.15.10/db/range_del_aggregator_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/range_del_aggregator_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -27,7 +27,7 @@ kReverse, }; -static auto icmp = InternalKeyComparator(BytewiseComparator()); +static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator()); void AddTombstones(RangeDelAggregator* range_del_agg, const std::vector& range_dels, @@ -66,8 +66,8 @@ const std::vector& range_dels_in, const std::vector& expected_points, const std::vector& expected_collapsed_range_dels, - const InternalKey* smallest = nullptr, - const InternalKey* largest = nullptr) { + const InternalKey* smallest = nullptr, const InternalKey* largest = nullptr, + const InternalKeyComparator& icmp = bytewise_icmp) { // Test same result regardless of which order the range deletions are added // and regardless of collapsed mode. for (bool collapsed : {false, true}) { @@ -164,6 +164,14 @@ {{"a", "b", 5}, {"b", "c", 10}, {"c", "d", 5}}); } +TEST_F(RangeDelAggregatorTest, OverlapAboveMiddleReverse) { + VerifyRangeDels({{"d", "a", 5}, {"c", "b", 10}}, + {{"z", 0}, {"d", 5}, {"c", 10}, {"b", 5}, {"a", 0}}, + {{"d", "c", 5}, {"c", "b", 10}, {"b", "a", 5}}, + nullptr /* smallest */, nullptr /* largest */, + InternalKeyComparator(ReverseBytewiseComparator())); +} + TEST_F(RangeDelAggregatorTest, OverlapFully) { VerifyRangeDels({{"a", "d", 10}, {"b", "c", 5}}, {{" ", 0}, {"a", 10}, {"d", 0}}, {{"a", "d", 10}}); @@ -200,6 +208,30 @@ {{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}); } +TEST_F(RangeDelAggregatorTest, IdenticalSameSeqNo) { + VerifyRangeDels({{"a", "b", 5}, {"a", "b", 5}}, + {{" ", 0}, {"a", 5}, {"b", 0}}, + {{"a", "b", 5}}); +} + +TEST_F(RangeDelAggregatorTest, ContiguousSameSeqNo) { + VerifyRangeDels({{"a", "b", 5}, {"b", "c", 5}}, + {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 0}}, + {{"a", "c", 5}}); +} + +TEST_F(RangeDelAggregatorTest, OverlappingSameSeqNo) { + VerifyRangeDels({{"a", "c", 5}, {"b", "d", 5}}, + {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}}, + {{"a", "d", 5}}); +} + +TEST_F(RangeDelAggregatorTest, CoverSameSeqNo) { + VerifyRangeDels({{"a", "d", 5}, {"b", "c", 5}}, + {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}}, + {{"a", "d", 5}}); +} + // Note the Cover* tests also test cases where tombstones are inserted under a // larger one when VerifyRangeDels() runs them in reverse TEST_F(RangeDelAggregatorTest, CoverMultipleFromLeft) { @@ -235,14 +267,14 @@ TEST_F(RangeDelAggregatorTest, MergingIteratorAllEmptyStripes) { for (bool collapsed : {true, false}) { - RangeDelAggregator range_del_agg(icmp, {1, 2}, collapsed); + RangeDelAggregator range_del_agg(bytewise_icmp, {1, 2}, collapsed); VerifyRangeDelIter(range_del_agg.NewIterator().get(), 
{}); } } TEST_F(RangeDelAggregatorTest, MergingIteratorOverlappingStripes) { for (bool collapsed : {true, false}) { - RangeDelAggregator range_del_agg(icmp, {5, 15, 25, 35}, collapsed); + RangeDelAggregator range_del_agg(bytewise_icmp, {5, 15, 25, 35}, collapsed); AddTombstones( &range_del_agg, {{"d", "e", 10}, {"aa", "b", 20}, {"c", "d", 30}, {"a", "b", 10}}); @@ -253,7 +285,8 @@ } TEST_F(RangeDelAggregatorTest, MergingIteratorSeek) { - RangeDelAggregator range_del_agg(icmp, {5, 15}, true /* collapsed */); + RangeDelAggregator range_del_agg(bytewise_icmp, {5, 15}, + true /* collapsed */); AddTombstones(&range_del_agg, {{"a", "c", 10}, {"b", "c", 11}, {"f", "g", 10}, @@ -300,6 +333,21 @@ &smallest, &largest); } +TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateTombstones) { + const InternalKey smallest("b", 1, kTypeRangeDeletion); + const InternalKey largest( + "e", 3, // could happen if "e" is in consecutive sstables + kTypeValue); + VerifyRangeDels( + {{"a", "c", 10}, {"d", "f", 10}}, + {{"a", 10, true}, // truncated + {"b", 10, false}, // not truncated + {"d", 10, false}, // not truncated + {"e", 10, false}}, // not truncated + {{"b", "c", 10}, {"d", "f", 10}}, + &smallest, &largest); +} + } // namespace rocksdb int main(int argc, char** argv) { diff -Nru rocksdb-5.15.10/db/repair.cc rocksdb-5.17.2/db/repair.cc --- rocksdb-5.15.10/db/repair.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/repair.cc 2018-11-12 19:57:32.000000000 +0000 @@ -353,7 +353,7 @@ // propagating bad information (like overly large sequence // numbers). log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter, - true /*enable checksum*/, 0 /*initial_offset*/, log); + true /*enable checksum*/, log); // Initialize per-column family memtables for (auto* cfd : *vset_.GetColumnFamilySet()) { diff -Nru rocksdb-5.15.10/db/repair_test.cc rocksdb-5.17.2/db/repair_test.cc --- rocksdb-5.15.10/db/repair_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/repair_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -74,7 +74,7 @@ Close(); ASSERT_OK(env_->FileExists(manifest_path)); - CreateFile(env_, manifest_path, "blah"); + CreateFile(env_, manifest_path, "blah", false /* use_fsync */); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); Reopen(CurrentOptions()); @@ -153,7 +153,7 @@ Flush(); auto sst_path = GetFirstSstPath(); ASSERT_FALSE(sst_path.empty()); - CreateFile(env_, sst_path, "blah"); + CreateFile(env_, sst_path, "blah", false /* use_fsync */); Close(); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); diff -Nru rocksdb-5.15.10/db/snapshot_checker.h rocksdb-5.17.2/db/snapshot_checker.h --- rocksdb-5.15.10/db/snapshot_checker.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/snapshot_checker.h 2018-11-12 19:57:32.000000000 +0000 @@ -19,8 +19,9 @@ class DisableGCSnapshotChecker : public SnapshotChecker { public: virtual ~DisableGCSnapshotChecker() {} - virtual bool IsInSnapshot(SequenceNumber /*sequence*/, - SequenceNumber /*snapshot_sequence*/) const override { + virtual bool IsInSnapshot( + SequenceNumber /*sequence*/, + SequenceNumber /*snapshot_sequence*/) const override { // By returning false, we prevent all the values from being GCed return false; } diff -Nru rocksdb-5.15.10/db/table_cache.cc rocksdb-5.17.2/db/table_cache.cc --- rocksdb-5.15.10/db/table_cache.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/table_cache.cc 2018-11-12 19:57:32.000000000 +0000 @@ -120,7 +120,7 @@ s = ioptions_.table_factory->NewTableReader( 
TableReaderOptions(ioptions_, prefix_extractor, env_options, internal_comparator, skip_filters, immortal_tables_, - level), + level, fd.largest_seqno), std::move(file_reader), fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); @@ -238,7 +238,7 @@ if (s.ok()) { if (options.table_filter && !options.table_filter(*table_reader->GetTableProperties())) { - result = NewEmptyInternalIterator(arena); + result = NewEmptyInternalIterator(arena); } else { result = table_reader->NewIterator(options, prefix_extractor, arena, skip_filters, for_compaction); @@ -279,7 +279,7 @@ } if (!s.ok()) { assert(result == nullptr); - result = NewErrorInternalIterator(s, arena); + result = NewErrorInternalIterator(s, arena); } return result; } diff -Nru rocksdb-5.15.10/db/table_cache.h rocksdb-5.17.2/db/table_cache.h --- rocksdb-5.15.10/db/table_cache.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/table_cache.h 2018-11-12 19:57:32.000000000 +0000 @@ -31,7 +31,6 @@ struct FileDescriptor; class GetContext; class HistogramImpl; -class InternalIterator; class TableCache { public: diff -Nru rocksdb-5.15.10/db/table_properties_collector_test.cc rocksdb-5.17.2/db/table_properties_collector_test.cc --- rocksdb-5.15.10/db/table_properties_collector_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/table_properties_collector_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -46,7 +46,8 @@ std::unique_ptr* writable, std::unique_ptr* builder) { unique_ptr wf(new test::StringSink); - writable->reset(new WritableFileWriter(std::move(wf), EnvOptions())); + writable->reset( + new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); int unknown_level = -1; builder->reset(NewTableBuilder( ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories, diff -Nru rocksdb-5.15.10/db/transaction_log_impl.cc rocksdb-5.17.2/db/transaction_log_impl.cc --- rocksdb-5.15.10/db/transaction_log_impl.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/transaction_log_impl.cc 2018-11-12 19:57:32.000000000 +0000 @@ -104,7 +104,7 @@ if (files_->size() <= startFileIndex) { return; } - Status s = OpenLogReader(files_->at(startFileIndex).get()); + Status s = OpenLogReader(files_->at(static_cast(startFileIndex)).get()); if (!s.ok()) { currentStatus_ = s; reporter_.Info(currentStatus_.ToString().c_str()); @@ -312,9 +312,9 @@ return s; } assert(file); - currentLogReader_.reset(new log::Reader( - options_->info_log, std::move(file), &reporter_, - read_options_.verify_checksums_, 0, logFile->LogNumber())); + currentLogReader_.reset( + new log::Reader(options_->info_log, std::move(file), &reporter_, + read_options_.verify_checksums_, logFile->LogNumber())); return Status::OK(); } } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/version_builder.cc rocksdb-5.17.2/db/version_builder.cc --- rocksdb-5.15.10/db/version_builder.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_builder.cc 2018-11-12 19:57:32.000000000 +0000 @@ -35,11 +35,11 @@ namespace rocksdb { bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { - if (a->largest_seqno != b->largest_seqno) { - return a->largest_seqno > b->largest_seqno; + if (a->fd.largest_seqno != b->fd.largest_seqno) { + return a->fd.largest_seqno > b->fd.largest_seqno; } - if (a->smallest_seqno != b->smallest_seqno) { - return a->smallest_seqno > b->smallest_seqno; + if (a->fd.smallest_seqno != b->fd.smallest_seqno) { + return a->fd.smallest_seqno > b->fd.smallest_seqno; } 
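A recurring change in this patch is that smallest_seqno and largest_seqno move from FileMetaData onto FileDescriptor, so code such as the comparator being rewritten here reads them through fd. The boundary-update idiom that maintains them (see the UpdateBoundaries changes later in version_edit.h) starts smallest at the maximum possible value and largest at 0, so the first update sets both. A minimal sketch with simplified stand-in types, not the real definitions:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>

struct Fd {
  uint64_t smallest_seqno = std::numeric_limits<uint64_t>::max();
  uint64_t largest_seqno = 0;
};

// Fold one entry's seqno into the file's bounds, as UpdateBoundaries does.
void UpdateSeqnoBounds(Fd& fd, uint64_t seqno) {
  fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
  fd.largest_seqno = std::max(fd.largest_seqno, seqno);
}

int main() {
  Fd fd;
  for (uint64_t s : {7u, 3u, 9u}) UpdateSeqnoBounds(fd, s);
  assert(fd.smallest_seqno == 3 && fd.largest_seqno == 9);
}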
// Break ties by file number return a->fd.GetNumber() > b->fd.GetNumber(); @@ -162,22 +162,24 @@ abort(); } - if (f2->smallest_seqno == f2->largest_seqno) { + if (f2->fd.smallest_seqno == f2->fd.largest_seqno) { // This is an external file that we ingested - SequenceNumber external_file_seqno = f2->smallest_seqno; - if (!(external_file_seqno < f1->largest_seqno || + SequenceNumber external_file_seqno = f2->fd.smallest_seqno; + if (!(external_file_seqno < f1->fd.largest_seqno || external_file_seqno == 0)) { - fprintf(stderr, "L0 file with seqno %" PRIu64 " %" PRIu64 - " vs. file with global_seqno %" PRIu64 "\n", - f1->smallest_seqno, f1->largest_seqno, + fprintf(stderr, + "L0 file with seqno %" PRIu64 " %" PRIu64 + " vs. file with global_seqno %" PRIu64 "\n", + f1->fd.smallest_seqno, f1->fd.largest_seqno, external_file_seqno); abort(); } - } else if (f1->smallest_seqno <= f2->smallest_seqno) { - fprintf(stderr, "L0 files seqno %" PRIu64 " %" PRIu64 - " vs. %" PRIu64 " %" PRIu64 "\n", - f1->smallest_seqno, f1->largest_seqno, f2->smallest_seqno, - f2->largest_seqno); + } else if (f1->fd.smallest_seqno <= f2->fd.smallest_seqno) { + fprintf(stderr, + "L0 files seqno %" PRIu64 " %" PRIu64 " vs. %" PRIu64 + " %" PRIu64 "\n", + f1->fd.smallest_seqno, f1->fd.largest_seqno, + f2->fd.smallest_seqno, f2->fd.largest_seqno); abort(); } } else { @@ -322,8 +324,6 @@ // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. const auto& base_files = base_vstorage_->LevelFiles(level); - auto base_iter = base_files.begin(); - auto base_end = base_files.end(); const auto& unordered_added_files = levels_[level].added_files; vstorage->Reserve(level, base_files.size() + unordered_added_files.size()); @@ -337,30 +337,27 @@ std::sort(added_files.begin(), added_files.end(), cmp); #ifndef NDEBUG - FileMetaData* prev_file = nullptr; -#endif - + FileMetaData* prev_added_file = nullptr; for (const auto& added : added_files) { -#ifndef NDEBUG - if (level > 0 && prev_file != nullptr) { + if (level > 0 && prev_added_file != nullptr) { assert(base_vstorage_->InternalComparator()->Compare( - prev_file->smallest, added->smallest) <= 0); + prev_added_file->smallest, added->smallest) <= 0); } - prev_file = added; + prev_added_file = added; + } #endif - // Add all smaller files listed in base_ - for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); - base_iter != bpos; ++base_iter) { - MaybeAddFile(vstorage, level, *base_iter); + auto base_iter = base_files.begin(); + auto base_end = base_files.end(); + auto added_iter = added_files.begin(); + auto added_end = added_files.end(); + while (added_iter != added_end || base_iter != base_end) { + if (base_iter == base_end || + (added_iter != added_end && cmp(*added_iter, *base_iter))) { + MaybeAddFile(vstorage, level, *added_iter++); + } else { + MaybeAddFile(vstorage, level, *base_iter++); } - - MaybeAddFile(vstorage, level, added); - } - - // Add remaining base files - for (; base_iter != base_end; ++base_iter) { - MaybeAddFile(vstorage, level, *base_iter); } } @@ -382,7 +379,7 @@ } std::atomic next_file_meta_idx(0); - std::function load_handlers_func = [&]() { + std::function load_handlers_func([&]() { while (true) { size_t file_idx = next_file_meta_idx.fetch_add(1); if (file_idx >= files_meta.size()) { @@ -403,7 +400,7 @@ file_meta->table_reader_handle); } } - }; + }); std::vector threads; for (int i = 1; i < max_threads; i++) { diff -Nru rocksdb-5.15.10/db/version_builder_test.cc 
rocksdb-5.17.2/db/version_builder_test.cc --- rocksdb-5.15.10/db/version_builder_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_builder_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -63,8 +63,8 @@ f->fd = FileDescriptor(file_number, path_id, file_size); f->smallest = GetInternalKey(smallest, smallest_seq); f->largest = GetInternalKey(largest, largest_seq); - f->smallest_seqno = smallest_seqno; - f->largest_seqno = largest_seqno; + f->fd.smallest_seqno = smallest_seqno; + f->fd.largest_seqno = largest_seqno; f->compensated_file_size = file_size; f->refs = 0; f->num_entries = num_entries; diff -Nru rocksdb-5.15.10/db/version_edit.cc rocksdb-5.17.2/db/version_edit.cc --- rocksdb-5.15.10/db/version_edit.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_edit.cc 2018-11-12 19:57:32.000000000 +0000 @@ -40,13 +40,15 @@ kColumnFamilyAdd = 201, kColumnFamilyDrop = 202, kMaxColumnFamily = 203, + + kInAtomicGroup = 300, }; enum CustomTag : uint32_t { kTerminate = 1, // The end of customized fields kNeedCompaction = 2, // Since Manifest is not entirely currently forward-compatible, and the only - // forward-compatbile part is the CutsomtTag of kNewFile, we currently encode + // forward-compatible part is the CutsomtTag of kNewFile, we currently encode // kMinLogNumberToKeep as part of a CustomTag as a hack. This should be // removed when manifest becomes forward-comptabile. kMinLogNumberToKeepHack = 3, @@ -83,6 +85,8 @@ is_column_family_add_ = 0; is_column_family_drop_ = 0; column_family_name_.clear(); + is_in_atomic_group_ = false; + remaining_entries_ = 0; } bool VersionEdit::EncodeTo(std::string* dst) const { @@ -135,7 +139,7 @@ PutVarint64(dst, f.fd.GetFileSize()); PutLengthPrefixedSlice(dst, f.smallest.Encode()); PutLengthPrefixedSlice(dst, f.largest.Encode()); - PutVarint64Varint64(dst, f.smallest_seqno, f.largest_seqno); + PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno); if (has_customized_fields) { // Customized fields' format: // +-----------------------------+ @@ -200,6 +204,11 @@ if (is_column_family_drop_) { PutVarint32(dst, kColumnFamilyDrop); } + + if (is_in_atomic_group_) { + PutVarint32(dst, kInAtomicGroup); + PutVarint32(dst, remaining_entries_); + } return true; } @@ -233,14 +242,16 @@ uint64_t number; uint32_t path_id = 0; uint64_t file_size; + SequenceNumber smallest_seqno; + SequenceNumber largest_seqno; // Since this is the only forward-compatible part of the code, we hack new // extension into this record. When we do, we set this boolean to distinguish // the record from the normal NewFile records. if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) && GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) && GetInternalKey(input, &f.largest) && - GetVarint64(input, &f.smallest_seqno) && - GetVarint64(input, &f.largest_seqno)) { + GetVarint64(input, &smallest_seqno) && + GetVarint64(input, &largest_seqno)) { // See comments in VersionEdit::EncodeTo() for format of customized fields while (true) { uint32_t custom_tag; @@ -272,7 +283,7 @@ break; case kMinLogNumberToKeepHack: // This is a hack to encode kMinLogNumberToKeep in a - // forward-compatbile fashion. + // forward-compatible fashion. 
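The kInAtomicGroup record added above in EncodeTo is written with the same varint scheme as the other tags: PutVarint32 of the tag, then of remaining_entries_. For readers unfamiliar with that encoding, here is a self-contained LEB128-style varint32 writer and reader in the spirit of util/coding.h; it is an illustrative sketch, not the RocksDB implementation:

#include <cassert>
#include <cstdint>
#include <string>

// Append v as a base-128 varint: 7 payload bits per byte, high bit set on
// every byte except the last.
void PutVarint32(std::string* dst, uint32_t v) {
  while (v >= 0x80) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

// Decode a varint32 starting at *pos; advances *pos past the encoded bytes.
bool GetVarint32(const std::string& src, size_t* pos, uint32_t* out) {
  uint32_t result = 0;
  for (int shift = 0; shift <= 28 && *pos < src.size(); shift += 7) {
    uint32_t byte = static_cast<unsigned char>(src[(*pos)++]);
    result |= (byte & 0x7f) << shift;
    if ((byte & 0x80) == 0) {
      *out = result;
      return true;
    }
  }
  return false;  // truncated or malformed
}

int main() {
  const uint32_t kInAtomicGroup = 300;  // tag value introduced by this patch
  std::string record;
  PutVarint32(&record, kInAtomicGroup);
  PutVarint32(&record, 3 /* remaining entries */);

  size_t pos = 0;
  uint32_t tag = 0, remaining = 0;
  assert(GetVarint32(record, &pos, &tag) && tag == kInAtomicGroup);
  assert(GetVarint32(record, &pos, &remaining) && remaining == 3);
}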
if (!GetFixed64(&field, &min_log_number_to_keep_)) { return "deleted log number malformatted"; } @@ -289,7 +300,8 @@ } else { return "new-file4 entry"; } - f.fd = FileDescriptor(number, path_id, file_size); + f.fd = + FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno); new_files_.push_back(std::make_pair(level, f)); return nullptr; } @@ -409,13 +421,16 @@ case kNewFile2: { uint64_t number; uint64_t file_size; + SequenceNumber smallest_seqno; + SequenceNumber largest_seqno; if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && GetVarint64(&input, &file_size) && GetInternalKey(&input, &f.smallest) && GetInternalKey(&input, &f.largest) && - GetVarint64(&input, &f.smallest_seqno) && - GetVarint64(&input, &f.largest_seqno)) { - f.fd = FileDescriptor(number, 0, file_size); + GetVarint64(&input, &smallest_seqno) && + GetVarint64(&input, &largest_seqno)) { + f.fd = FileDescriptor(number, 0, file_size, smallest_seqno, + largest_seqno); new_files_.push_back(std::make_pair(level, f)); } else { if (!msg) { @@ -429,13 +444,16 @@ uint64_t number; uint32_t path_id; uint64_t file_size; + SequenceNumber smallest_seqno; + SequenceNumber largest_seqno; if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) && GetInternalKey(&input, &f.smallest) && GetInternalKey(&input, &f.largest) && - GetVarint64(&input, &f.smallest_seqno) && - GetVarint64(&input, &f.largest_seqno)) { - f.fd = FileDescriptor(number, path_id, file_size); + GetVarint64(&input, &smallest_seqno) && + GetVarint64(&input, &largest_seqno)) { + f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno, + largest_seqno); new_files_.push_back(std::make_pair(level, f)); } else { if (!msg) { @@ -473,6 +491,15 @@ is_column_family_drop_ = true; break; + case kInAtomicGroup: + is_in_atomic_group_ = true; + if (!GetVarint32(&input, &remaining_entries_)) { + if (!msg) { + msg = "remaining entries"; + } + } + break; + default: msg = "unknown tag"; break; @@ -551,6 +578,11 @@ r.append("\n MaxColumnFamily: "); AppendNumberTo(&r, max_column_family_); } + if (is_in_atomic_group_) { + r.append("\n AtomicGroup: "); + AppendNumberTo(&r, remaining_entries_); + r.append(" entries remains"); + } r.append("\n}\n"); return r; } @@ -623,6 +655,9 @@ if (has_min_log_number_to_keep_) { jw << "MinLogNumberToKeep" << min_log_number_to_keep_; } + if (is_in_atomic_group_) { + jw << "AtomicGroup" << remaining_entries_; + } jw.EndObject(); diff -Nru rocksdb-5.15.10/db/version_edit.h rocksdb-5.17.2/db/version_edit.h --- rocksdb-5.15.10/db/version_edit.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_edit.h 2018-11-12 19:57:32.000000000 +0000 @@ -27,7 +27,7 @@ extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); // A copyable structure contains information needed to read data from an SST -// file. It can contains a pointer to a table reader opened for the file, or +// file. It can contain a pointer to a table reader opened for the file, or // file number and size, which can be used to create a new table reader for it. // The behavior is undefined when a copied of the structure is used when the // file is not in any live version any more. 
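Putting the atomic-group pieces together: each edit in a group records how many group members are still to come, and the recovery path (see the Recover() changes later in this diff) buffers edits until the count reaches zero, rejecting groups whose counts do not line up. A simplified model of that buffering, with a placeholder Edit type standing in for VersionEdit:

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Placeholder for VersionEdit: only the atomic-group bookkeeping fields.
struct Edit {
  bool in_atomic_group = false;
  uint32_t remaining_entries = 0;  // entries still expected after this one
  std::string payload;
};

// Collects edits until a whole atomic group has arrived, then applies them
// together. Returns false on an inconsistent group, which mirrors the
// Corruption status returned by Recover().
class GroupReplayer {
 public:
  template <typename ApplyFn>
  bool Feed(Edit edit, const ApplyFn& apply) {
    if (!edit.in_atomic_group) {
      if (!buffer_.empty()) return false;      // group interrupted
      apply(edit);
      return true;
    }
    if (buffer_.empty()) {
      expected_ = edit.remaining_entries + 1;  // first member fixes the size
    }
    buffer_.push_back(std::move(edit));
    if (buffer_.size() + buffer_.back().remaining_entries != expected_) {
      return false;                            // counts do not line up
    }
    if (buffer_.size() == expected_) {         // group complete: apply all
      for (auto& e : buffer_) apply(e);
      buffer_.clear();
    }
    return true;
  }

 private:
  std::vector<Edit> buffer_;
  uint32_t expected_ = 0;
};

int main() {
  GroupReplayer replayer;
  int applied = 0;
  auto apply = [&](const Edit&) { ++applied; };

  // A two-member group: remaining_entries counts down 1, 0.
  assert(replayer.Feed({true, 1, "cf1 edit"}, apply));
  assert(applied == 0);                        // buffered, not yet applied
  assert(replayer.Feed({true, 0, "cf2 edit"}, apply));
  assert(applied == 2);                        // applied together
}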
@@ -36,18 +36,28 @@ TableReader* table_reader; uint64_t packed_number_and_path_id; uint64_t file_size; // File size in bytes + SequenceNumber smallest_seqno; // The smallest seqno in this file + SequenceNumber largest_seqno; // The largest seqno in this file FileDescriptor() : FileDescriptor(0, 0, 0) {} FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size) + : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {} + + FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size, + SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno) : table_reader(nullptr), packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)), - file_size(_file_size) {} + file_size(_file_size), + smallest_seqno(_smallest_seqno), + largest_seqno(_largest_seqno) {} FileDescriptor& operator=(const FileDescriptor& fd) { table_reader = fd.table_reader; packed_number_and_path_id = fd.packed_number_and_path_id; file_size = fd.file_size; + smallest_seqno = fd.smallest_seqno; + largest_seqno = fd.largest_seqno; return *this; } @@ -77,8 +87,6 @@ FileDescriptor fd; InternalKey smallest; // Smallest internal key served by table InternalKey largest; // Largest internal key served by table - SequenceNumber smallest_seqno; // The smallest seqno in this file - SequenceNumber largest_seqno; // The largest seqno in this file // Needs to be disposed when refs becomes 0. Cache::Handle* table_reader_handle; @@ -108,9 +116,7 @@ // file. FileMetaData() - : smallest_seqno(kMaxSequenceNumber), - largest_seqno(0), - table_reader_handle(nullptr), + : table_reader_handle(nullptr), compensated_file_size(0), num_entries(0), num_deletions(0), @@ -128,8 +134,8 @@ smallest.DecodeFrom(key); } largest.DecodeFrom(key); - smallest_seqno = std::min(smallest_seqno, seqno); - largest_seqno = std::max(largest_seqno, seqno); + fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); + fd.largest_seqno = std::max(fd.largest_seqno, seqno); } // Unlike UpdateBoundaries, ranges do not need to be presented in any @@ -143,8 +149,8 @@ if (largest.size() == 0 || icmp.Compare(largest, end) < 0) { largest = end; } - smallest_seqno = std::min(smallest_seqno, seqno); - largest_seqno = std::max(largest_seqno, seqno); + fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); + fd.largest_seqno = std::max(fd.largest_seqno, seqno); } }; @@ -233,17 +239,18 @@ bool marked_for_compaction) { assert(smallest_seqno <= largest_seqno); FileMetaData f; - f.fd = FileDescriptor(file, file_path_id, file_size); + f.fd = FileDescriptor(file, file_path_id, file_size, smallest_seqno, + largest_seqno); f.smallest = smallest; f.largest = largest; - f.smallest_seqno = smallest_seqno; - f.largest_seqno = largest_seqno; + f.fd.smallest_seqno = smallest_seqno; + f.fd.largest_seqno = largest_seqno; f.marked_for_compaction = marked_for_compaction; new_files_.emplace_back(level, std::move(f)); } void AddFile(int level, const FileMetaData& f) { - assert(f.smallest_seqno <= f.largest_seqno); + assert(f.fd.smallest_seqno <= f.fd.largest_seqno); new_files_.emplace_back(level, f); } @@ -293,6 +300,11 @@ return new_files_; } + void MarkAtomicGroup(uint32_t remaining_entries) { + is_in_atomic_group_ = true; + remaining_entries_ = remaining_entries; + } + std::string DebugString(bool hex_key = false) const; std::string DebugJSON(int edit_num, bool hex_key = false) const; @@ -322,7 +334,7 @@ DeletedFileSet deleted_files_; std::vector> new_files_; - // Each version edit record should have column_family_id set + // Each version edit record should have 
column_family_ set // If it's not set, it is default (0) uint32_t column_family_; // a version edit can be either column_family add or @@ -331,6 +343,9 @@ bool is_column_family_drop_; bool is_column_family_add_; std::string column_family_name_; + + bool is_in_atomic_group_; + uint32_t remaining_entries_; }; } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/version_edit_test.cc rocksdb-5.17.2/db/version_edit_test.cc --- rocksdb-5.15.10/db/version_edit_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_edit_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -191,6 +191,12 @@ TestEncodeDecode(edit); } +TEST_F(VersionEditTest, AtomicGroupTest) { + VersionEdit edit; + edit.MarkAtomicGroup(1); + TestEncodeDecode(edit); +} + } // namespace rocksdb int main(int argc, char** argv) { diff -Nru rocksdb-5.15.10/db/version_set.cc rocksdb-5.17.2/db/version_set.cc --- rocksdb-5.15.10/db/version_set.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_set.cc 2018-11-12 19:57:32.000000000 +0000 @@ -21,6 +21,7 @@ #include #include #include +#include #include "db/compaction.h" #include "db/internal_stats.h" #include "db/log_reader.h" @@ -62,20 +63,12 @@ const Slice& key, uint32_t left, uint32_t right) { - while (left < right) { - uint32_t mid = (left + right) / 2; - const FdWithKeyRange& f = file_level.files[mid]; - if (icmp.InternalKeyComparator::Compare(f.largest_key, key) < 0) { - // Key at "mid.largest" is < "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. - right = mid; - } - } - return right; + auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { + return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0; + }; + const auto &b = file_level.files; + return static_cast(std::lower_bound(b + left, + b + right, key, cmp) - b); } Status OverlapWithIterator(const Comparator* ucmp, @@ -895,13 +888,16 @@ assert(!ioptions->cf_paths.empty()); file_path = ioptions->cf_paths.back().path; } - files.emplace_back( - MakeTableFileName("", file->fd.GetNumber()), file_path, - file->fd.GetFileSize(), file->smallest_seqno, file->largest_seqno, + files.emplace_back(SstFileMetaData{ + MakeTableFileName("", file->fd.GetNumber()), + file_path, + static_cast(file->fd.GetFileSize()), + file->fd.smallest_seqno, + file->fd.largest_seqno, file->smallest.user_key().ToString(), file->largest.user_key().ToString(), file->stats.num_reads_sampled.load(std::memory_order_relaxed), - file->being_compacted); + file->being_compacted}); level_size += file->fd.GetFileSize(); } cf_meta->levels.emplace_back( @@ -1212,12 +1208,9 @@ // report the counters before returning if (get_context.State() != GetContext::kNotFound && - get_context.State() != GetContext::kMerge) { - for (uint32_t t = 0; t < Tickers::TICKER_ENUM_MAX; t++) { - if (get_context.tickers_value[t] > 0) { - RecordTick(db_statistics_, t, get_context.tickers_value[t]); - } - } + get_context.State() != GetContext::kMerge && + db_statistics_ != nullptr) { + get_context.ReportCounters(); } switch (get_context.State()) { case GetContext::kNotFound: @@ -1251,10 +1244,8 @@ f = fp.GetNextFile(); } - for (uint32_t t = 0; t < Tickers::TICKER_ENUM_MAX; t++) { - if (get_context.tickers_value[t] > 0) { - RecordTick(db_statistics_, t, get_context.tickers_value[t]); - } + if (db_statistics_ != nullptr) { + get_context.ReportCounters(); } if (GetContext::kMerge == get_context.State()) { if 
(!merge_operator_) { @@ -1896,13 +1887,15 @@ case kOldestLargestSeqFirst: std::sort(temp.begin(), temp.end(), [](const Fsize& f1, const Fsize& f2) -> bool { - return f1.file->largest_seqno < f2.file->largest_seqno; + return f1.file->fd.largest_seqno < + f2.file->fd.largest_seqno; }); break; case kOldestSmallestSeqFirst: std::sort(temp.begin(), temp.end(), [](const Fsize& f1, const Fsize& f2) -> bool { - return f1.file->smallest_seqno < f2.file->smallest_seqno; + return f1.file->fd.smallest_seqno < + f2.file->fd.smallest_seqno; }); break; case kMinOverlappingRatio: @@ -1986,17 +1979,17 @@ bottommost_files_mark_threshold_ = kMaxSequenceNumber; for (auto& level_and_file : bottommost_files_) { if (!level_and_file.second->being_compacted && - level_and_file.second->largest_seqno != 0 && + level_and_file.second->fd.largest_seqno != 0 && level_and_file.second->num_deletions > 1) { // largest_seqno might be nonzero due to containing the final key in an // earlier compaction, whose seqnum we didn't zero out. Multiple deletions // ensures the file really contains deleted or overwritten keys. - if (level_and_file.second->largest_seqno < oldest_snapshot_seqnum_) { + if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) { bottommost_files_marked_for_compaction_.push_back(level_and_file); } else { bottommost_files_mark_threshold_ = std::min(bottommost_files_mark_threshold_, - level_and_file.second->largest_seqno); + level_and_file.second->fd.largest_seqno); } } } @@ -2035,57 +2028,82 @@ void VersionStorageInfo::GetOverlappingInputs( int level, const InternalKey* begin, const InternalKey* end, std::vector* inputs, int hint_index, int* file_index, - bool expand_range) const { + bool expand_range, InternalKey** next_smallest) const { if (level >= num_non_empty_levels_) { // this level is empty, no overlapping inputs return; } inputs->clear(); - Slice user_begin, user_end; - if (begin != nullptr) { - user_begin = begin->user_key(); - } - if (end != nullptr) { - user_end = end->user_key(); - } if (file_index) { *file_index = -1; } const Comparator* user_cmp = user_comparator_; if (level > 0) { - GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, - hint_index, file_index); + GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index, + file_index, false, next_smallest); return; } - for (size_t i = 0; i < level_files_brief_[level].num_files; ) { - FdWithKeyRange* f = &(level_files_brief_[level].files[i++]); - const Slice file_start = ExtractUserKey(f->smallest_key); - const Slice file_limit = ExtractUserKey(f->largest_key); - if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { - // "f" is completely before specified range; skip it - } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) { - // "f" is completely after specified range; skip it - } else { - inputs->push_back(files_[level][i-1]); - if (level == 0 && expand_range) { - // Level-0 files may overlap each other. So check if the newly - // added file has expanded the range. If so, restart search. 
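For level 0 the files may overlap one another, so the rewritten loop that follows keeps a list of not-yet-matched file indices and repeatedly sweeps it, growing [user_begin, user_end] whenever a newly matched file extends the range and stopping once a full sweep finds nothing new; matched indices are erased so they are never rechecked. A small standalone model of that fixed-point expansion over integer ranges (assumed types, not the RocksDB code):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <list>
#include <utility>
#include <vector>

using Range = std::pair<int, int>;  // [start, limit], both inclusive

// Collect every range that overlaps [begin, end] directly or through a chain
// of other collected ranges, expanding the query as matches are found.
std::vector<size_t> ExpandingOverlap(const std::vector<Range>& files,
                                     int begin, int end) {
  std::vector<size_t> inputs;
  std::list<size_t> index;
  for (size_t i = 0; i < files.size(); ++i) index.push_back(i);

  bool found = true;
  while (found && !index.empty()) {
    found = false;
    for (auto it = index.begin(); it != index.end();) {
      const Range& f = files[*it];
      if (f.second < begin || f.first > end) {
        ++it;                       // completely outside the current range
      } else {
        inputs.push_back(*it);      // overlaps: take it and grow the range
        begin = std::min(begin, f.first);
        end = std::max(end, f.second);
        it = index.erase(it);       // never re-examine a matched file
        found = true;
      }
    }
  }
  return inputs;
}

int main() {
  // File 2 only overlaps once file 0 has widened the range to include 25.
  std::vector<Range> files = {{10, 30}, {50, 60}, {25, 40}};
  auto picked = ExpandingOverlap(files, 5, 12);
  assert(picked.size() == 2);       // files 0 and 2; file 1 stays out
}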
- if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) { - user_begin = file_start; - inputs->clear(); - i = 0; - } else if (end != nullptr - && user_cmp->Compare(file_limit, user_end) > 0) { - user_end = file_limit; - inputs->clear(); - i = 0; + if (next_smallest) { + // next_smallest key only makes sense for non-level 0, where files are + // non-overlapping + *next_smallest = nullptr; + } + + Slice user_begin, user_end; + if (begin != nullptr) { + user_begin = begin->user_key(); + } + if (end != nullptr) { + user_end = end->user_key(); + } + + // index stores the file index need to check. + std::list index; + for (size_t i = 0; i < level_files_brief_[level].num_files; i++) { + index.emplace_back(i); + } + + while (!index.empty()) { + bool found_overlapping_file = false; + auto iter = index.begin(); + while (iter != index.end()) { + FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]); + const Slice file_start = ExtractUserKey(f->smallest_key); + const Slice file_limit = ExtractUserKey(f->largest_key); + if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { + // "f" is completely before specified range; skip it + iter++; + } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) { + // "f" is completely after specified range; skip it + iter++; + } else { + // if overlap + inputs->emplace_back(files_[level][*iter]); + found_overlapping_file = true; + // record the first file index. + if (file_index && *file_index == -1) { + *file_index = static_cast(*iter); + } + // the related file is overlap, erase to avoid checking again. + iter = index.erase(iter); + if (expand_range) { + if (begin != nullptr && + user_cmp->Compare(file_start, user_begin) < 0) { + user_begin = file_start; + } + if (end != nullptr && + user_cmp->Compare(file_limit, user_end) > 0) { + user_end = file_limit; + } } - } else if (file_index) { - *file_index = static_cast(i) - 1; } } + // if all the files left are not overlap, break + if (!found_overlapping_file) { + break; + } } } @@ -2186,7 +2204,7 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( int level, const InternalKey* begin, const InternalKey* end, std::vector* inputs, int hint_index, int* file_index, - bool within_interval) const { + bool within_interval, InternalKey** next_smallest) const { assert(level > 0); int min = 0; int mid = 0; @@ -2222,6 +2240,9 @@ // If there were no overlapping files, return immediately. 
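A related simplification appears earlier in this file's diff: FindFile's hand-written binary search over files ordered by largest key becomes a std::lower_bound with a comparator. A minimal analogue of that pattern over plain data, with hypothetical names; the real version compares internal keys through the InternalKeyComparator:

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

struct FileEntry {
  std::string largest_key;  // files are sorted by this field
};

// Index of the first file whose largest key is >= key, i.e. the first file
// that could contain it; returns files.size() if there is none.
size_t FindFile(const std::vector<FileEntry>& files, const std::string& key) {
  auto cmp = [](const FileEntry& f, const std::string& k) {
    return f.largest_key < k;
  };
  return static_cast<size_t>(
      std::lower_bound(files.begin(), files.end(), key, cmp) - files.begin());
}

int main() {
  std::vector<FileEntry> files = {{"c"}, {"g"}, {"m"}};
  assert(FindFile(files, "a") == 0);
  assert(FindFile(files, "h") == 2);
  assert(FindFile(files, "z") == 3);
}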
if (!foundOverlap) { + if (next_smallest) { + next_smallest = nullptr; + } return; } // returns the index where an overlap is found @@ -2242,6 +2263,15 @@ for (int i = start_index; i <= end_index; i++) { inputs->push_back(files_[level][i]); } + + if (next_smallest != nullptr) { + // Provide the next key outside the range covered by inputs + if (++end_index < static_cast(files_[level].size())) { + **next_smallest = files_[level][end_index]->smallest; + } else { + *next_smallest = nullptr; + } + } } // Store in *start_index and *end_index the range of all files in @@ -2422,7 +2452,7 @@ AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt)); int ret = snprintf(scratch->buffer + len, sz, "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ", - f->fd.GetNumber(), f->smallest_seqno, sztxt, + f->fd.GetNumber(), f->fd.smallest_seqno, sztxt, static_cast(f->being_compacted)); if (ret < 0 || ret >= sz) break; @@ -2904,16 +2934,17 @@ // create new manifest file ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_); + std::string descriptor_fname = + DescriptorFileName(dbname_, pending_manifest_file_number_); unique_ptr descriptor_file; - s = NewWritableFile( - env_, DescriptorFileName(dbname_, pending_manifest_file_number_), - &descriptor_file, opt_env_opts); + s = NewWritableFile(env_, descriptor_fname, &descriptor_file, + opt_env_opts); if (s.ok()) { descriptor_file->SetPreallocationBlockSize( db_options_->manifest_preallocation_size); - unique_ptr file_writer( - new WritableFileWriter(std::move(descriptor_file), opt_env_opts)); + unique_ptr file_writer(new WritableFileWriter( + std::move(descriptor_file), descriptor_fname, opt_env_opts)); descriptor_log_.reset( new log::Writer(std::move(file_writer), 0, false)); s = WriteSnapshot(descriptor_log_.get()); @@ -3211,6 +3242,133 @@ builder->Apply(edit); } +Status VersionSet::ApplyOneVersionEdit( + VersionEdit& edit, + const std::unordered_map& name_to_options, + std::unordered_map& column_families_not_found, + std::unordered_map& builders, + bool* have_log_number, uint64_t* /* log_number */, + bool* have_prev_log_number, uint64_t* previous_log_number, + bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, + SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, + uint32_t* max_column_family) { + // Not found means that user didn't supply that column + // family option AND we encountered column family add + // record. Once we encounter column family drop record, + // we will delete the column family from + // column_families_not_found. 
+ bool cf_in_not_found = (column_families_not_found.find(edit.column_family_) != + column_families_not_found.end()); + // in builders means that user supplied that column family + // option AND that we encountered column family add record + bool cf_in_builders = builders.find(edit.column_family_) != builders.end(); + + // they can't both be true + assert(!(cf_in_not_found && cf_in_builders)); + + ColumnFamilyData* cfd = nullptr; + + if (edit.is_column_family_add_) { + if (cf_in_builders || cf_in_not_found) { + return Status::Corruption( + "Manifest adding the same column family twice: " + + edit.column_family_name_); + } + auto cf_options = name_to_options.find(edit.column_family_name_); + if (cf_options == name_to_options.end()) { + column_families_not_found.insert( + {edit.column_family_, edit.column_family_name_}); + } else { + cfd = CreateColumnFamily(cf_options->second, &edit); + cfd->set_initialized(); + builders.insert( + {edit.column_family_, new BaseReferencedVersionBuilder(cfd)}); + } + } else if (edit.is_column_family_drop_) { + if (cf_in_builders) { + auto builder = builders.find(edit.column_family_); + assert(builder != builders.end()); + delete builder->second; + builders.erase(builder); + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + assert(cfd != nullptr); + if (cfd->Unref()) { + delete cfd; + cfd = nullptr; + } else { + // who else can have reference to cfd!? + assert(false); + } + } else if (cf_in_not_found) { + column_families_not_found.erase(edit.column_family_); + } else { + return Status::Corruption( + "Manifest - dropping non-existing column family"); + } + } else if (!cf_in_not_found) { + if (!cf_in_builders) { + return Status::Corruption( + "Manifest record referencing unknown column family"); + } + + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // this should never happen since cf_in_builders is true + assert(cfd != nullptr); + + // if it is not column family add or column family drop, + // then it's a file add/delete, which should be forwarded + // to builder + auto builder = builders.find(edit.column_family_); + assert(builder != builders.end()); + builder->second->version_builder()->Apply(&edit); + } + + if (cfd != nullptr) { + if (edit.has_log_number_) { + if (cfd->GetLogNumber() > edit.log_number_) { + ROCKS_LOG_WARN( + db_options_->info_log, + "MANIFEST corruption detected, but ignored - Log numbers in " + "records NOT monotonically increasing"); + } else { + cfd->SetLogNumber(edit.log_number_); + *have_log_number = true; + } + } + if (edit.has_comparator_ && + edit.comparator_ != cfd->user_comparator()->Name()) { + return Status::InvalidArgument( + cfd->user_comparator()->Name(), + "does not match existing comparator " + edit.comparator_); + } + } + + if (edit.has_prev_log_number_) { + *previous_log_number = edit.prev_log_number_; + *have_prev_log_number = true; + } + + if (edit.has_next_file_number_) { + *next_file = edit.next_file_number_; + *have_next_file = true; + } + + if (edit.has_max_column_family_) { + *max_column_family = edit.max_column_family_; + } + + if (edit.has_min_log_number_to_keep_) { + *min_log_number_to_keep = + std::max(*min_log_number_to_keep, edit.min_log_number_to_keep_); + } + + if (edit.has_last_sequence_) { + *last_sequence = edit.last_sequence_; + *have_last_sequence = true; + } + return Status::OK(); +} + Status VersionSet::Recover( const std::vector& column_families, bool read_only) { @@ -3296,9 +3454,11 @@ VersionSet::LogReporter reporter; reporter.status = &s; log::Reader 
reader(nullptr, std::move(manifest_file_reader), &reporter, - true /*checksum*/, 0 /*initial_offset*/, 0); + true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; + std::vector replay_buffer; + size_t num_entries_decoded = 0; while (reader.ReadRecord(&record, &scratch) && s.ok()) { VersionEdit edit; s = edit.DecodeFrom(record); @@ -3306,123 +3466,44 @@ break; } - // Not found means that user didn't supply that column - // family option AND we encountered column family add - // record. Once we encounter column family drop record, - // we will delete the column family from - // column_families_not_found. - bool cf_in_not_found = - column_families_not_found.find(edit.column_family_) != - column_families_not_found.end(); - // in builders means that user supplied that column family - // option AND that we encountered column family add record - bool cf_in_builders = - builders.find(edit.column_family_) != builders.end(); - - // they can't both be true - assert(!(cf_in_not_found && cf_in_builders)); - - ColumnFamilyData* cfd = nullptr; - - if (edit.is_column_family_add_) { - if (cf_in_builders || cf_in_not_found) { - s = Status::Corruption( - "Manifest adding the same column family twice"); - break; - } - auto cf_options = cf_name_to_options.find(edit.column_family_name_); - if (cf_options == cf_name_to_options.end()) { - column_families_not_found.insert( - {edit.column_family_, edit.column_family_name_}); - } else { - cfd = CreateColumnFamily(cf_options->second, &edit); - cfd->set_initialized(); - builders.insert( - {edit.column_family_, new BaseReferencedVersionBuilder(cfd)}); - } - } else if (edit.is_column_family_drop_) { - if (cf_in_builders) { - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - delete builder->second; - builders.erase(builder); - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - if (cfd->Unref()) { - delete cfd; - cfd = nullptr; - } else { - // who else can have reference to cfd!? 
- assert(false); - } - } else if (cf_in_not_found) { - column_families_not_found.erase(edit.column_family_); - } else { - s = Status::Corruption( - "Manifest - dropping non-existing column family"); - break; - } - } else if (!cf_in_not_found) { - if (!cf_in_builders) { - s = Status::Corruption( - "Manifest record referencing unknown column family"); - break; - } - - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // this should never happen since cf_in_builders is true - assert(cfd != nullptr); - - // if it is not column family add or column family drop, - // then it's a file add/delete, which should be forwarded - // to builder - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - builder->second->version_builder()->Apply(&edit); - } - - if (cfd != nullptr) { - if (edit.has_log_number_) { - if (cfd->GetLogNumber() > edit.log_number_) { - ROCKS_LOG_WARN( - db_options_->info_log, - "MANIFEST corruption detected, but ignored - Log numbers in " - "records NOT monotonically increasing"); - } else { - cfd->SetLogNumber(edit.log_number_); - have_log_number = true; + if (edit.is_in_atomic_group_) { + if (replay_buffer.empty()) { + replay_buffer.resize(edit.remaining_entries_ + 1); + } + ++num_entries_decoded; + if (num_entries_decoded + edit.remaining_entries_ != + static_cast(replay_buffer.size())) { + return Status::Corruption("corrupted atomic group"); + } + replay_buffer[num_entries_decoded - 1] = std::move(edit); + if (num_entries_decoded == replay_buffer.size()) { + for (auto& e : replay_buffer) { + s = ApplyOneVersionEdit( + e, cf_name_to_options, column_families_not_found, builders, + &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, + &have_last_sequence, &last_sequence, &min_log_number_to_keep, + &max_column_family); + if (!s.ok()) { + break; + } } + replay_buffer.clear(); + num_entries_decoded = 0; } - if (edit.has_comparator_ && - edit.comparator_ != cfd->user_comparator()->Name()) { - s = Status::InvalidArgument( - cfd->user_comparator()->Name(), - "does not match existing comparator " + edit.comparator_); - break; + } else { + if (!replay_buffer.empty()) { + return Status::Corruption("corrupted atomic group"); } + s = ApplyOneVersionEdit( + edit, cf_name_to_options, column_families_not_found, builders, + &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, + &have_last_sequence, &last_sequence, &min_log_number_to_keep, + &max_column_family); } - - if (edit.has_prev_log_number_) { - previous_log_number = edit.prev_log_number_; - have_prev_log_number = true; - } - - if (edit.has_next_file_number_) { - next_file = edit.next_file_number_; - have_next_file = true; - } - - if (edit.has_max_column_family_) { - max_column_family = edit.max_column_family_; - } - - if (edit.has_min_log_number_to_keep_) { - min_log_number_to_keep = - std::max(min_log_number_to_keep, edit.min_log_number_to_keep_); - } - - if (edit.has_last_sequence_) { - last_sequence = edit.last_sequence_; - have_last_sequence = true; + if (!s.ok()) { + break; } } } @@ -3578,8 +3659,8 @@ column_family_names.insert({0, kDefaultColumnFamilyName}); VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(nullptr, std::move(file_reader), &reporter, true /*checksum*/, - 0 /*initial_offset*/, 0); + log::Reader reader(nullptr, std::move(file_reader), &reporter, + true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; while 
(reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -3739,7 +3820,7 @@ VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, - true /*checksum*/, 0 /*initial_offset*/, 0); + true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -3968,7 +4049,7 @@ cfd->current()->storage_info()->LevelFiles(level)) { edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno, + f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction); } } @@ -4290,11 +4371,11 @@ } filemetadata.name = MakeTableFileName("", file->fd.GetNumber()); filemetadata.level = level; - filemetadata.size = file->fd.GetFileSize(); + filemetadata.size = static_cast(file->fd.GetFileSize()); filemetadata.smallestkey = file->smallest.user_key().ToString(); filemetadata.largestkey = file->largest.user_key().ToString(); - filemetadata.smallest_seqno = file->smallest_seqno; - filemetadata.largest_seqno = file->largest_seqno; + filemetadata.smallest_seqno = file->fd.smallest_seqno; + filemetadata.largest_seqno = file->fd.largest_seqno; metadata->push_back(filemetadata); } } diff -Nru rocksdb-5.15.10/db/version_set.h rocksdb-5.17.2/db/version_set.h --- rocksdb-5.15.10/db/version_set.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_set.h 2018-11-12 19:57:32.000000000 +0000 @@ -52,7 +52,6 @@ } class Compaction; -class InternalIterator; class LogBuffer; class LookupKey; class MemTable; @@ -189,9 +188,11 @@ std::vector* inputs, int hint_index = -1, // index of overlap file int* file_index = nullptr, // return index of overlap file - bool expand_range = true) // if set, returns files which overlap the - const; // range and overlap each other. If false, + bool expand_range = true, // if set, returns files which overlap the + // range and overlap each other. If false, // then just files intersecting the range + InternalKey** next_smallest = nullptr) // if non-null, returns the + const; // smallest key of next file not included void GetCleanInputsWithinInterval( int level, const InternalKey* begin, // nullptr means before all keys const InternalKey* end, // nullptr means after all keys @@ -201,14 +202,15 @@ const; void GetOverlappingInputsRangeBinarySearch( - int level, // level > 0 + int level, // level > 0 const InternalKey* begin, // nullptr means before all keys const InternalKey* end, // nullptr means after all keys std::vector* inputs, int hint_index, // index of overlap file int* file_index, // return index of overlap file - bool within_interval = false) // if set, force the inputs within interval - const; + bool within_interval = false, // if set, force the inputs within interval + InternalKey** next_smallest = nullptr) // if non-null, returns the + const; // smallest key of next file not included void ExtendFileRangeOverlappingInterval( int level, @@ -729,6 +731,10 @@ } }; +namespace { +class BaseReferencedVersionBuilder; +} + class VersionSet { public: VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options, @@ -832,6 +838,11 @@ // Allocate and return a new file number uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); } + // Fetch And Add n new file number + uint64_t FetchAddFileNumber(uint64_t n) { + return next_file_number_.fetch_add(n); + } + // Return the last sequence number. 
uint64_t LastSequence() const { return last_sequence_.load(std::memory_order_acquire); @@ -985,6 +996,16 @@ ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, VersionEdit* edit); + Status ApplyOneVersionEdit( + VersionEdit& edit, + const std::unordered_map& name_to_opts, + std::unordered_map& column_families_not_found, + std::unordered_map& builders, + bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family); + Status ProcessManifestWrites(std::deque& writers, InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log, diff -Nru rocksdb-5.15.10/db/version_set_test.cc rocksdb-5.17.2/db/version_set_test.cc --- rocksdb-5.15.10/db/version_set_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_set_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -566,7 +566,7 @@ manifest, &file, env_->OptimizeForManifestWrite(env_options_)); ASSERT_OK(s); unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options_)); + new WritableFileWriter(std::move(file), manifest, env_options_)); { log::Writer log(std::move(file_writer), 0, false); std::string record; diff -Nru rocksdb-5.15.10/db/wal_manager.cc rocksdb-5.17.2/db/wal_manager.cc --- rocksdb-5.15.10/db/wal_manager.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/wal_manager.cc 2018-11-12 19:57:32.000000000 +0000 @@ -237,7 +237,7 @@ } size_t const files_keep_num = - db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size; + static_cast(db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size); if (log_files_num <= files_keep_num) { return; } @@ -352,7 +352,7 @@ // Binary Search. avoid opening all files. while (end >= start) { int64_t mid = start + (end - start) / 2; // Avoid overflow. - SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence(); + SequenceNumber current_seq_num = all_logs.at(static_cast(mid))->StartSequence(); if (current_seq_num == target) { end = mid; break; @@ -363,7 +363,7 @@ } } // end could be -ve. 
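The casts added in wal_manager.cc above also guard the subtle point noted in the "end could be -ve" comment: end comes out of a signed binary search and can be negative, so it must be clamped at zero before conversion to size_t, because an unguarded cast would wrap to a huge index. A tiny illustration with assumed values:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
  int64_t end = -1;  // e.g. the target sequence precedes every WAL file

  // Wrong: converting a negative value straight to size_t wraps around.
  size_t wrapped = static_cast<size_t>(end);
  assert(wrapped == static_cast<size_t>(-1));  // enormous index

  // What the patch does: clamp first, then convert.
  size_t start_index =
      static_cast<size_t>(std::max(static_cast<int64_t>(0), end));
  assert(start_index == 0);
  return 0;
}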
- size_t start_index = std::max(static_cast(0), end); + size_t start_index = static_cast(std::max(static_cast(0), end)); // The last wal file is always included all_logs.erase(all_logs.begin(), all_logs.begin() + start_index); return Status::OK(); @@ -457,7 +457,7 @@ reporter.status = &status; reporter.ignore_error = !db_options_.paranoid_checks; log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter, - true /*checksum*/, 0 /*initial_offset*/, number); + true /*checksum*/, number); std::string scratch; Slice record; diff -Nru rocksdb-5.15.10/db/wal_manager_test.cc rocksdb-5.17.2/db/wal_manager_test.cc --- rocksdb-5.15.10/db/wal_manager_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/wal_manager_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -79,7 +79,7 @@ unique_ptr file; ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options_)); + new WritableFileWriter(std::move(file), fname, env_options_)); current_log_writer_.reset(new log::Writer(std::move(file_writer), 0, false)); } @@ -130,7 +130,7 @@ ASSERT_EQ(s, 0U); unique_ptr file_writer( - new WritableFileWriter(std::move(file), EnvOptions())); + new WritableFileWriter(std::move(file), path, EnvOptions())); log::Writer writer(std::move(file_writer), 1, db_options_.recycle_log_file_num > 0); WriteBatch batch; diff -Nru rocksdb-5.15.10/db/write_batch.cc rocksdb-5.17.2/db/write_batch.cc --- rocksdb-5.15.10/db/write_batch.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/write_batch.cc 2018-11-12 19:57:32.000000000 +0000 @@ -727,6 +727,11 @@ ContentFlags::HAS_END_PREPARE | ContentFlags::HAS_BEGIN_PREPARE, std::memory_order_relaxed); + if (unprepared_batch) { + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_BEGIN_UNPREPARE, + std::memory_order_relaxed); + } return Status::OK(); } diff -Nru rocksdb-5.15.10/db/write_thread.cc rocksdb-5.17.2/db/write_thread.cc --- rocksdb-5.15.10/db/write_thread.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/write_thread.cc 2018-11-12 19:57:32.000000000 +0000 @@ -24,7 +24,10 @@ enable_pipelined_write_(db_options.enable_pipelined_write), newest_writer_(nullptr), newest_memtable_writer_(nullptr), - last_sequence_(0) {} + last_sequence_(0), + write_stall_dummy_(), + stall_mu_(), + stall_cv_(&stall_mu_) {} uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) { // We're going to block. Lazily create the mutex. We guarantee @@ -219,6 +222,28 @@ assert(w->state == STATE_INIT); Writer* writers = newest_writer->load(std::memory_order_relaxed); while (true) { + // If write stall in effect, and w->no_slowdown is not true, + // block here until stall is cleared. 
If its true, then return + // immediately + if (writers == &write_stall_dummy_) { + if (w->no_slowdown) { + w->status = Status::Incomplete("Write stall"); + SetState(w, STATE_COMPLETED); + return false; + } + // Since no_slowdown is false, wait here to be notified of the write + // stall clearing + { + MutexLock lock(&stall_mu_); + writers = newest_writer->load(std::memory_order_relaxed); + if (writers == &write_stall_dummy_) { + stall_cv_.Wait(); + // Load newest_writers_ again since it may have changed + writers = newest_writer->load(std::memory_order_relaxed); + continue; + } + } + } w->link_older = writers; if (newest_writer->compare_exchange_weak(writers, w)) { return (writers == nullptr); @@ -303,12 +328,44 @@ SetState(w, STATE_COMPLETED); } +void WriteThread::BeginWriteStall() { + LinkOne(&write_stall_dummy_, &newest_writer_); + + // Walk writer list until w->write_group != nullptr. The current write group + // will not have a mix of slowdown/no_slowdown, so its ok to stop at that + // point + Writer* w = write_stall_dummy_.link_older; + Writer* prev = &write_stall_dummy_; + while (w != nullptr && w->write_group == nullptr) { + if (w->no_slowdown) { + prev->link_older = w->link_older; + w->status = Status::Incomplete("Write stall"); + SetState(w, STATE_COMPLETED); + w = prev->link_older; + } else { + prev = w; + w = w->link_older; + } + } +} + +void WriteThread::EndWriteStall() { + MutexLock lock(&stall_mu_); + + assert(newest_writer_.load(std::memory_order_relaxed) == &write_stall_dummy_); + newest_writer_.exchange(write_stall_dummy_.link_older); + + // Wake up writers + stall_cv_.SignalAll(); +} + static WriteThread::AdaptationContext jbg_ctx("JoinBatchGroup"); void WriteThread::JoinBatchGroup(Writer* w) { TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Start", w); assert(w->batch != nullptr); bool linked_as_leader = LinkOne(w, &newest_writer_); + if (linked_as_leader) { SetState(w, STATE_GROUP_LEADER); } diff -Nru rocksdb-5.15.10/db/write_thread.h rocksdb-5.17.2/db/write_thread.h --- rocksdb-5.15.10/db/write_thread.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/write_thread.h 2018-11-12 19:57:32.000000000 +0000 @@ -342,6 +342,13 @@ return last_sequence_; } + // Insert a dummy writer at the tail of the write queue to indicate a write + // stall, and fail any writers in the queue with no_slowdown set to true + void BeginWriteStall(); + + // Remove the dummy writer and wake up waiting writers + void EndWriteStall(); + private: // See AwaitState. const uint64_t max_yield_usec_; @@ -365,6 +372,17 @@ // is not necessary visible to reads because the writer can be ongoing. SequenceNumber last_sequence_; + // A dummy writer to indicate a write stall condition. This will be inserted + // at the tail of the writer queue by the leader, so newer writers can just + // check for this and bail + Writer write_stall_dummy_; + + // Mutex and condvar for writers to block on a write stall. During a write + // stall, writers with no_slowdown set to false will wait on this rather + // on the writer queue + port::Mutex stall_mu_; + port::CondVar stall_cv_; + // Waits for w->state & goal_mask using w->StateMutex(). Returns // the state that satisfies goal_mask. 
uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask); diff -Nru rocksdb-5.15.10/debian/changelog rocksdb-5.17.2/debian/changelog --- rocksdb-5.15.10/debian/changelog 2018-11-21 21:07:17.000000000 +0000 +++ rocksdb-5.17.2/debian/changelog 2018-12-19 17:01:38.000000000 +0000 @@ -1,3 +1,23 @@ +rocksdb (5.17.2-3) unstable; urgency=medium + + * Backport fix for snprintf() buffer overflow. + * Upload to Sid. + + -- Laszlo Boszormenyi (GCS) Wed, 19 Dec 2018 17:01:38 +0000 + +rocksdb (5.17.2-2) experimental; urgency=medium + + * Backport upstream fix for db_bench_tool.cc FTBFS. + + -- Laszlo Boszormenyi (GCS) Sun, 09 Dec 2018 06:00:39 +0000 + +rocksdb (5.17.2-1) experimental; urgency=medium + + * New upstream release. + * Library transition from librocksdb5.15 to librocksdb5.17 . + + -- Laszlo Boszormenyi (GCS) Thu, 22 Nov 2018 16:25:48 +0100 + rocksdb (5.15.10-2) unstable; urgency=medium * Remove ppc64 from build architectures, upstream doesn't want to diff -Nru rocksdb-5.15.10/debian/control rocksdb-5.17.2/debian/control --- rocksdb-5.15.10/debian/control 2018-11-21 21:07:17.000000000 +0000 +++ rocksdb-5.17.2/debian/control 2018-11-22 15:25:48.000000000 +0000 @@ -4,12 +4,12 @@ Maintainer: Laszlo Boszormenyi (GCS) Build-Depends: debhelper (>= 11), libgflags-dev, libsnappy-dev, libbz2-dev, zlib1g-dev, liblz4-dev, libzstd-dev Standards-Version: 4.2.1 -Homepage: http://rocksdb.org/ +Homepage: https://rocksdb.org/ Package: librocksdb-dev Section: libdevel Architecture: amd64 arm64 ppc64el mips mipsel mips64el sparc64 s390x i386 -Depends: ${misc:Depends}, librocksdb5.15 (= ${binary:Version}) +Depends: ${misc:Depends}, librocksdb5.17 (= ${binary:Version}) Conflicts: librocksdb5.7 Replaces: librocksdb5.7 Description: persistent Key-Value Store for Flash and RAM Storage (development) @@ -36,7 +36,7 @@ This package contains libraries and header files for developing applications that use librocksdb . -Package: librocksdb5.15 +Package: librocksdb5.17 Section: libs Architecture: amd64 arm64 ppc64el mips mipsel mips64el sparc64 s390x i386 Depends: ${misc:Depends}, ${shlibs:Depends} diff -Nru rocksdb-5.15.10/debian/copyright rocksdb-5.17.2/debian/copyright --- rocksdb-5.15.10/debian/copyright 2017-10-01 07:44:53.000000000 +0000 +++ rocksdb-5.17.2/debian/copyright 2018-11-22 15:25:48.000000000 +0000 @@ -1,6 +1,6 @@ -Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: RocksDB -Upstream-Contact: https://code.facebook.com/projects/ +Upstream-Contact: https://opensource.fb.com/ Source: https://github.com/facebook/rocksdb Copyright: Copyright (C) 2013- Facebook Database Engineering Team @@ -18,7 +18,7 @@ Files: java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java Copyright: Copyright (C) 2011 Dain Sundstrom , - Copyright (C) 2011 FuseSource Corp. http://fusesource.com + Copyright (C) 2011 FuseSource Corp. https://fusesource.com License: Apache-2.0 Files: java/rocksjni.pom @@ -35,7 +35,7 @@ Version 2.0 (the "License"); you may not use this work except in compliance with the License. You may obtain a copy of the License at . - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 . On Debian systems, the complete text of the Apache License Version 2.0 can be found in the file '/usr/share/common-licenses/Apache-2.0'. @@ -65,7 +65,7 @@ GNU General Public License for more details. . 
You should have received a copy of the GNU General Public License - along with this program. If not, see . + along with this program. If not, see . . On Debian systems, the complete text of the GNU General Public License 3 can be found in the file `/usr/share/common-licenses/GPL-3'. diff -Nru rocksdb-5.15.10/debian/librocksdb5.15.install rocksdb-5.17.2/debian/librocksdb5.15.install --- rocksdb-5.15.10/debian/librocksdb5.15.install 2017-10-01 07:44:53.000000000 +0000 +++ rocksdb-5.17.2/debian/librocksdb5.15.install 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -usr/lib/lib*.so.*.* diff -Nru rocksdb-5.15.10/debian/librocksdb5.17.install rocksdb-5.17.2/debian/librocksdb5.17.install --- rocksdb-5.15.10/debian/librocksdb5.17.install 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/debian/librocksdb5.17.install 2017-10-01 07:44:53.000000000 +0000 @@ -0,0 +1 @@ +usr/lib/lib*.so.*.* diff -Nru rocksdb-5.15.10/debian/patches/fix_db_bench_tool_FTBFS.patch rocksdb-5.17.2/debian/patches/fix_db_bench_tool_FTBFS.patch --- rocksdb-5.15.10/debian/patches/fix_db_bench_tool_FTBFS.patch 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/debian/patches/fix_db_bench_tool_FTBFS.patch 2018-11-22 15:25:48.000000000 +0000 @@ -0,0 +1,48 @@ +From f959e88048642f3548065d07306ad7a0ffdeaa7e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= +Date: Fri, 19 Oct 2018 14:43:55 -0700 +Subject: [PATCH] Fix printf formatting on MacOS (#4533) + +Summary: +On MacOS with clang the compilation of _tools/db_bench_tool.cc_ always fails because the format used in a `fprintf` call has the wrong type. This PR should hopefully fix this issue +``` +tools/db_bench_tool.cc:4233:61: error: format specifies type 'unsigned long long' but the argument has type 'size_t' (aka 'unsigned long') +``` +Pull Request resolved: https://github.com/facebook/rocksdb/pull/4533 + +Differential Revision: D10471657 + +Pulled By: maysamyabandeh + +fbshipit-source-id: f20f5f3756d3571b586c895c845d0d4d1e34a398 +--- + .travis.yml | 2 +- + tools/db_bench_tool.cc | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/.travis.yml b/.travis.yml +index b366da2517..e759a642a0 100644 +--- a/.travis.yml ++++ b/.travis.yml +@@ -90,7 +90,7 @@ script: + OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=full_filter_block_test make -j4 check_some + ;; + 2) +- OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=full_filter_block_test ROCKSDBTESTS_END=write_batch_with_index_test make -j4 check_some ++ OPT=-DTRAVIS V=1 make -j4 tools && OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=full_filter_block_test ROCKSDBTESTS_END=write_batch_with_index_test make -j4 check_some + ;; + 3) + OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=write_batch_with_index_test ROCKSDBTESTS_END=write_prepared_transaction_test make -j4 check_some +diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc +index 1a68ad6548..a416b91abe 100644 +--- a/tools/db_bench_tool.cc ++++ b/tools/db_bench_tool.cc +@@ -4237,7 +4237,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { + } + if (levelMeta.level == 0) { + for (auto& fileMeta : levelMeta.files) { +- fprintf(stdout, "Level[%d]: %s(size: %" PRIu64 " bytes)\n", ++ fprintf(stdout, "Level[%d]: %s(size: %" ROCKSDB_PRIszt " bytes)\n", + levelMeta.level, fileMeta.name.c_str(), fileMeta.size); + } + } else { diff -Nru rocksdb-5.15.10/debian/patches/fix_snprintf_buffer_overflow_bug.patch rocksdb-5.17.2/debian/patches/fix_snprintf_buffer_overflow_bug.patch --- rocksdb-5.15.10/debian/patches/fix_snprintf_buffer_overflow_bug.patch 
1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/debian/patches/fix_snprintf_buffer_overflow_bug.patch 2018-12-19 17:01:38.000000000 +0000 @@ -0,0 +1,38 @@ +From 1fb68055271bc4cf879325db49f8c4266bbcb5e6 Mon Sep 17 00:00:00 2001 +From: Maysam Yabandeh +Date: Fri, 5 Oct 2018 14:49:01 -0700 +Subject: [PATCH] Fix snprintf buffer overflow bug (#4465) + +Summary: +The contract of snprintf says that it returns "The number of characters that would have been written if n had been sufficiently large" http://www.cplusplus.com/reference/cstdio/snprintf/ +The existing code however was assuming that the return value is the actual number of written bytes and uses that to reposition the starting point on the next call to snprintf. This leads to buffer overflow when the last call to snprintf has filled up the buffer. +Pull Request resolved: https://github.com/facebook/rocksdb/pull/4465 + +Differential Revision: D10224080 + +Pulled By: maysamyabandeh + +fbshipit-source-id: 40f44e122d15b0db439812a0a361167cf012de3e +--- + db/compaction.cc | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/db/compaction.cc b/db/compaction.cc +index 4ea92d5cc7..b3921eb4bc 100644 +--- a/db/compaction.cc ++++ b/db/compaction.cc +@@ -331,12 +331,14 @@ const char* Compaction::InputLevelSummary( + if (!is_first) { + len += + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + "); ++ len = std::min(len, static_cast(sizeof(scratch->buffer))); + } else { + is_first = false; + } + len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + "%" ROCKSDB_PRIszt "@%d", input_level.size(), + input_level.level); ++ len = std::min(len, static_cast(sizeof(scratch->buffer))); + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + " files to L%d", output_level()); diff -Nru rocksdb-5.15.10/debian/patches/series rocksdb-5.17.2/debian/patches/series --- rocksdb-5.15.10/debian/patches/series 2018-09-18 20:52:12.000000000 +0000 +++ rocksdb-5.17.2/debian/patches/series 2018-12-19 17:01:38.000000000 +0000 @@ -1,3 +1,5 @@ install_dir-is-destdir.patch build_reproducible.patch verbose_build.patch +fix_db_bench_tool_FTBFS.patch +fix_snprintf_buffer_overflow_bug.patch diff -Nru rocksdb-5.15.10/debian/rules rocksdb-5.17.2/debian/rules --- rocksdb-5.15.10/debian/rules 2018-07-17 16:29:50.000000000 +0000 +++ rocksdb-5.17.2/debian/rules 2018-11-22 15:25:48.000000000 +0000 @@ -17,7 +17,11 @@ override_dh_auto_install: dh_auto_install --destdir=$(CURDIR)/debian/tmp/usr/ +override_dh_missing: + dh_missing --list-missing + %: - dh $@ --fail-missing + dh $@ -.PHONY: override_dh_auto_build override_dh_auto_test override_dh_auto_install +.PHONY: override_dh_auto_build override_dh_auto_test \ + override_dh_auto_install override_dh_missing diff -Nru rocksdb-5.15.10/docs/_data/authors.yml rocksdb-5.17.2/docs/_data/authors.yml --- rocksdb-5.15.10/docs/_data/authors.yml 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/docs/_data/authors.yml 2018-11-12 19:57:32.000000000 +0000 @@ -60,3 +60,7 @@ lightmark: full_name: Aaron Gao fbid: 1351549072 + +fgwu: + full_name: Fenggang Wu + fbid: 100002297362180 diff -Nru rocksdb-5.15.10/docs/feed.xml rocksdb-5.17.2/docs/feed.xml --- rocksdb-5.15.10/docs/feed.xml 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/docs/feed.xml 2018-11-12 19:57:32.000000000 +0000 @@ -6,7 +6,7 @@ {{ site.title | xml_escape }} {{ site.description | xml_escape }} - {{ absolute_url }}/ + https://rocksdb.org/feed.xml {{ site.time | date_to_rfc822 }} {{ site.time | date_to_rfc822 }} diff -Nru 
rocksdb-5.15.10/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown rocksdb-5.17.2/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown --- rocksdb-5.15.10/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,58 @@ +--- +title: Rocksdb Tuning Advisor +layout: post +author: poojam23 +category: blog +--- + +The performance of Rocksdb is contingent on its tuning. However, because +of the complexity of its underlying technology and a large number of +configurable parameters, a good configuration is sometimes hard to obtain. The aim of +the Python command-line tool, Rocksdb Advisor, is to automate the process of +suggesting improvements in the configuration based on advice from Rocksdb +experts. + +### Overview + +Experts share their wisdom as rules comprising conditions and suggestions in the INI format (refer +[rules.ini](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rules.ini)). +Users provide the Rocksdb configuration that they want to improve upon (as the +familiar Rocksdb OPTIONS file — +[example](https://github.com/facebook/rocksdb/blob/master/examples/rocksdb_option_file_example.ini)) +and the path of the file which contains Rocksdb logs and statistics. +The [Advisor](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser_example.py) +creates appropriate DataSource objects (for Rocksdb +[logs](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_log_parser.py), +[options](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_options_parser.py), +[statistics](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_stats_fetcher.py) etc.) +and provides them to the [Rules Engine](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser.py). +The Rules Engine uses these expert-provided rules to parse the data-sources and trigger the appropriate rules. +The Advisor's output gives information about which rules were triggered, +why they were triggered and what each of them suggests. Each suggestion +provided by a triggered rule advises some action on a Rocksdb +configuration option, for example, increase CFOptions.write_buffer_size, +set bloom_bits to 2, etc. + +### Usage + +An example command to run the tool: + +```shell +cd rocksdb/tools/advisor +python3 -m advisor.rule_parser_example --rules_spec=advisor/rules.ini --rocksdb_options=test/input_files/OPTIONS-000005 --log_files_path_prefix=test/input_files/LOG-0 --stats_dump_period_sec=20 +``` + +Sample output where a Rocksdb log-based rule has been triggered: + +```shell +Rule: stall-too-many-memtables +LogCondition: stall-too-many-memtables regex: Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+ +Suggestion: inc-bg-flush option : DBOptions.max_background_flushes action : increase suggested_values : ['2'] +Suggestion: inc-write-buffer option : CFOptions.max_write_buffer_number action : increase +scope: col_fam: +{'default'} +``` + +### Read more + +For more information, refer to [advisor](https://github.com/facebook/rocksdb/tree/master/tools/advisor/README.md).
diff -Nru rocksdb-5.15.10/docs/_posts/2018-08-23-data-block-hash-index.markdown rocksdb-5.17.2/docs/_posts/2018-08-23-data-block-hash-index.markdown --- rocksdb-5.15.10/docs/_posts/2018-08-23-data-block-hash-index.markdown 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/docs/_posts/2018-08-23-data-block-hash-index.markdown 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,118 @@ +--- +title: Improving Point-Lookup Using Data Block Hash Index +layout: post +author: fgwu +category: blog +--- +We've designed and implemented a _data block hash index_ in RocksDB that has the benefit of both reducing CPU utilization and increasing throughput for point lookup queries, with a reasonable and tunable space overhead. + +Specifically, we append a compact hash table to the end of the data block for efficient indexing. It is backward compatible with databases created without this feature. After the hash index feature is turned on, existing data will gradually be converted to the hash index format. + +Benchmarks with `db_bench` show the CPU utilization of one of the main functions in the point lookup code path, `DataBlockIter::Seek()`, is reduced by 21.8%, and the overall RocksDB throughput is increased by 10% under purely cached workloads, at an overhead of 4.6% more space. Shadow testing with Facebook production traffic shows good CPU improvements too. + + +### How to use it +Two new options are added as part of this feature: `BlockBasedTableOptions::data_block_index_type` and `BlockBasedTableOptions::data_block_hash_table_util_ratio`. + +The hash index is disabled by default unless `BlockBasedTableOptions::data_block_index_type` is set to `data_block_index_type = kDataBlockBinaryAndHash`. The hash table utilization ratio is adjustable using `BlockBasedTableOptions::data_block_hash_table_util_ratio`, which is valid only if `data_block_index_type = kDataBlockBinaryAndHash`. + + +``` +// the definitions can be found in include/rocksdb/table.h + +// The index type that will be used for the data block. +enum DataBlockIndexType : char { + kDataBlockBinarySearch = 0, // traditional block type + kDataBlockBinaryAndHash = 1, // additional hash index +}; + +// Set to kDataBlockBinaryAndHash to enable hash index +DataBlockIndexType data_block_index_type = kDataBlockBinarySearch; + +// #entries/#buckets. It is valid only when data_block_hash_index_type is +// kDataBlockBinaryAndHash. +double data_block_hash_table_util_ratio = 0.75; + +``` + + +### Data Block Hash Index Design + +The current data block format groups adjacent keys together as a restart interval. One block consists of multiple restart intervals. The byte offset of the beginning of each restart interval, i.e. a restart point, is stored in an array called the restart interval index or binary seek index. RocksDB does a binary search when performing point lookup for keys in data blocks to find the restart interval where the key may reside. We will use binary seek and binary search interchangeably in this post. + +In order to find the right location where the key may reside using binary search, multiple key parsings and comparisons are needed. Each binary search branch can trigger a CPU cache miss, consuming significant CPU. We have seen that this binary search takes up considerable CPU in production use-cases. + +![](/static/images/data-block-hash-index/block-format-binary-seek.png) + +We implemented a hash map at the end of the block to index the keys and reduce the CPU overhead of the binary search.
The hash index is just an array of pointers pointing into the binary seek index. + +![](/static/images/data-block-hash-index/block-format-hash-index.png) + + +Each array element is considered a hash bucket when storing the location of a key (or more precisely, the restart index of the restart interval where the key resides). When multiple keys happen to hash into the same bucket (hash collision), we just mark the bucket as “collision”, so that a later query on that key knows a hash collision happened and can fall back to the traditional binary search to find the location of the key. + +We define the hash table utilization ratio as #keys/#buckets. If the utilization ratio is 0.5 and there are 100 buckets, 50 keys are stored in those buckets. The lower the util ratio, the fewer hash collisions, and the lower the chance that a point lookup falls back to binary seek (the fall-back ratio) due to a collision. So a small util ratio does more to reduce CPU time but introduces more space overhead. + +Space overhead depends on the util ratio. Each bucket is a `uint8_t` (i.e. one byte). For a util ratio of 1, the space overhead is 1 byte per key, and the observed fall-back ratio is ~52%. + +![](/static/images/data-block-hash-index/hash-index-data-structure.png) + +### Things that Need Attention + +**Customized Comparator** + +The hash index hashes different keys (keys with different content, or byte sequences) into different hash values. This assumes the comparator will not treat different keys as equal if they have different content. + +The default bytewise comparator orders the keys in alphabetical order and works well with the hash index, as different keys will never be regarded as equal. However, some specially crafted comparators will. For example, a `StringToIntComparator` might convert a string into an integer and use the integer to perform the comparison. The key strings “16” and “0x10” are equal as seen by this `StringToIntComparator`, but they probably hash to different values. Later queries for one form of the key will not be able to find the existing key stored in the other form. + +We add a new function member to the comparator interface: + +``` +virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; } +``` + + +Every comparator implementation should override this function and specify the behavior of the comparator. If a comparator can regard different keys as equal, the function returns true, and as a result the hash index feature will not be enabled, and vice versa. + +NOTE: to use the hash index feature, one should 1) have a comparator that can never treat different keys as equal; and 2) override the `CanKeysWithDifferentByteContentsBeEqual()` function to return `false`, so the hash index can be enabled. + + +**Util Ratio's Impact on Data Block Cache** + +Adding the hash index to the end of the data block essentially takes up data block cache space, making the effective data block cache size smaller and increasing the data block cache miss ratio. Therefore, a very small util ratio will result in a large data block cache miss ratio, and the extra I/O may drag down the throughput gain achieved by the hash index lookup. Besides, when compression is enabled, a cache miss also incurs data block decompression, which is CPU-consuming. Therefore CPU usage may even increase if the util ratio is too small. The best util ratio depends on workloads, cache to data ratio, disk bandwidth/latency etc.
In our experiment, we found that util ratio = 0.5 ~ 1 is a good range to explore, bringing both CPU and throughput gains. + + +### Limitations + +As we use a `uint8_t` to store the binary seek index, i.e. the restart interval index, the total number of restart intervals cannot be more than 253 (we reserved 255 and 254 as special flags). For blocks having a larger number of restart intervals, the hash index will not be created and the point lookup will be done by traditional binary seek. + +The data block hash index only supports point lookup. We do not support range lookup. Range lookup requests will fall back to BinarySeek. + +RocksDB supports many types of records, such as `Put`, `Delete`, `Merge`, etc. (visit [here](https://github.com/facebook/rocksdb/wiki/rocksdb-basics) for more information). Currently we only support `Put` and `Delete`, but not `Merge`. Internally we have a limited set of supported record types: + + +``` +kPutRecord, <=== supported +kDeleteRecord, <=== supported +kSingleDeleteRecord, <=== supported +kTypeBlobIndex, <=== supported +``` + +For records not supported, the searching process will fall back to the traditional binary seek. + + + +### Evaluation +To evaluate the CPU utilization reduction and isolate it from other factors such as disk I/O and block decompression, we first evaluate the hash index in a purely cached workload. We observe that the CPU utilization of one of the main functions in the point lookup code path, DataBlockIter::Seek(), is reduced by 21.8% and the overall throughput is increased by 10% at an overhead of 4.6% more space. + +However, a general workload is not always purely cached, so we also evaluate the performance under different levels of cache space pressure. In the following test, we use `db_bench` with RocksDB deployed on SSDs. The total DB size is 5~6GB, and it is about 14GB if decompressed. Different block cache sizes are used, ranging from 14GB down to 2GB, with an increasing cache miss ratio. + +Orange bars represent our hash index performance. We use a hash util ratio of 1.0 in this test. Block size is set to 16KiB with the restart interval set to 16. + +![](/static/images/data-block-hash-index/perf-throughput.png) +![](/static/images/data-block-hash-index/perf-cache-miss.png) + +We can see that if the cache size is greater than 8GB, the hash index can bring a throughput gain. A cache size greater than 8GB translates to a cache miss ratio smaller than 40%. So if the workload has a cache miss ratio smaller than 40%, the hash index is able to increase the throughput. + +Besides, shadow testing with Facebook production traffic shows good CPU improvements too.
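To make the two options above concrete, here is a minimal illustrative sketch (not taken from the post or the patch) of how an application might enable the data block hash index through the standard block-based table factory; the helper function name is invented for illustration, while the option and enum names are the ones described above (the enum is nested in `BlockBasedTableOptions` in `include/rocksdb/table.h`):

```
// Minimal sketch: enable the data block hash index on a column family.
// Assumes the usual BlockBasedTable setup; MakeHashIndexOptions is a
// hypothetical helper, not part of RocksDB.
#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakeHashIndexOptions() {
  rocksdb::BlockBasedTableOptions table_options;
  // Opt in to the additional hash index appended to each data block.
  table_options.data_block_index_type =
      rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;
  // #entries/#buckets; a smaller ratio means fewer collisions (and fewer
  // fall-backs to binary seek) at the cost of more space per block.
  table_options.data_block_hash_table_util_ratio = 0.75;

  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}
```

Keep in mind that, as discussed above, the hash index only takes effect when the comparator in use reports `CanKeysWithDifferentByteContentsBeEqual()` as `false`.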
+ Binary files /tmp/tmpCpQiov/sdWDYeQVrI/rocksdb-5.15.10/docs/static/images/binaryseek.png and /tmp/tmpCpQiov/KYFbIzS3v6/rocksdb-5.17.2/docs/static/images/binaryseek.png differ Binary files /tmp/tmpCpQiov/sdWDYeQVrI/rocksdb-5.15.10/docs/static/images/data-block-hash-index/block-format-binary-seek.png and /tmp/tmpCpQiov/KYFbIzS3v6/rocksdb-5.17.2/docs/static/images/data-block-hash-index/block-format-binary-seek.png differ Binary files /tmp/tmpCpQiov/sdWDYeQVrI/rocksdb-5.15.10/docs/static/images/data-block-hash-index/block-format-hash-index.png and /tmp/tmpCpQiov/KYFbIzS3v6/rocksdb-5.17.2/docs/static/images/data-block-hash-index/block-format-hash-index.png differ Binary files /tmp/tmpCpQiov/sdWDYeQVrI/rocksdb-5.15.10/docs/static/images/data-block-hash-index/hash-index-data-structure.png and /tmp/tmpCpQiov/KYFbIzS3v6/rocksdb-5.17.2/docs/static/images/data-block-hash-index/hash-index-data-structure.png differ Binary files /tmp/tmpCpQiov/sdWDYeQVrI/rocksdb-5.15.10/docs/static/images/data-block-hash-index/perf-cache-miss.png and /tmp/tmpCpQiov/KYFbIzS3v6/rocksdb-5.17.2/docs/static/images/data-block-hash-index/perf-cache-miss.png differ Binary files /tmp/tmpCpQiov/sdWDYeQVrI/rocksdb-5.15.10/docs/static/images/data-block-hash-index/perf-throughput.png and /tmp/tmpCpQiov/KYFbIzS3v6/rocksdb-5.17.2/docs/static/images/data-block-hash-index/perf-throughput.png differ diff -Nru rocksdb-5.15.10/env/env_posix.cc rocksdb-5.17.2/env/env_posix.cc --- rocksdb-5.15.10/env/env_posix.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/env/env_posix.cc 2018-11-12 19:57:32.000000000 +0000 @@ -20,11 +20,12 @@ #include #include #include -#if defined(OS_LINUX) || defined(OS_SOLARIS) +#if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) #include #include #include #endif +#include #include #include #include @@ -102,6 +103,18 @@ std::string filename; }; +int cloexec_flags(int flags, const EnvOptions* options) { + // If the system supports opening the file with cloexec enabled, + // do so, as this avoids a race condition if a db is opened around + // the same time that a child process is forked +#ifdef O_CLOEXEC + if (options == nullptr || options->set_fd_cloexec) { + flags |= O_CLOEXEC; + } +#endif + return flags; +} + class PosixEnv : public Env { public: PosixEnv(); @@ -133,7 +146,7 @@ const EnvOptions& options) override { result->reset(); int fd = -1; - int flags = O_RDONLY; + int flags = cloexec_flags(O_RDONLY, &options); FILE* file = nullptr; if (options.use_direct_reads && !options.use_mmap_reads) { @@ -184,7 +197,8 @@ result->reset(); Status s; int fd; - int flags = O_RDONLY; + int flags = cloexec_flags(O_RDONLY, &options); + if (options.use_direct_reads && !options.use_mmap_reads) { #ifdef ROCKSDB_LITE return Status::IOError(fname, "Direct I/O not supported in RocksDB lite"); @@ -266,6 +280,8 @@ flags |= O_WRONLY; } + flags = cloexec_flags(flags, &options); + do { IOSTATS_TIMER_GUARD(open_nanos); fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); @@ -354,6 +370,8 @@ flags |= O_WRONLY; } + flags = cloexec_flags(flags, &options); + do { IOSTATS_TIMER_GUARD(open_nanos); fd = open(old_fname.c_str(), flags, @@ -415,9 +433,12 @@ unique_ptr* result, const EnvOptions& options) override { int fd = -1; + int flags = cloexec_flags(O_RDWR, &options); + while (fd < 0) { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), O_RDWR, GetDBFileMode(allow_non_owner_access_)); + + fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); if (fd < 0) { // Error while 
opening the file if (errno == EINTR) { @@ -437,9 +458,11 @@ unique_ptr* result) override { int fd = -1; Status status; + int flags = cloexec_flags(O_RDWR, nullptr); + while (fd < 0) { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), O_RDWR, 0644); + fd = open(fname.c_str(), flags, 0644); if (fd < 0) { // Error while opening the file if (errno == EINTR) { @@ -477,9 +500,10 @@ unique_ptr* result) override { result->reset(); int fd; + int flags = cloexec_flags(0, nullptr); { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(name.c_str(), 0); + fd = open(name.c_str(), flags); } if (fd < 0) { return IOError("While open directory", name, errno); @@ -496,7 +520,8 @@ return Status::OK(); } - switch (errno) { + int err = errno; + switch (err) { case EACCES: case ELOOP: case ENAMETOOLONG: @@ -504,8 +529,8 @@ case ENOTDIR: return Status::NotFound(); default: - assert(result == EIO || result == ENOMEM); - return Status::IOError("Unexpected error(" + ToString(result) + + assert(err == EIO || err == ENOMEM); + return Status::IOError("Unexpected error(" + ToString(err) + ") accessing file `" + fname + "' "); } } @@ -663,9 +688,11 @@ } int fd; + int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr); + { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + fd = open(fname.c_str(), flags, 0644); } if (fd < 0) { result = IOError("while open a file for lock", fname, errno); @@ -751,12 +778,30 @@ return gettid(pthread_self()); } + virtual Status GetFreeSpace(const std::string& fname, + uint64_t* free_space) override { + struct statvfs sbuf; + + if (statvfs(fname.c_str(), &sbuf) < 0) { + return IOError("While doing statvfs", fname, errno); + } + + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + return Status::OK(); + } + virtual Status NewLogger(const std::string& fname, shared_ptr* result) override { FILE* f; { IOSTATS_TIMER_GUARD(open_nanos); - f = fopen(fname.c_str(), "w"); + f = fopen(fname.c_str(), "w" +#ifdef __GLIBC_PREREQ +#if __GLIBC_PREREQ(2, 7) + "e" // glibc extension to enable O_CLOEXEC +#endif +#endif + ); } if (f == nullptr) { result->reset(); diff -Nru rocksdb-5.15.10/env/mock_env.cc rocksdb-5.17.2/env/mock_env.cc --- rocksdb-5.15.10/env/mock_env.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/env/mock_env.cc 2018-11-12 19:57:32.000000000 +0000 @@ -201,7 +201,7 @@ if (n > available) { n = available; } - pos_ += n; + pos_ += static_cast(n); return Status::OK(); } diff -Nru rocksdb-5.15.10/env/posix_logger.h rocksdb-5.17.2/env/posix_logger.h --- rocksdb-5.15.10/env/posix_logger.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/env/posix_logger.h 2018-11-12 19:57:32.000000000 +0000 @@ -165,7 +165,6 @@ size_t sz = fwrite(base, 1, write_size, file_); flush_pending_ = true; - assert(sz == write_size); if (sz > 0) { log_size_ += write_size; } diff -Nru rocksdb-5.15.10/.gitignore rocksdb-5.17.2/.gitignore --- rocksdb-5.15.10/.gitignore 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/.gitignore 2018-11-12 19:57:32.000000000 +0000 @@ -45,6 +45,8 @@ rocksdb_dump rocksdb_undump db_test2 +trace_analyzer +trace_analyzer_test java/out java/target diff -Nru rocksdb-5.15.10/HISTORY.md rocksdb-5.17.2/HISTORY.md --- rocksdb-5.15.10/HISTORY.md 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/HISTORY.md 2018-11-12 19:57:32.000000000 +0000 @@ -1,43 +1,58 @@ # Rocksdb Change Log -### 5.15.10 (9/13/2018) -### Bug Fixes -* Fix RocksDB Java build and tests. 
-### 5.15.9 (9/4/2018) +# 5.17.2 (10/24/2018) ### Bug Fixes -* Fix compilation errors on OS X clang due to '-Wsuggest-override'. +* Fix the bug that WriteBatchWithIndex's SeekForPrev() doesn't see the entries with the same key. -## 5.15.8 (8/31/2018) +# 5.17.1 (10/16/2018) ### Bug Fixes -* Further avoid creating empty SSTs and subsequently deleting them during compaction. +* Fix slow flush/compaction when DB contains many snapshots. The problem became noticeable to us in DBs with 100,000+ snapshots, though it will affect others at different thresholds. +* Properly set the stop key for a truncated manual CompactRange +* Fix corner case where a write group leader blocked due to write stall blocks other writers in queue with WriteOptions::no_slowdown set. -## 5.15.7 (8/24/2018) -### Bug Fixes -* Avoid creating empty SSTs and subsequently deleting them in certain cases during compaction. +### New Features +* Introduced CacheAllocator, which lets the user specify custom allocator for memory in block cache. -## 5.15.6 (8/21/2018) +## 5.17.0 (10/05/2018) ### Public API Change -* The merge operands are passed to `MergeOperator::ShouldMerge` in the reversed order relative to how they were merged (passed to FullMerge or FullMergeV2) for performance reasons +* `OnTableFileCreated` will now be called for empty files generated during compaction. In that case, `TableFileCreationInfo::file_path` will be "(nil)" and `TableFileCreationInfo::file_size` will be zero. +* Add `FlushOptions::allow_write_stall`, which controls whether Flush calls start working immediately, even if it causes user writes to stall, or will wait until flush can be performed without causing write stall (similar to `CompactRangeOptions::allow_write_stall`). Note that the default value is false, meaning we add delay to Flush calls until stalling can be avoided when possible. This is behavior change compared to previous RocksDB versions, where Flush calls didn't check if they might cause stall or not. +* Application using PessimisticTransactionDB is expected to rollback/commit recovered transactions before starting new ones. This assumption is used to skip concurrency control during recovery. + +### New Features +* TransactionOptions::skip_concurrency_control allows pessimistic transactions to skip the overhead of concurrency control. Could be used for optimizing certain transactions or during recovery. -## 5.15.5 (8/16/2018) ### Bug Fixes -* Fix VerifyChecksum() API not preserving options +* Avoid creating empty SSTs and subsequently deleting them in certain cases during compaction. +* Sync CURRENT file contents during checkpoint. -## 5.15.4 (8/11/2018) +## 5.16.3 (10/1/2018) ### Bug Fixes -* Fix a bug caused by not generating OnTableFileCreated() notification for a 0-byte SST. +* Fix crash caused when `CompactFiles` run with `CompactionOptions::compression == CompressionType::kDisableCompressionOption`. Now that setting causes the compression type to be chosen according to the column family-wide compression options. -## 5.15.3 (8/10/2018) +## 5.16.2 (9/21/2018) ### Bug Fixes -* Fix a bug in misreporting the estimated partition index size in properties block. +* Fix bug in partition filters with format_version=4. -## 5.15.2 (8/9/2018) +## 5.16.1 (9/17/2018) ### Bug Fixes -* Return correct usable_size for BlockContents. +* Remove trace_analyzer_tool from rocksdb_lib target in TARGETS file. +* Fix RocksDB Java build and tests. +* Remove sync point in Block destructor. 
+ +## 5.16.0 (8/21/2018) +### Public API Change +* The merge operands are passed to `MergeOperator::ShouldMerge` in the reversed order relative to how they were merged (passed to FullMerge or FullMergeV2) for performance reasons +* GetAllKeyVersions() to take an extra argument of `max_num_ikeys`. +* Using ZSTD dictionary trainer (i.e., setting `CompressionOptions::zstd_max_train_bytes` to a nonzero value) now requires ZSTD version 1.1.3 or later. + +### New Features +* Changes the format of index blocks by delta encoding the index values, which are the block handles. This saves the encoding of BlockHandle::offset of the non-head index entries in each restart interval. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 4 or above is used. +* Add a new tool: trace_analyzer. Trace_analyzer analyzes the trace file generated by using trace_replay API. It can convert the binary format trace file to a human readable txt file, output the statistics of the analyzed query types such as access statistics and size statistics, combining the dumped whole key space file to analyze, support query correlation analyzing, and etc. Current supported query types are: Get, Put, Delete, SingleDelete, DeleteRange, Merge, Iterator (Seek, SeekForPrev only). +* Add hash index support to data blocks, which helps reducing the cpu utilization of point-lookup operations. This feature is backward compatible with the data block created without the hash index. It is disabled by default unless BlockBasedTableOptions::data_block_index_type is set to data_block_index_type = kDataBlockBinaryAndHash. -## 5.15.1 (8/1/2018) ### Bug Fixes -* Prevent dereferencing invalid STL iterators when there are range tombstones in ingested files. +* Fix a bug in misreporting the estimated partition index size in properties block. ## 5.15.0 (7/17/2018) ### Public API Change @@ -48,12 +63,13 @@ * The "rocksdb.num.entries" table property no longer counts range deletion tombstones as entries. ### New Features -* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatbile but not forward compatible. It is disabled by default unless format_version 3 or above is used. +* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 3 or above is used. * Avoid memcpy when reading mmap files with OpenReadOnly and max_open_files==-1. * Support dynamically changing `ColumnFamilyOptions::ttl` via `SetOptions()`. * Add a new table property, "rocksdb.num.range-deletions", which counts the number of range deletion tombstones in the table. * Improve the performance of iterators doing long range scans by using readahead, when using direct IO. * pin_top_level_index_and_filter (default true) in BlockBasedTableOptions can be used in combination with cache_index_and_filter_blocks to prefetch and pin the top-level index of partitioned index and filter blocks in cache. It has no impact when cache_index_and_filter_blocks is false. +* Write properties meta-block at the end of block-based table to save read-ahead IO. 
### Bug Fixes * Fix deadlock with enable_pipelined_write=true and max_successive_merges > 0 @@ -172,7 +188,8 @@ * `BackupableDBOptions::max_valid_backups_to_open == 0` now means no backups will be opened during BackupEngine initialization. Previously this condition disabled limiting backups opened. * `DBOptions::preserve_deletes` is a new option that allows one to specify that DB should not drop tombstones for regular deletes if they have sequence number larger than what was set by the new API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)`. Disabled by default. * API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)` was added, users who wish to preserve deletes are expected to periodically call this function to advance the cutoff seqnum (all deletes made before this seqnum can be dropped by DB). It's user responsibility to figure out how to advance the seqnum in the way so the tombstones are kept for the desired period of time, yet are eventually processed in time and don't eat up too much space. -* `ReadOptions::iter_start_seqnum` was added; if set to something > 0 user will see 2 changes in iterators behavior 1) only keys written with sequence larger than this parameter would be returned and 2) the `Slice` returned by iter->key() now points to the memory that keep User-oriented representation of the internal key, rather than user key. New struct `FullKey` was added to represent internal keys, along with a new helper function `ParseFullKey(const Slice& internal_key, FullKey* result);`. +* `ReadOptions::iter_start_seqnum` was added; +if set to something > 0 user will see 2 changes in iterators behavior 1) only keys written with sequence larger than this parameter would be returned and 2) the `Slice` returned by iter->key() now points to the memory that keep User-oriented representation of the internal key, rather than user key. New struct `FullKey` was added to represent internal keys, along with a new helper function `ParseFullKey(const Slice& internal_key, FullKey* result);`. * Deprecate trash_dir param in NewSstFileManager, right now we will rename deleted files to .trash instead of moving them to trash directory * Allow setting a custom trash/DB size ratio limit in the SstFileManager, after which files that are to be scheduled for deletion are deleted immediately, regardless of any delete ratelimit. * Return an error on write if write_options.sync = true and write_options.disableWAL = true to warn user of inconsistent options. Previously we will not write to WAL and not respecting the sync options in this case. diff -Nru rocksdb-5.15.10/include/rocksdb/c.h rocksdb-5.17.2/include/rocksdb/c.h --- rocksdb-5.15.10/include/rocksdb/c.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/c.h 2018-11-12 19:57:32.000000000 +0000 @@ -42,9 +42,6 @@ (5) All of the pointer arguments must be non-NULL. 
*/ -#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_ -#define STORAGE_ROCKSDB_INCLUDE_C_H_ - #pragma once #ifdef _WIN32 @@ -126,6 +123,8 @@ typedef struct rocksdb_checkpoint_t rocksdb_checkpoint_t; typedef struct rocksdb_wal_iterator_t rocksdb_wal_iterator_t; typedef struct rocksdb_wal_readoptions_t rocksdb_wal_readoptions_t; +typedef struct rocksdb_memory_consumers_t rocksdb_memory_consumers_t; +typedef struct rocksdb_memory_usage_t rocksdb_memory_usage_t; /* DB operations */ @@ -831,6 +830,12 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions( + rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs( + rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_compactions( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_base_background_compactions( @@ -1669,8 +1674,33 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnableslice_value( const rocksdb_pinnableslice_t* t, size_t* vlen); +extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t* + rocksdb_memory_consumers_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db( + rocksdb_memory_consumers_t* consumers, rocksdb_t* db); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_cache( + rocksdb_memory_consumers_t* consumers, rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_destroy( + rocksdb_memory_consumers_t* consumers); +extern ROCKSDB_LIBRARY_API rocksdb_memory_usage_t* +rocksdb_approximate_memory_usage_create(rocksdb_memory_consumers_t* consumers, + char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_memory_usage_destroy( + rocksdb_memory_usage_t* usage); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_total( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_unflushed( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_readers_total( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_cache_total( + rocksdb_memory_usage_t* memory_usage); + #ifdef __cplusplus } /* end extern "C" */ #endif - -#endif /* STORAGE_ROCKSDB_INCLUDE_C_H_ */ diff -Nru rocksdb-5.15.10/include/rocksdb/cleanable.h rocksdb-5.17.2/include/rocksdb/cleanable.h --- rocksdb-5.15.10/include/rocksdb/cleanable.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/cleanable.h 2018-11-12 19:57:32.000000000 +0000 @@ -16,8 +16,7 @@ // non-const method, all threads accessing the same Iterator must use // external synchronization. 
-#ifndef INCLUDE_ROCKSDB_CLEANABLE_H_ -#define INCLUDE_ROCKSDB_CLEANABLE_H_ +#pragma once namespace rocksdb { @@ -78,5 +77,3 @@ }; } // namespace rocksdb - -#endif // INCLUDE_ROCKSDB_CLEANABLE_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/compaction_filter.h rocksdb-5.17.2/include/rocksdb/compaction_filter.h --- rocksdb-5.15.10/include/rocksdb/compaction_filter.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/compaction_filter.h 2018-11-12 19:57:32.000000000 +0000 @@ -6,8 +6,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ -#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ +#pragma once #include #include @@ -206,5 +205,3 @@ }; } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/comparator.h rocksdb-5.17.2/include/rocksdb/comparator.h --- rocksdb-5.15.10/include/rocksdb/comparator.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/comparator.h 2018-11-12 19:57:32.000000000 +0000 @@ -6,8 +6,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ -#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ +#pragma once #include @@ -74,6 +73,12 @@ const Slice& /*t*/) const { return false; } + + // return true if two keys with different byte sequences can be regarded + // as equal by this comparator. + // The major use case is to determine if DataBlockHashIndex is compatible + // with the customized comparator. + virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; } }; // Return a builtin comparator that uses lexicographic byte-wise @@ -86,5 +91,3 @@ extern const Comparator* ReverseBytewiseComparator(); } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/db.h rocksdb-5.17.2/include/rocksdb/db.h --- rocksdb-5.15.10/include/rocksdb/db.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/db.h 2018-11-12 19:57:32.000000000 +0000 @@ -6,8 +6,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_ -#define STORAGE_ROCKSDB_INCLUDE_DB_H_ +#pragma once #include #include @@ -53,6 +52,7 @@ class WriteBatch; class Env; class EventListener; +class TraceWriter; using std::unique_ptr; @@ -949,14 +949,14 @@ // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup // Retrieve the list of all files in the database. The files are - // relative to the dbname and are not absolute paths. The valid size of the - // manifest file is returned in manifest_file_size. The manifest file is an - // ever growing file, but only the portion specified by manifest_file_size is - // valid for this snapshot. - // Setting flush_memtable to true does Flush before recording the live files. - // Setting flush_memtable to false is useful when we don't want to wait for - // flush which may have to wait for compaction to complete taking an - // indeterminate time. + // relative to the dbname and are not absolute paths. Despite being relative + // paths, the file names begin with "/". The valid size of the manifest file + // is returned in manifest_file_size. 
The manifest file is an ever growing + // file, but only the portion specified by manifest_file_size is valid for + // this snapshot. Setting flush_memtable to true does Flush before recording + // the live files. Setting flush_memtable to false is useful when we don't + // want to wait for flush which may have to wait for compaction to complete + // taking an indeterminate time. // // In case you have multiple column families, even if flush_memtable is true, // you still need to call GetSortedWalFiles after GetLiveFiles to compensate @@ -996,11 +996,6 @@ std::vector* /*metadata*/) {} // Obtains the meta data of the specified column family of the DB. - // Status::NotFound() will be returned if the current DB does not have - // any column family match the specified name. - // - // If cf_name is not specified, then the metadata of the default - // column family will be returned. virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, ColumnFamilyMetaData* /*metadata*/) {} @@ -1173,6 +1168,15 @@ return Status::NotSupported("PromoteL0() is not implemented."); } + // Trace DB operations. Use EndTrace() to stop tracing. + virtual Status StartTrace(const TraceOptions& /*options*/, + std::unique_ptr&& /*trace_writer*/) { + return Status::NotSupported("StartTrace() is not implemented."); + } + + virtual Status EndTrace() { + return Status::NotSupported("EndTrace() is not implemented."); + } #endif // ROCKSDB_LITE // Needed for StackableDB @@ -1216,5 +1220,3 @@ #endif } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_DB_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/env.h rocksdb-5.17.2/include/rocksdb/env.h --- rocksdb-5.15.10/include/rocksdb/env.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/env.h 2018-11-12 19:57:32.000000000 +0000 @@ -14,8 +14,7 @@ // All Env implementations are safe for concurrent access from // multiple threads without any external synchronization. -#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_ -#define STORAGE_ROCKSDB_INCLUDE_ENV_H_ +#pragma once #include #include @@ -478,6 +477,15 @@ // Returns the ID of the current thread. virtual uint64_t GetThreadID() const; +// This seems to clash with a macro on Windows, so #undef it here +#undef GetFreeSpace + + // Get the amount of free disk space + virtual Status GetFreeSpace(const std::string& /*path*/, + uint64_t* /*diskfree*/) { + return Status::NotSupported(); + } + protected: // The pointer to an internal structure that will update the // status of each thread. @@ -1267,5 +1275,3 @@ Env* NewTimedEnv(Env* base_env); } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_ENV_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/filter_policy.h rocksdb-5.17.2/include/rocksdb/filter_policy.h --- rocksdb-5.15.10/include/rocksdb/filter_policy.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/filter_policy.h 2018-11-12 19:57:32.000000000 +0000 @@ -17,8 +17,7 @@ // Most people will want to use the builtin bloom filter support (see // NewBloomFilterPolicy() below). 
-#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ -#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ +#pragma once #include #include @@ -149,5 +148,3 @@ extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, bool use_block_based_builder = true); } - -#endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/iterator.h rocksdb-5.17.2/include/rocksdb/iterator.h --- rocksdb-5.15.10/include/rocksdb/iterator.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/iterator.h 2018-11-12 19:57:32.000000000 +0000 @@ -16,8 +16,7 @@ // non-const method, all threads accessing the same Iterator must use // external synchronization. -#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ -#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ +#pragma once #include #include "rocksdb/cleanable.h" @@ -119,5 +118,3 @@ extern Iterator* NewErrorIterator(const Status& status); } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/ldb_tool.h rocksdb-5.17.2/include/rocksdb/ldb_tool.h --- rocksdb-5.15.10/include/rocksdb/ldb_tool.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/ldb_tool.h 2018-11-12 19:57:32.000000000 +0000 @@ -2,8 +2,8 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #pragma once +#ifndef ROCKSDB_LITE #include #include #include "rocksdb/db.h" diff -Nru rocksdb-5.15.10/include/rocksdb/listener.h rocksdb-5.17.2/include/rocksdb/listener.h --- rocksdb-5.15.10/include/rocksdb/listener.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/listener.h 2018-11-12 19:57:32.000000000 +0000 @@ -27,6 +27,7 @@ kFlush, kCompaction, kRecovery, + kMisc, }; struct TableFileCreationBriefInfo { @@ -103,6 +104,7 @@ kDeleteFiles = 0x08, kAutoCompaction = 0x09, kManualFlush = 0x0a, + kErrorRecovery = 0xb, }; enum class BackgroundErrorReason { @@ -393,6 +395,21 @@ // returns. Otherwise, RocksDB may be blocked. virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {} + // A callback function for RocksDB which will be called just before + // starting the automatic recovery process for recoverable background + // errors, such as NoSpace(). The callback can suppress the automatic + // recovery by setting *auto_recovery to false. The database will then + // have to be transitioned out of read-only mode by calling DB::Resume() + virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */, + Status /* bg_error */, + bool* /* auto_recovery */) {} + + // A callback function for RocksDB which will be called once the database + // is recovered from read-only mode after an error. When this is called, it + // means normal writes to the database can be issued and the user can + // initiate any further recovery actions needed + virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {} + virtual ~EventListener() {} }; diff -Nru rocksdb-5.15.10/include/rocksdb/memtablerep.h rocksdb-5.17.2/include/rocksdb/memtablerep.h --- rocksdb-5.15.10/include/rocksdb/memtablerep.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/memtablerep.h 2018-11-12 19:57:32.000000000 +0000 @@ -144,6 +144,14 @@ // or any writes done directly to entries accessed through the iterator.) virtual void MarkReadOnly() { } + // Notify this table rep that it has been flushed to stable storage. 
+ // By default, does nothing. + // + // Invariant: MarkReadOnly() is called, before MarkFlushed(). + // Note that this method if overridden, should not run for an extended period + // of time. Otherwise, RocksDB may be blocked. + virtual void MarkFlushed() { } + // Look up key from the mem table, since the first key in the mem table whose // user_key matches the one given k, call the function callback_func(), with // callback_args directly forwarded as the first parameter, and the mem table diff -Nru rocksdb-5.15.10/include/rocksdb/merge_operator.h rocksdb-5.17.2/include/rocksdb/merge_operator.h --- rocksdb-5.15.10/include/rocksdb/merge_operator.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/merge_operator.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ -#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ +#pragma once #include #include @@ -241,5 +240,3 @@ }; } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/metadata.h rocksdb-5.17.2/include/rocksdb/metadata.h --- rocksdb-5.15.10/include/rocksdb/metadata.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/metadata.h 2018-11-12 19:57:32.000000000 +0000 @@ -65,7 +65,7 @@ num_reads_sampled(0), being_compacted(false) {} SstFileMetaData(const std::string& _file_name, const std::string& _path, - uint64_t _size, SequenceNumber _smallest_seqno, + size_t _size, SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, const std::string& _smallestkey, const std::string& _largestkey, uint64_t _num_reads_sampled, @@ -81,7 +81,7 @@ being_compacted(_being_compacted) {} // File size in bytes. - uint64_t size; + size_t size; // The name of the file. std::string name; // The full path where the file locates. diff -Nru rocksdb-5.15.10/include/rocksdb/options.h rocksdb-5.17.2/include/rocksdb/options.h --- rocksdb-5.15.10/include/rocksdb/options.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/options.h 2018-11-12 19:57:32.000000000 +0000 @@ -6,8 +6,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ -#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ +#pragma once #include #include @@ -430,6 +429,8 @@ // (i.e. the ones that are causing all the space amplification). If set to 0 // (default), we will dynamically choose the WAL size limit to be // [sum of all write_buffer_size * max_write_buffer_number] * 4 + // This option takes effect only when there are more than one column family as + // otherwise the wal size is dictated by the write_buffer_size. // Default: 0 uint64_t max_total_wal_size = 0; @@ -1181,8 +1182,13 @@ // If true, the flush will wait until the flush is done. // Default: true bool wait; - - FlushOptions() : wait(true) {} + // If true, the flush would proceed immediately even it means writes will + // stall for the duration of the flush; if false the operation will wait + // until it's possible to do flush w/o causing stall or until required flush + // is performed by someone else (foreground call or background thread). 
+ // Default: false + bool allow_write_stall; + FlushOptions() : wait(true), allow_write_stall(false) {} }; // Create a Logger from provided DBOptions @@ -1194,6 +1200,9 @@ struct CompactionOptions { // Compaction output compression type // Default: snappy + // If set to `kDisableCompressionOption`, RocksDB will choose compression type + // according to the `ColumnFamilyOptions`, taking into account the output + // level if `compression_per_level` is specified. CompressionType compression; // Compaction will create files of size `output_file_size_limit`. // Default: MAX, which means that compaction will create a single file @@ -1265,8 +1274,20 @@ // with allow_ingest_behind=true since the dawn of time. // All files will be ingested at the bottommost level with seqno=0. bool ingest_behind = false; + // Set to true if you would like to write global_seqno to a given offset in + // the external SST file for backward compatibility. Older versions of + // RocksDB writes a global_seqno to a given offset within ingested SST files, + // and new versions of RocksDB do not. If you ingest an external SST using + // new version of RocksDB and would like to be able to downgrade to an + // older version of RocksDB, you should set 'write_global_seqno' to true. If + // your service is just starting to use the new RocksDB, we recommend that + // you set this option to false, which brings two benefits: + // 1. No extra random write for global_seqno during ingestion. + // 2. Without writing external SST file, it's possible to do checksum. + // We have a plan to set this option to false by default in the future. + bool write_global_seqno = true; }; -} // namespace rocksdb +struct TraceOptions {}; -#endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ +} // namespace rocksdb diff -Nru rocksdb-5.15.10/include/rocksdb/perf_context.h rocksdb-5.17.2/include/rocksdb/perf_context.h --- rocksdb-5.15.10/include/rocksdb/perf_context.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/perf_context.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H -#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H +#pragma once #include #include @@ -176,5 +175,3 @@ PerfContext* get_perf_context(); } - -#endif diff -Nru rocksdb-5.15.10/include/rocksdb/perf_level.h rocksdb-5.17.2/include/rocksdb/perf_level.h --- rocksdb-5.15.10/include/rocksdb/perf_level.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/perf_level.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef INCLUDE_ROCKSDB_PERF_LEVEL_H_ -#define INCLUDE_ROCKSDB_PERF_LEVEL_H_ +#pragma once #include #include @@ -29,5 +28,3 @@ PerfLevel GetPerfLevel(); } // namespace rocksdb - -#endif // INCLUDE_ROCKSDB_PERF_LEVEL_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/slice.h rocksdb-5.17.2/include/rocksdb/slice.h --- rocksdb-5.15.10/include/rocksdb/slice.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/slice.h 2018-11-12 19:57:32.000000000 +0000 @@ -16,8 +16,7 @@ // non-const method, all threads accessing the same Slice must use // external synchronization. 
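The options.h hunks above add FlushOptions::allow_write_stall and IngestExternalFileOptions::write_global_seqno. A minimal sketch of both flags, assuming an already-open rocksdb::DB* db, a ColumnFamilyHandle* cf, and an externally built SST file whose path is illustrative:

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    rocksdb::Status FlushAndIngest(rocksdb::DB* db,
                                   rocksdb::ColumnFamilyHandle* cf) {
      rocksdb::FlushOptions fo;
      fo.wait = true;
      fo.allow_write_stall = false;  // wait until flushing cannot stall writes
      rocksdb::Status s = db->Flush(fo, cf);
      if (!s.ok()) {
        return s;
      }
      rocksdb::IngestExternalFileOptions ifo;
      ifo.write_global_seqno = false;  // skip the legacy in-file seqno rewrite
      return db->IngestExternalFile(cf, {"/tmp/bulk_load.sst"}, ifo);
    }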
-#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_ -#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_ +#pragma once #include #include @@ -25,6 +24,10 @@ #include #include +#ifdef __cpp_lib_string_view +#include +#endif + #include "rocksdb/cleanable.h" namespace rocksdb { @@ -41,6 +44,12 @@ /* implicit */ Slice(const std::string& s) : data_(s.data()), size_(s.size()) { } +#ifdef __cpp_lib_string_view + // Create a slice that refers to the same contents as "sv" + /* implicit */ + Slice(std::string_view sv) : data_(sv.data()), size_(sv.size()) {} +#endif + // Create a slice that refers to s[0,strlen(s)-1] /* implicit */ Slice(const char* s) : data_(s) { @@ -86,6 +95,13 @@ // when hex is true, returns a string of twice the length hex encoded (0-9A-F) std::string ToString(bool hex = false) const; +#ifdef __cpp_lib_string_view + // Return a string_view that references the same data as this slice. + std::string_view ToStringView() const { + return std::string_view(data_, size_); + } +#endif + // Decodes the current slice interpreted as an hexadecimal string into result, // if successful returns true, if this isn't a valid hex string // (e.g not coming from Slice::ToString(true)) DecodeHex returns false. @@ -239,6 +255,4 @@ return off; } -} // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_H_ +} // namespace rocksdb \ No newline at end of file diff -Nru rocksdb-5.15.10/include/rocksdb/slice_transform.h rocksdb-5.17.2/include/rocksdb/slice_transform.h --- rocksdb-5.15.10/include/rocksdb/slice_transform.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/slice_transform.h 2018-11-12 19:57:32.000000000 +0000 @@ -12,8 +12,7 @@ // define InDomain and InRange to determine which slices are in either // of these sets respectively. -#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_ -#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_ +#pragma once #include @@ -100,5 +99,3 @@ extern const SliceTransform* NewNoopTransform(); } - -#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/sst_file_manager.h rocksdb-5.17.2/include/rocksdb/sst_file_manager.h --- rocksdb-5.15.10/include/rocksdb/sst_file_manager.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/sst_file_manager.h 2018-11-12 19:57:32.000000000 +0000 @@ -75,6 +75,10 @@ // Update trash/DB size ratio where new files will be deleted immediately // thread-safe virtual void SetMaxTrashDBRatio(double ratio) = 0; + + // Return the total size of trash files + // thread-safe + virtual uint64_t GetTotalTrashSize() = 0; }; // Create a new SstFileManager that can be shared among multiple RocksDB diff -Nru rocksdb-5.15.10/include/rocksdb/sst_file_writer.h rocksdb-5.17.2/include/rocksdb/sst_file_writer.h --- rocksdb-5.15.10/include/rocksdb/sst_file_writer.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/sst_file_writer.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,10 +3,10 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
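The slice.h hunk above adds std::string_view interoperability guarded by __cpp_lib_string_view. A minimal round-trip sketch, meaningful only on a C++17 standard library:

    #include <string_view>
    #include "rocksdb/slice.h"

    void StringViewRoundTrip() {
      std::string_view sv = "hello";
      rocksdb::Slice slice(sv);            // implicit Slice over the same bytes
      std::string_view back = slice.ToStringView();  // zero-copy view back out
      (void)back;
    }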
-#ifndef ROCKSDB_LITE - #pragma once +#ifndef ROCKSDB_LITE + #include #include diff -Nru rocksdb-5.15.10/include/rocksdb/statistics.h rocksdb-5.17.2/include/rocksdb/statistics.h --- rocksdb-5.15.10/include/rocksdb/statistics.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/statistics.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ -#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ +#pragma once #include #include @@ -673,5 +672,3 @@ std::shared_ptr CreateDBStatistics(); } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/status.h rocksdb-5.17.2/include/rocksdb/status.h --- rocksdb-5.15.10/include/rocksdb/status.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/status.h 2018-11-12 19:57:32.000000000 +0000 @@ -14,8 +14,7 @@ // non-const method, all threads accessing the same Status must use // external synchronization. -#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_ -#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_ +#pragma once #include #include "rocksdb/slice.h" @@ -282,8 +281,6 @@ Severity sev_; const char* state_; - static const char* msgs[static_cast(kMaxSubCode)]; - explicit Status(Code _code, SubCode _subcode = kNone) : code_(_code), subcode_(_subcode), sev_(kNoError), state_(nullptr) {} @@ -350,5 +347,3 @@ } } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_STATUS_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/table.h rocksdb-5.17.2/include/rocksdb/table.h --- rocksdb-5.15.10/include/rocksdb/table.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/table.h 2018-11-12 19:57:32.000000000 +0000 @@ -16,6 +16,7 @@ // https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples #pragma once + #include #include #include @@ -100,6 +101,18 @@ IndexType index_type = kBinarySearch; + // The index type that will be used for the data block. + enum DataBlockIndexType : char { + kDataBlockBinarySearch = 0, // traditional block type + kDataBlockBinaryAndHash = 1, // additional hash index + }; + + DataBlockIndexType data_block_index_type = kDataBlockBinarySearch; + + // #entries/#buckets. It is valid only when data_block_hash_index_type is + // kDataBlockBinaryAndHash. + double data_block_hash_table_util_ratio = 0.75; + // This option is now deprecated. No matter what value it is set to, // it will behave as if hash_index_allow_collision=true. bool hash_index_allow_collision = true; @@ -226,6 +239,12 @@ // version 5.15, you should probably use this. // This option only affects newly written tables. When reading existing // tables, the information about version is read from the footer. + // 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we + // encode the values in index blocks. If you don't plan to run RocksDB before + // version 5.16 and you are using index_block_restart_interval > 1, you should + // probably use this as it would reduce the index size. + // This option only affects newly written tables. When reading existing + // tables, the information about version is read from the footer. uint32_t format_version = 2; // Store index blocks on disk in compressed format. 
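The table.h hunk above adds the data-block hash index options and format_version 4. A minimal configuration sketch, assuming the block-based table factory; the values shown simply restate the documented defaults and compatibility note, not tuning advice:

    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    rocksdb::Options MakeOptions() {
      rocksdb::BlockBasedTableOptions bbto;
      bbto.data_block_index_type =
          rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;
      bbto.data_block_hash_table_util_ratio = 0.75;
      bbto.format_version = 4;  // readable only by RocksDB 5.16 and newer
      rocksdb::Options options;
      options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
      return options;
    }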
Changing this option to diff -Nru rocksdb-5.15.10/include/rocksdb/table_properties.h rocksdb-5.17.2/include/rocksdb/table_properties.h --- rocksdb-5.15.10/include/rocksdb/table_properties.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/table_properties.h 2018-11-12 19:57:32.000000000 +0000 @@ -34,6 +34,7 @@ static const std::string kIndexPartitions; static const std::string kTopLevelIndexSize; static const std::string kIndexKeyIsUserKey; + static const std::string kIndexValueIsDeltaEncoded; static const std::string kFilterSize; static const std::string kRawKeySize; static const std::string kRawValueSize; @@ -139,6 +140,8 @@ // Whether the index key is user key. Otherwise it includes 8 byte of sequence // number added by internal key format. uint64_t index_key_is_user_key = 0; + // Whether delta encoding is used to encode the index values. + uint64_t index_value_is_delta_encoded = 0; // the size of filter block. uint64_t filter_size = 0; // total raw key size diff -Nru rocksdb-5.15.10/include/rocksdb/trace_reader_writer.h rocksdb-5.17.2/include/rocksdb/trace_reader_writer.h --- rocksdb-5.15.10/include/rocksdb/trace_reader_writer.h 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/trace_reader_writer.h 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,47 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/env.h" + +namespace rocksdb { + +// Allow custom implementations of TraceWriter and TraceReader. +// By default, RocksDB provides a way to capture the traces to a file using the +// factory NewFileTraceWriter(). But users could also choose to export traces to +// any other system by providing custom implementations of TraceWriter and +// TraceReader. + +// TraceWriter allows exporting RocksDB traces to any system, one operation at +// a time. +class TraceWriter { + public: + TraceWriter() {} + virtual ~TraceWriter() {} + + virtual Status Write(const Slice& data) = 0; + virtual Status Close() = 0; +}; + +// TraceReader allows reading RocksDB traces from any system, one operation at +// a time. A RocksDB Replayer could depend on this to replay opertions. +class TraceReader { + public: + TraceReader() {} + virtual ~TraceReader() {} + + virtual Status Read(std::string* data) = 0; + virtual Status Close() = 0; +}; + +// Factory methods to read/write traces from/to a file. +Status NewFileTraceWriter(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr* trace_writer); +Status NewFileTraceReader(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr* trace_reader); +} // namespace rocksdb diff -Nru rocksdb-5.15.10/include/rocksdb/transaction_log.h rocksdb-5.17.2/include/rocksdb/transaction_log.h --- rocksdb-5.15.10/include/rocksdb/transaction_log.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/transaction_log.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
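The new trace_reader_writer.h above defines the TraceWriter/TraceReader extension points. A sketch of a trivial in-memory TraceWriter that could be handed to DB::StartTrace() instead of the file-based one produced by NewFileTraceWriter(); the class is hypothetical:

    #include <string>
    #include <vector>
    #include "rocksdb/trace_reader_writer.h"

    class InMemoryTraceWriter : public rocksdb::TraceWriter {
     public:
      rocksdb::Status Write(const rocksdb::Slice& data) override {
        records_.emplace_back(data.ToString());  // keep each trace record
        return rocksdb::Status::OK();
      }
      rocksdb::Status Close() override { return rocksdb::Status::OK(); }

     private:
      std::vector<std::string> records_;
    };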
-#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ -#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ +#pragma once #include "rocksdb/status.h" #include "rocksdb/types.h" @@ -121,5 +120,3 @@ }; }; } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/types.h rocksdb-5.17.2/include/rocksdb/types.h --- rocksdb-5.15.10/include/rocksdb/types.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/types.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_ -#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_ +#pragma once #include #include "rocksdb/slice.h" @@ -23,6 +22,7 @@ kEntrySingleDelete, kEntryMerge, kEntryRangeDeletion, + kEntryBlobIndex, kEntryOther, }; @@ -52,5 +52,3 @@ bool ParseFullKey(const Slice& internal_key, FullKey* result); } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_TYPES_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/universal_compaction.h rocksdb-5.17.2/include/rocksdb/universal_compaction.h --- rocksdb-5.15.10/include/rocksdb/universal_compaction.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/universal_compaction.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H -#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H +#pragma once #include #include @@ -86,5 +85,3 @@ }; } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H diff -Nru rocksdb-5.15.10/include/rocksdb/utilities/debug.h rocksdb-5.17.2/include/rocksdb/utilities/debug.h --- rocksdb-5.15.10/include/rocksdb/utilities/debug.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/utilities/debug.h 2018-11-12 19:57:32.000000000 +0000 @@ -31,9 +31,13 @@ }; // Returns listing of all versions of keys in the provided user key range. -// The range is inclusive-inclusive, i.e., [`begin_key`, `end_key`]. +// The range is inclusive-inclusive, i.e., [`begin_key`, `end_key`], or +// `max_num_ikeys` has been reached. Since all those keys returned will be +// copied to memory, if the range covers too many keys, the memory usage +// may be huge. `max_num_ikeys` can be used to cap the memory usage. // The result is inserted into the provided vector, `key_versions`. Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, + size_t max_num_ikeys, std::vector* key_versions); } // namespace rocksdb diff -Nru rocksdb-5.15.10/include/rocksdb/utilities/env_librados.h rocksdb-5.17.2/include/rocksdb/utilities/env_librados.h --- rocksdb-5.15.10/include/rocksdb/utilities/env_librados.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/utilities/env_librados.h 2018-11-12 19:57:32.000000000 +0000 @@ -2,8 +2,8 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
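The debug.h hunk above adds a max_num_ikeys cap to GetAllKeyVersions(). A minimal sketch, assuming a non-LITE build and an already-open rocksdb::DB* db; the key range and the cap of 1000 internal keys are illustrative:

    #include <vector>
    #include "rocksdb/utilities/debug.h"

    rocksdb::Status DumpVersions(rocksdb::DB* db) {
      std::vector<rocksdb::KeyVersion> key_versions;
      return rocksdb::GetAllKeyVersions(db, rocksdb::Slice("a"),
                                        rocksdb::Slice("z"),
                                        1000 /* max_num_ikeys */,
                                        &key_versions);
    }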
-#ifndef ROCKSDB_UTILITIES_ENV_LIBRADOS_H -#define ROCKSDB_UTILITIES_ENV_LIBRADOS_H + +#pragma once #include #include @@ -173,4 +173,3 @@ friend class LibradosWritableFile; }; } -#endif diff -Nru rocksdb-5.15.10/include/rocksdb/utilities/table_properties_collectors.h rocksdb-5.17.2/include/rocksdb/utilities/table_properties_collectors.h --- rocksdb-5.15.10/include/rocksdb/utilities/table_properties_collectors.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/utilities/table_properties_collectors.h 2018-11-12 19:57:32.000000000 +0000 @@ -5,12 +5,60 @@ #pragma once #ifndef ROCKSDB_LITE +#include #include #include "rocksdb/table_properties.h" namespace rocksdb { +// A factory of a table property collector that marks a SST +// file as need-compaction when it observe at least "D" deletion +// entries in any "N" consecutive entires. +class CompactOnDeletionCollectorFactory + : public TablePropertiesCollectorFactory { + public: + virtual ~CompactOnDeletionCollectorFactory() {} + + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override; + + // Change the value of sliding_window_size "N" + // Setting it to 0 disables the delete triggered compaction + void SetWindowSize(size_t sliding_window_size) { + sliding_window_size_.store(sliding_window_size); + } + + // Change the value of deletion_trigger "D" + void SetDeletionTrigger(size_t deletion_trigger) { + deletion_trigger_.store(deletion_trigger); + } + + virtual const char* Name() const override { + return "CompactOnDeletionCollector"; + } + + private: + friend std::shared_ptr + NewCompactOnDeletionCollectorFactory( + size_t sliding_window_size, + size_t deletion_trigger); + // A factory of a table property collector that marks a SST + // file as need-compaction when it observe at least "D" deletion + // entries in any "N" consecutive entires. + // + // @param sliding_window_size "N" + // @param deletion_trigger "D" + CompactOnDeletionCollectorFactory( + size_t sliding_window_size, + size_t deletion_trigger) : + sliding_window_size_(sliding_window_size), + deletion_trigger_(deletion_trigger) {} + + std::atomic sliding_window_size_; + std::atomic deletion_trigger_; +}; + // Creates a factory of a table property collector that marks a SST // file as need-compaction when it observe at least "D" deletion // entries in any "N" consecutive entires. @@ -20,7 +68,7 @@ // than the specified size. // @param deletion_trigger "D". Note that even when "N" is changed, // the specified number for "D" will not be changed. -extern std::shared_ptr +extern std::shared_ptr NewCompactOnDeletionCollectorFactory( size_t sliding_window_size, size_t deletion_trigger); diff -Nru rocksdb-5.15.10/include/rocksdb/utilities/transaction_db.h rocksdb-5.17.2/include/rocksdb/utilities/transaction_db.h --- rocksdb-5.15.10/include/rocksdb/utilities/transaction_db.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/utilities/transaction_db.h 2018-11-12 19:57:32.000000000 +0000 @@ -137,6 +137,15 @@ // The maximum number of bytes used for the write batch. 0 means no limit. size_t max_write_batch_size = 0; + + // Skip Concurrency Control. This could be as an optimization if the + // application knows that the transaction would not have any conflict with + // concurrent transactions. 
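The table_properties_collectors.h hunk above exposes CompactOnDeletionCollectorFactory so its thresholds can be changed after creation. A minimal sketch, assuming a non-LITE build; the window size and trigger values are illustrative:

    #include "rocksdb/options.h"
    #include "rocksdb/utilities/table_properties_collectors.h"

    rocksdb::ColumnFamilyOptions MakeCfOptions() {
      auto factory = rocksdb::NewCompactOnDeletionCollectorFactory(
          /*sliding_window_size=*/128, /*deletion_trigger=*/50);
      rocksdb::ColumnFamilyOptions cf_opts;
      cf_opts.table_properties_collector_factories.push_back(factory);
      // With the concrete return type the thresholds can be tuned later on:
      factory->SetWindowSize(256);
      factory->SetDeletionTrigger(100);
      return cf_opts;
    }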
It could also be used during recovery if (i) + // application guarantees no conflict between prepared transactions in the WAL + // (ii) application guarantees that recovered transactions will be rolled + // back/commit before new transactions start. + // Default: false + bool skip_concurrency_control = false; }; // The per-write optimizations that do not involve transactions. TransactionDB diff -Nru rocksdb-5.15.10/include/rocksdb/utilities/transaction.h rocksdb-5.17.2/include/rocksdb/utilities/transaction.h --- rocksdb-5.15.10/include/rocksdb/utilities/transaction.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/utilities/transaction.h 2018-11-12 19:57:32.000000000 +0000 @@ -152,6 +152,12 @@ // If there is no previous call to SetSavePoint(), returns Status::NotFound() virtual Status RollbackToSavePoint() = 0; + // Pop the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + virtual Status PopSavePoint() = 0; + // This function is similar to DB::Get() except it will also read pending // changes in this transaction. Currently, this function will return // Status::MergeInProgress if the most recent write to the queried key in diff -Nru rocksdb-5.15.10/include/rocksdb/utilities/write_batch_with_index.h rocksdb-5.17.2/include/rocksdb/utilities/write_batch_with_index.h --- rocksdb-5.15.10/include/rocksdb/utilities/write_batch_with_index.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/utilities/write_batch_with_index.h 2018-11-12 19:57:32.000000000 +0000 @@ -231,6 +231,7 @@ Status PopSavePoint() override; void SetMaxBytes(size_t max_bytes) override; + size_t GetDataSize() const; private: friend class PessimisticTransactionDB; diff -Nru rocksdb-5.15.10/include/rocksdb/version.h rocksdb-5.17.2/include/rocksdb/version.h --- rocksdb-5.15.10/include/rocksdb/version.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/version.h 2018-11-12 19:57:32.000000000 +0000 @@ -5,8 +5,8 @@ #pragma once #define ROCKSDB_MAJOR 5 -#define ROCKSDB_MINOR 15 -#define ROCKSDB_PATCH 10 +#define ROCKSDB_MINOR 17 +#define ROCKSDB_PATCH 2 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these diff -Nru rocksdb-5.15.10/include/rocksdb/wal_filter.h rocksdb-5.17.2/include/rocksdb/wal_filter.h --- rocksdb-5.15.10/include/rocksdb/wal_filter.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/wal_filter.h 2018-11-12 19:57:32.000000000 +0000 @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once + #include #include diff -Nru rocksdb-5.15.10/include/rocksdb/write_batch.h rocksdb-5.17.2/include/rocksdb/write_batch.h --- rocksdb-5.15.10/include/rocksdb/write_batch.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/write_batch.h 2018-11-12 19:57:32.000000000 +0000 @@ -22,8 +22,7 @@ // non-const method, all threads accessing the same WriteBatch must use // external synchronization. 
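The transaction.h hunk above adds Transaction::PopSavePoint(). A minimal sketch, assuming a transaction obtained from a TransactionDB in a non-LITE build:

    #include "rocksdb/utilities/transaction.h"

    rocksdb::Status UseSavePoint(rocksdb::Transaction* txn) {
      txn->SetSavePoint();
      rocksdb::Status s = txn->Put("key", "value");
      if (!s.ok()) {
        // Undo everything written since the savepoint and discard it.
        return txn->RollbackToSavePoint();
      }
      // Keep the writes and simply drop the most recent savepoint.
      return txn->PopSavePoint();
    }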
-#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ -#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ +#pragma once #include #include @@ -367,5 +366,3 @@ }; } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ diff -Nru rocksdb-5.15.10/java/CMakeLists.txt rocksdb-5.17.2/java/CMakeLists.txt --- rocksdb-5.15.10/java/CMakeLists.txt 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/CMakeLists.txt 2018-11-12 19:57:32.000000000 +0000 @@ -13,6 +13,7 @@ rocksjni/compaction_filter_factory_jnicallback.cc rocksjni/compaction_options_fifo.cc rocksjni/compaction_options_universal.cc + rocksjni/compact_range_options.cc rocksjni/comparator.cc rocksjni/comparatorjnicallback.cc rocksjni/compression_options.cc @@ -79,6 +80,7 @@ org.rocksdb.ColumnFamilyOptions org.rocksdb.CompactionOptionsFIFO org.rocksdb.CompactionOptionsUniversal + org.rocksdb.CompactRangeOptions org.rocksdb.Comparator org.rocksdb.ComparatorOptions org.rocksdb.CompressionOptions @@ -192,6 +194,7 @@ src/main/java/org/rocksdb/CompactionOptionsFIFO.java src/main/java/org/rocksdb/CompactionOptionsUniversal.java src/main/java/org/rocksdb/CompactionPriority.java + src/main/java/org/rocksdb/CompactRangeOptions.java src/main/java/org/rocksdb/CompactionStopStyle.java src/main/java/org/rocksdb/CompactionStyle.java src/main/java/org/rocksdb/Comparator.java diff -Nru rocksdb-5.15.10/java/crossbuild/build-linux-centos.sh rocksdb-5.17.2/java/crossbuild/build-linux-centos.sh --- rocksdb-5.15.10/java/crossbuild/build-linux-centos.sh 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/crossbuild/build-linux-centos.sh 2018-11-12 19:57:32.000000000 +0000 @@ -26,6 +26,6 @@ # build rocksdb cd /rocksdb scl enable devtoolset-2 'make jclean clean' -scl enable devtoolset-2 'PORTABLE=1 make rocksdbjavastatic' +scl enable devtoolset-2 'PORTABLE=1 make -j8 rocksdbjavastatic' cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build diff -Nru rocksdb-5.15.10/java/crossbuild/docker-build-linux-centos.sh rocksdb-5.17.2/java/crossbuild/docker-build-linux-centos.sh --- rocksdb-5.15.10/java/crossbuild/docker-build-linux-centos.sh 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/crossbuild/docker-build-linux-centos.sh 2018-11-12 19:57:32.000000000 +0000 @@ -9,10 +9,10 @@ # Use scl devtoolset if available (i.e. 
CentOS <7) if hash scl 2>/dev/null; then scl enable devtoolset-2 'make jclean clean' - scl enable devtoolset-2 'PORTABLE=1 make rocksdbjavastatic' + scl enable devtoolset-2 'PORTABLE=1 make -j8 rocksdbjavastatic' else make jclean clean - PORTABLE=1 make rocksdbjavastatic + PORTABLE=1 make -j8 rocksdbjavastatic fi cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-host/java/target diff -Nru rocksdb-5.15.10/java/Makefile rocksdb-5.17.2/java/Makefile --- rocksdb-5.15.10/java/Makefile 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/Makefile 2018-11-12 19:57:32.000000000 +0000 @@ -14,6 +14,7 @@ org.rocksdb.ColumnFamilyOptions\ org.rocksdb.CompactionOptionsFIFO\ org.rocksdb.CompactionOptionsUniversal\ + org.rocksdb.CompactRangeOptions\ org.rocksdb.Comparator\ org.rocksdb.ComparatorOptions\ org.rocksdb.CompressionOptions\ diff -Nru rocksdb-5.15.10/java/rocksjni/compact_range_options.cc rocksdb-5.17.2/java/rocksjni/compact_range_options.cc --- rocksdb-5.15.10/java/rocksjni/compact_range_options.cc 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/java/rocksjni/compact_range_options.cc 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::CompactRangeOptions. + +#include + +#include "include/org_rocksdb_CompactRangeOptions.h" +#include "rocksdb/options.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: newCompactRangeOptions + * Signature: ()J + */ +jlong Java_org_rocksdb_CompactRangeOptions_newCompactRangeOptions( + JNIEnv* /*env*/, jclass /*jclazz*/) { + auto* options = new rocksdb::CompactRangeOptions(); + return reinterpret_cast(options); +} + + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: exclusiveManualCompaction + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_CompactRangeOptions_exclusiveManualCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->exclusive_manual_compaction); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setExclusiveManualCompaction + * Signature: (JZ)V + */ +void Java_org_rocksdb_CompactRangeOptions_setExclusiveManualCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean exclusive_manual_compaction) { + auto* options = + reinterpret_cast(jhandle); + options->exclusive_manual_compaction = static_cast(exclusive_manual_compaction); +} + + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: bottommostLevelCompaction + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactRangeOptions_bottommostLevelCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return rocksdb::BottommostLevelCompactionJni::toJavaBottommostLevelCompaction( + options->bottommost_level_compaction); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setBottommostLevelCompaction + * Signature: (JI)V + */ +void Java_org_rocksdb_CompactRangeOptions_setBottommostLevelCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jint bottommost_level_compaction) { + auto* options = reinterpret_cast(jhandle); + options->bottommost_level_compaction = + 
rocksdb::BottommostLevelCompactionJni::toCppBottommostLevelCompaction(bottommost_level_compaction); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: changeLevel + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_CompactRangeOptions_changeLevel + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->change_level); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setChangeLevel + * Signature: (JZ)V + */ +void Java_org_rocksdb_CompactRangeOptions_setChangeLevel + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean change_level) { + auto* options = reinterpret_cast(jhandle); + options->change_level = static_cast(change_level); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: targetLevel + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactRangeOptions_targetLevel + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->target_level); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setTargetLevel + * Signature: (JI)V + */ +void Java_org_rocksdb_CompactRangeOptions_setTargetLevel + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint target_level) { + auto* options = reinterpret_cast(jhandle); + options->target_level = static_cast(target_level); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: targetPathId + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactRangeOptions_targetPathId + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->target_path_id); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setTargetPathId + * Signature: (JI)V + */ +void Java_org_rocksdb_CompactRangeOptions_setTargetPathId + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint target_path_id) { + auto* options = reinterpret_cast(jhandle); + options->target_path_id = static_cast(target_path_id); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: allowWriteStall + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_CompactRangeOptions_allowWriteStall + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->allow_write_stall); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setAllowWriteStall + * Signature: (JZ)V + */ +void Java_org_rocksdb_CompactRangeOptions_setAllowWriteStall + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean allow_write_stall) { + auto* options = reinterpret_cast(jhandle); + options->allow_write_stall = static_cast(allow_write_stall); +} + + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: maxSubcompactions + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactRangeOptions_maxSubcompactions + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->max_subcompactions); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setMaxSubcompactions + * Signature: (JI)V + */ +void Java_org_rocksdb_CompactRangeOptions_setMaxSubcompactions + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint max_subcompactions) { + auto* options = reinterpret_cast(jhandle); + options->max_subcompactions = static_cast(max_subcompactions); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: disposeInternal + * Signature: (J)V + */ +void 
Java_org_rocksdb_CompactRangeOptions_disposeInternal( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + delete options; +} diff -Nru rocksdb-5.15.10/java/rocksjni/portal.h rocksdb-5.17.2/java/rocksjni/portal.h --- rocksdb-5.15.10/java/rocksjni/portal.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/rocksjni/portal.h 2018-11-12 19:57:32.000000000 +0000 @@ -2896,6 +2896,43 @@ } }; +// The portal class for org.rocksdb.BottommostLevelCompaction +class BottommostLevelCompactionJni { + public: + // Returns the equivalent org.rocksdb.BottommostLevelCompaction for the provided + // C++ rocksdb::BottommostLevelCompaction enum + static jint toJavaBottommostLevelCompaction( + const rocksdb::BottommostLevelCompaction& bottommost_level_compaction) { + switch(bottommost_level_compaction) { + case rocksdb::BottommostLevelCompaction::kSkip: + return 0x0; + case rocksdb::BottommostLevelCompaction::kIfHaveCompactionFilter: + return 0x1; + case rocksdb::BottommostLevelCompaction::kForce: + return 0x2; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::BottommostLevelCompaction enum for the + // provided Java org.rocksdb.BottommostLevelCompaction + static rocksdb::BottommostLevelCompaction toCppBottommostLevelCompaction( + jint bottommost_level_compaction) { + switch(bottommost_level_compaction) { + case 0x0: + return rocksdb::BottommostLevelCompaction::kSkip; + case 0x1: + return rocksdb::BottommostLevelCompaction::kIfHaveCompactionFilter; + case 0x2: + return rocksdb::BottommostLevelCompaction::kForce; + default: + // undefined/default + return rocksdb::BottommostLevelCompaction::kIfHaveCompactionFilter; + } + } +}; + // The portal class for org.rocksdb.CompactionStopStyle class CompactionStopStyleJni { public: diff -Nru rocksdb-5.15.10/java/rocksjni/rocksjni.cc rocksdb-5.17.2/java/rocksjni/rocksjni.cc --- rocksdb-5.15.10/java/rocksjni/rocksjni.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/rocksjni/rocksjni.cc 2018-11-12 19:57:32.000000000 +0000 @@ -1844,6 +1844,32 @@ return 0; } +/* + * Class: org_rocksdb_RocksDB + * Method: getAggregatedLongProperty + * Signature: (JLjava/lang/String;I)J + */ +jlong Java_org_rocksdb_RocksDB_getAggregatedLongProperty( + JNIEnv* env, jobject, jlong db_handle, jstring jproperty, jint jproperty_len) { + const char* property = env->GetStringUTFChars(jproperty, nullptr); + if (property == nullptr) { + return 0; + } + rocksdb::Slice property_slice(property, jproperty_len); + auto* db = reinterpret_cast(db_handle); + uint64_t property_value = 0; + bool retCode = db->GetAggregatedIntProperty(property_slice, &property_value); + env->ReleaseStringUTFChars(jproperty, property); + + if (retCode) { + return property_value; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + return 0; +} + + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Flush @@ -1955,8 +1981,7 @@ rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jbegin, jint jbegin_len, jbyteArray jend, jint jend_len, - jboolean jreduce_level, jint jtarget_level, - jint jtarget_path_id) { + const rocksdb::CompactRangeOptions& compact_options) { jbyte* begin = env->GetByteArrayElements(jbegin, nullptr); if (begin == nullptr) { // exception thrown: OutOfMemoryError @@ -1974,10 +1999,6 @@ const rocksdb::Slice end_slice(reinterpret_cast(end), jend_len); rocksdb::Status s; - rocksdb::CompactRangeOptions compact_options; - 
compact_options.change_level = jreduce_level; - compact_options.target_level = jtarget_level; - compact_options.target_path_id = static_cast(jtarget_path_id); if (cf_handle != nullptr) { s = db->CompactRange(compact_options, cf_handle, &begin_slice, &end_slice); } else { @@ -1996,6 +2017,25 @@ return false; } +/** + * @return true if the compact range succeeded, false if a Java Exception + * was thrown + */ +bool rocksdb_compactrange_helper(JNIEnv* env, rocksdb::DB* db, + rocksdb::ColumnFamilyHandle* cf_handle, + jbyteArray jbegin, jint jbegin_len, + jbyteArray jend, jint jend_len, + jboolean jreduce_level, jint jtarget_level, + jint jtarget_path_id) { + rocksdb::CompactRangeOptions compact_options; + compact_options.change_level = jreduce_level; + compact_options.target_level = jtarget_level; + compact_options.target_path_id = static_cast(jtarget_path_id); + + return rocksdb_compactrange_helper(env, db, cf_handle, jbegin, jbegin_len, + jend, jend_len, compact_options); +} + /* * Class: org_rocksdb_RocksDB * Method: compactRange0 @@ -2027,6 +2067,20 @@ jtarget_path_id); } + +void Java_org_rocksdb_RocksDB_compactRange__J_3BI_3BIJJ( + JNIEnv* env, jobject /*jdb*/, jlong jdb_handle, jbyteArray jbegin, + jint jbegin_len, jbyteArray jend, jint jend_len, + jlong jcompact_options_handle, jlong jcf_handle) { + auto* db = reinterpret_cast(jdb_handle); + auto* cf_handle = reinterpret_cast(jcf_handle); + auto* compact_options = reinterpret_cast(jcompact_options_handle); + + rocksdb_compactrange_helper(env, db, cf_handle, jbegin, jbegin_len, jend, + jend_len, *compact_options); +} + + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::PauseBackgroundWork diff -Nru rocksdb-5.15.10/java/rocksjni/transaction.cc rocksdb-5.17.2/java/rocksjni/transaction.cc --- rocksdb-5.15.10/java/rocksjni/transaction.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/rocksjni/transaction.cc 2018-11-12 19:57:32.000000000 +0000 @@ -18,7 +18,8 @@ #if defined(_MSC_VER) #pragma warning(push) -#pragma warning(disable : 4503) // identifier' : decorated name length exceeded, name was truncated +#pragma warning(disable : 4503) // identifier' : decorated name length + // exceeded, name was truncated #endif /* @@ -271,8 +272,8 @@ void free_parts( JNIEnv* env, - std::vector> &parts_to_free) { - for (auto &value : parts_to_free) { + std::vector>& parts_to_free) { + for (auto& value : parts_to_free) { jobject jk; jbyteArray jk_ba; jbyte* jk_val; @@ -675,10 +676,10 @@ return; } - jparts_to_free.push_back(std::make_tuple( - jba_key_part, jkey_part, jobj_key_part)); - jparts_to_free.push_back(std::make_tuple( - jba_value_part, jvalue_part, jobj_value_part)); + jparts_to_free.push_back( + std::make_tuple(jba_key_part, jkey_part, jobj_key_part)); + jparts_to_free.push_back( + std::make_tuple(jba_value_part, jvalue_part, jobj_value_part)); key_parts.push_back( rocksdb::Slice(reinterpret_cast(jkey_part), jkey_part_len)); @@ -688,8 +689,8 @@ // call the write_multi function rocksdb::Status s = fn_write_kv_parts( - rocksdb::SliceParts(key_parts.data(), (int)key_parts.size()), - rocksdb::SliceParts(value_parts.data(), (int)value_parts.size())); + rocksdb::SliceParts(key_parts.data(), (int)key_parts.size()), + rocksdb::SliceParts(value_parts.data(), (int)value_parts.size())); // cleanup temporary memory free_parts(env, jparts_to_free); @@ -834,13 +835,11 @@ typedef std::function FnWriteKParts; - // TODO(AR) consider refactoring to share this between here and rocksjni.cc void 
txn_write_k_parts_helper(JNIEnv* env, const FnWriteKParts& fn_write_k_parts, const jobjectArray& jkey_parts, const jint& jkey_parts_len) { - std::vector key_parts; std::vector> jkey_parts_to_free; @@ -872,12 +871,13 @@ jkey_parts_to_free.push_back(std::tuple( jba_key_part, jkey_part, jobj_key_part)); - key_parts.push_back(rocksdb::Slice(reinterpret_cast(jkey_part), jkey_part_len)); + key_parts.push_back( + rocksdb::Slice(reinterpret_cast(jkey_part), jkey_part_len)); } // call the write_multi function - rocksdb::Status s = - fn_write_k_parts(rocksdb::SliceParts(key_parts.data(), (int)key_parts.size())); + rocksdb::Status s = fn_write_k_parts( + rocksdb::SliceParts(key_parts.data(), (int)key_parts.size())); // cleanup temporary memory free_parts(env, jkey_parts_to_free); diff -Nru rocksdb-5.15.10/java/src/main/java/org/rocksdb/CompactRangeOptions.java rocksdb-5.17.2/java/src/main/java/org/rocksdb/CompactRangeOptions.java --- rocksdb-5.15.10/java/src/main/java/org/rocksdb/CompactRangeOptions.java 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/java/src/main/java/org/rocksdb/CompactRangeOptions.java 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,233 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * CompactRangeOptions is used by CompactRange() call. In the documentation of the methods "the compaction" refers to + * any compaction that is using this CompactRangeOptions. + */ +public class CompactRangeOptions extends RocksObject { + + private final static byte VALUE_kSkip = 0; + private final static byte VALUE_kIfHaveCompactionFilter = 1; + private final static byte VALUE_kForce = 2; + + // For level based compaction, we can configure if we want to skip/force bottommost level compaction. + // The order of this neum MUST follow the C++ layer. See BottommostLevelCompaction in db/options.h + public enum BottommostLevelCompaction { + /** + * Skip bottommost level compaction + */ + kSkip((byte)VALUE_kSkip), + /** + * Only compact bottommost level if there is a compaction filter. This is the default option + */ + kIfHaveCompactionFilter(VALUE_kIfHaveCompactionFilter), + /** + * Always compact bottommost level + */ + kForce(VALUE_kForce); + + private final byte value; + + BottommostLevelCompaction(final byte value) { + this.value = value; + } + + /** + *

Returns the byte value of the enumerations value.
+ * + * @return byte representation + */ + public byte getValue() { + return value; + } + + /** + * Returns the BottommostLevelCompaction for the given C++ rocks enum value. + * @param bottommostLevelCompaction The value of the BottommostLevelCompaction + * @return BottommostLevelCompaction instance, or null if none matches + */ + public static BottommostLevelCompaction fromRocksId(final int bottommostLevelCompaction) { + switch (bottommostLevelCompaction) { + case VALUE_kSkip: return kSkip; + case VALUE_kIfHaveCompactionFilter: return kIfHaveCompactionFilter; + case VALUE_kForce: return kForce; + default: return null; + } + } + } + + /** + * Construct CompactRangeOptions. + */ + public CompactRangeOptions() { + super(newCompactRangeOptions()); + } + + /** + * Returns whether the compaction is exclusive or other compactions may run concurrently at the same time. + * + * @return true if exclusive, false if concurrent + */ + public boolean exclusiveManualCompaction() { + return exclusiveManualCompaction(nativeHandle_); + } + + /** + * Sets whether the compaction is exclusive or other compaction are allowed run concurrently at the same time. + * + * @param exclusiveCompaction true if compaction should be exclusive + * @return This CompactRangeOptions + */ + public CompactRangeOptions setExclusiveManualCompaction(final boolean exclusiveCompaction) { + setExclusiveManualCompaction(nativeHandle_, exclusiveCompaction); + return this; + } + + + /** + * Returns the policy for compacting the bottommost level + * @return The BottommostLevelCompaction policy + */ + public BottommostLevelCompaction bottommostLevelCompaction() { + return BottommostLevelCompaction.fromRocksId(bottommostLevelCompaction(nativeHandle_)); + } + + /** + * Sets the policy for compacting the bottommost level + * + * @param bottommostLevelCompaction The policy for compacting the bottommost level + * @return This CompactRangeOptions + */ + public CompactRangeOptions setBottommostLevelCompaction(final BottommostLevelCompaction bottommostLevelCompaction) { + setBottommostLevelCompaction(nativeHandle_, bottommostLevelCompaction.getValue()); + return this; + } + + /** + * Returns whether compacted files will be moved to the minimum level capable of holding the data or given level + * (specified non-negative target_level). + * @return true, if compacted files will be moved to the minimum level + */ + public boolean changeLevel() { + return changeLevel(nativeHandle_); + } + + /** + * Whether compacted files will be moved to the minimum level capable of holding the data or given level + * (specified non-negative target_level). + * + * @param changeLevel If true, compacted files will be moved to the minimum level + * @return This CompactRangeOptions + */ + public CompactRangeOptions setChangeLevel(final boolean changeLevel) { + setChangeLevel(nativeHandle_, changeLevel); + return this; + } + + /** + * If change_level is true and target_level have non-negative value, compacted files will be moved to target_level. + * @return The target level for the compacted files + */ + public int targetLevel() { + return targetLevel(nativeHandle_); + } + + + /** + * If change_level is true and target_level have non-negative value, compacted files will be moved to target_level. 
+ * + * @param targetLevel target level for the compacted files + * @return This CompactRangeOptions + */ + public CompactRangeOptions setTargetLevel(final int targetLevel) { + setTargetLevel(nativeHandle_, targetLevel); + return this; + } + + /** + * target_path_id for compaction output. Compaction outputs will be placed in options.db_paths[target_path_id]. + * + * @return target_path_id + */ + public int targetPathId() { + return targetPathId(nativeHandle_); + } + + /** + * Compaction outputs will be placed in options.db_paths[target_path_id]. Behavior is undefined if target_path_id is + * out of range. + * + * @param targetPathId target path id + * @return This CompactRangeOptions + */ + public CompactRangeOptions setTargetPathId(final int targetPathId) { + setTargetPathId(nativeHandle_, targetPathId); + return this; + } + + /** + * If true, compaction will execute immediately even if doing so would cause the DB to + * enter write stall mode. Otherwise, it'll sleep until load is low enough. + * @return true if compaction will execute immediately + */ + public boolean allowWriteStall() { + return allowWriteStall(nativeHandle_); + } + + + /** + * If true, compaction will execute immediately even if doing so would cause the DB to + * enter write stall mode. Otherwise, it'll sleep until load is low enough. + * + * @return This CompactRangeOptions + * @param allowWriteStall true if compaction should execute immediately + */ + public CompactRangeOptions setAllowWriteStall(final boolean allowWriteStall) { + setAllowWriteStall(nativeHandle_, allowWriteStall); + return this; + } + + /** + * If > 0, it will replace the option in the DBOptions for this compaction + * @return number of subcompactions + */ + public int maxSubcompactions() { + return maxSubcompactions(nativeHandle_); + } + + /** + * If > 0, it will replace the option in the DBOptions for this compaction + * + * @param maxSubcompactions number of subcompactions + * @return This CompactRangeOptions + */ + public CompactRangeOptions setMaxSubcompactions(final int maxSubcompactions) { + setMaxSubcompactions(nativeHandle_, maxSubcompactions); + return this; + } + + private native static long newCompactRangeOptions(); + private native boolean exclusiveManualCompaction(final long handle); + private native void setExclusiveManualCompaction(final long handle, final boolean exclusive_manual_compaction); + private native int bottommostLevelCompaction(final long handle); + private native void setBottommostLevelCompaction(final long handle, final int bottommostLevelCompaction); + private native boolean changeLevel(final long handle); + private native void setChangeLevel(final long handle, final boolean changeLevel); + private native int targetLevel(final long handle); + private native void setTargetLevel(final long handle, final int targetLevel); + private native int targetPathId(final long handle); + private native void setTargetPathId(final long handle, final int /* uint32_t */ targetPathId); + private native boolean allowWriteStall(final long handle); + private native void setAllowWriteStall(final long handle, final boolean allowWriteStall); + private native void setMaxSubcompactions(final long handle, final int /* uint32_t */ maxSubcompactions); + private native int maxSubcompactions(final long handle); + + @Override + protected final native void disposeInternal(final long handle); + +} diff -Nru rocksdb-5.15.10/java/src/main/java/org/rocksdb/DBOptionsInterface.java rocksdb-5.17.2/java/src/main/java/org/rocksdb/DBOptionsInterface.java --- 
rocksdb-5.15.10/java/src/main/java/org/rocksdb/DBOptionsInterface.java 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/src/main/java/org/rocksdb/DBOptionsInterface.java 2018-11-12 19:57:32.000000000 +0000 @@ -262,6 +262,8 @@ *

* If set to 0 (default), we will dynamically choose the WAL size limit to * be [sum of all write_buffer_size * max_write_buffer_number] * 2 + * This option takes effect only when there are more than one column family as + * otherwise the wal size is dictated by the write_buffer_size. * Default: 0
* * @param maxTotalWalSize max total wal size. diff -Nru rocksdb-5.15.10/java/src/main/java/org/rocksdb/RocksDB.java rocksdb-5.17.2/java/src/main/java/org/rocksdb/RocksDB.java --- rocksdb-5.15.10/java/src/main/java/org/rocksdb/RocksDB.java 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/src/main/java/org/rocksdb/RocksDB.java 2018-11-12 19:57:32.000000000 +0000 @@ -1518,6 +1518,31 @@ property, property.length()); } + /** + *

Return sum of the getLongProperty of all the column families + * + * Note: As the returned property is of type + * {@code uint64_t} on C++ side the returning value can be negative + * because Java supports in Java 7 only signed long values. + * + * Java 7: To mitigate the problem of the non + * existent unsigned long tpye, values should be encapsulated using + * {@link java.math.BigInteger} to reflect the correct value. The correct + * behavior is guaranteed if {@code 2^64} is added to negative values. + * + * Java 8: In Java 8 the value should be treated as + * unsigned long using provided methods of type {@link Long}.
+ * + * @param property to be fetched. + * + * @return numerical property value + * + * @throws RocksDBException if an error happens in the underlying native code. + */ + public long getAggregatedLongProperty(final String property) throws RocksDBException { + return getAggregatedLongProperty(nativeHandle_, property, property.length()); + } + /** *

Return a heap-allocated iterator over the contents of the * database. The result of newIterator() is initially invalid @@ -1823,6 +1848,8 @@ *

  • {@link #compactRange(byte[], byte[], boolean, int, int)}
  • * * + * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead + * * @param reduce_level reduce level after compaction * @param target_level target level to compact to * @param target_path_id the target path id of output path @@ -1830,6 +1857,7 @@ * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ + @Deprecated public void compactRange(final boolean reduce_level, final int target_level, final int target_path_id) throws RocksDBException { @@ -1855,6 +1883,8 @@ *
  • {@link #compactRange(byte[], byte[])}
  • * * + * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead + * * @param begin start of key range (included in range) * @param end end of key range (excluded from range) * @param reduce_level reduce level after compaction @@ -1864,6 +1894,7 @@ * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ + @Deprecated public void compactRange(final byte[] begin, final byte[] end, final boolean reduce_level, final int target_level, final int target_path_id) throws RocksDBException { @@ -1935,6 +1966,27 @@ false, -1, 0, columnFamilyHandle.nativeHandle_); } + + /** + *

Range compaction of column family. + * Note: After the database has been compacted, + * all data will have been pushed down to the last level containing + * any data.
    + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance. + * @param begin start of key range (included in range) + * @param end end of key range (excluded from range) + * @param compactRangeOptions options for the compaction + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void compactRange(final ColumnFamilyHandle columnFamilyHandle, + final byte[] begin, final byte[] end, CompactRangeOptions compactRangeOptions) throws RocksDBException { + compactRange(nativeHandle_, begin, begin.length, end, end.length, + compactRangeOptions.nativeHandle_, columnFamilyHandle.nativeHandle_); + } + /** *

Range compaction of column family. *
    Note: After the database has been compacted, @@ -1957,6 +2009,8 @@ * * * + * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead + * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance. * @param reduce_level reduce level after compaction @@ -1966,6 +2020,7 @@ * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ + @Deprecated public void compactRange(final ColumnFamilyHandle columnFamilyHandle, final boolean reduce_level, final int target_level, final int target_path_id) throws RocksDBException { @@ -1994,6 +2049,8 @@ * * * + * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead + * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance. * @param begin start of key range (included in range) @@ -2005,6 +2062,7 @@ * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ + @Deprecated public void compactRange(final ColumnFamilyHandle columnFamilyHandle, final byte[] begin, final byte[] end, final boolean reduce_level, final int target_level, final int target_path_id) @@ -2350,6 +2408,8 @@ int propertyLength) throws RocksDBException; protected native long getLongProperty(long nativeHandle, long cfHandle, String property, int propertyLength) throws RocksDBException; + protected native long getAggregatedLongProperty(long nativeHandle, String property, + int propertyLength) throws RocksDBException; protected native long iterator(long handle); protected native long iterator(long handle, long readOptHandle); protected native long iteratorCF(long handle, long cfHandle); @@ -2377,6 +2437,9 @@ private native void compactRange0(long handle, byte[] begin, int beginLen, byte[] end, int endLen, boolean reduce_level, int target_level, int target_path_id) throws RocksDBException; + private native void compactRange(long handle, byte[] begin, int beginLen, + byte[] end, int endLen, long compactRangeOptHandle, long cfHandle) + throws RocksDBException; private native void compactRange(long handle, boolean reduce_level, int target_level, int target_path_id, long cfHandle) throws RocksDBException; diff -Nru rocksdb-5.15.10/java/src/test/java/org/rocksdb/AbstractTransactionTest.java rocksdb-5.17.2/java/src/test/java/org/rocksdb/AbstractTransactionTest.java --- rocksdb-5.15.10/java/src/test/java/org/rocksdb/AbstractTransactionTest.java 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/src/test/java/org/rocksdb/AbstractTransactionTest.java 2018-11-12 19:57:32.000000000 +0000 @@ -686,13 +686,12 @@ @Test public void elapsedTime() throws RocksDBException, InterruptedException { final long preStartTxnTime = System.currentTimeMillis(); - try(final DBContainer dbContainer = startDb(); - final Transaction txn = dbContainer.beginTransaction()) { + try (final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { Thread.sleep(2); final long txnElapsedTime = txn.getElapsedTime(); - assertThat(txnElapsedTime).isLessThan(System.currentTimeMillis() - - preStartTxnTime); + assertThat(txnElapsedTime).isLessThan(System.currentTimeMillis() - preStartTxnTime); assertThat(txnElapsedTime).isGreaterThan(0); } } diff -Nru rocksdb-5.15.10/java/src/test/java/org/rocksdb/ColumnFamilyTest.java rocksdb-5.17.2/java/src/test/java/org/rocksdb/ColumnFamilyTest.java --- rocksdb-5.15.10/java/src/test/java/org/rocksdb/ColumnFamilyTest.java 2018-09-13 
17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/src/test/java/org/rocksdb/ColumnFamilyTest.java 2018-11-12 19:57:32.000000000 +0000 @@ -404,6 +404,10 @@ "rocksdb.stats")).isNotNull(); assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.sstables")).isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). + isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). + isGreaterThanOrEqualTo(0); } finally { for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { diff -Nru rocksdb-5.15.10/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java rocksdb-5.17.2/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java --- rocksdb-5.15.10/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,98 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Test; +import org.rocksdb.CompactRangeOptions.BottommostLevelCompaction; + +import static org.assertj.core.api.Assertions.assertThat; + +public class CompactRangeOptionsTest { + + static { + RocksDB.loadLibrary(); + } + + @Test + public void exclusiveManualCompaction() { + CompactRangeOptions opt = new CompactRangeOptions(); + boolean value = false; + opt.setExclusiveManualCompaction(value); + assertThat(opt.exclusiveManualCompaction()).isEqualTo(value); + value = true; + opt.setExclusiveManualCompaction(value); + assertThat(opt.exclusiveManualCompaction()).isEqualTo(value); + } + + @Test + public void bottommostLevelCompaction() { + CompactRangeOptions opt = new CompactRangeOptions(); + BottommostLevelCompaction value = BottommostLevelCompaction.kSkip; + opt.setBottommostLevelCompaction(value); + assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + value = BottommostLevelCompaction.kForce; + opt.setBottommostLevelCompaction(value); + assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + value = BottommostLevelCompaction.kIfHaveCompactionFilter; + opt.setBottommostLevelCompaction(value); + assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + } + + @Test + public void changeLevel() { + CompactRangeOptions opt = new CompactRangeOptions(); + boolean value = false; + opt.setChangeLevel(value); + assertThat(opt.changeLevel()).isEqualTo(value); + value = true; + opt.setChangeLevel(value); + assertThat(opt.changeLevel()).isEqualTo(value); + } + + @Test + public void targetLevel() { + CompactRangeOptions opt = new CompactRangeOptions(); + int value = 2; + opt.setTargetLevel(value); + assertThat(opt.targetLevel()).isEqualTo(value); + value = 3; + opt.setTargetLevel(value); + assertThat(opt.targetLevel()).isEqualTo(value); + } + + @Test + public void targetPathId() { + CompactRangeOptions opt = new CompactRangeOptions(); + int value = 2; + opt.setTargetPathId(value); + assertThat(opt.targetPathId()).isEqualTo(value); + value = 3; + opt.setTargetPathId(value); + assertThat(opt.targetPathId()).isEqualTo(value); + } + + @Test + public void allowWriteStall() { + CompactRangeOptions opt = new CompactRangeOptions(); + boolean value = false; + opt.setAllowWriteStall(value); + assertThat(opt.allowWriteStall()).isEqualTo(value); + 
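The ColumnFamilyTest change above exercises the new getAggregatedLongProperty() binding, which sums an integer property such as "rocksdb.estimate-num-keys" over every column family instead of a single one. For reference, a minimal C++ sketch of the underlying DB::GetAggregatedIntProperty() call; the database path and error handling here are only illustrative and are not part of this patch:

    #include <cstdint>
    #include <iostream>
    #include "rocksdb/db.h"

    // Sketch: open a DB and read an integer property aggregated across all CFs.
    int main() {
      rocksdb::DB* db = nullptr;
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/agg_prop_demo", &db);
      if (!s.ok()) {
        std::cerr << s.ToString() << std::endl;
        return 1;
      }
      uint64_t num_keys = 0;
      // Sums the per-column-family value of the property over all CFs.
      if (db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys)) {
        std::cout << "estimated keys across all CFs: " << num_keys << std::endl;
      }
      delete db;
      return 0;
    }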
value = true; + opt.setAllowWriteStall(value); + assertThat(opt.allowWriteStall()).isEqualTo(value); + } + + @Test + public void maxSubcompactions() { + CompactRangeOptions opt = new CompactRangeOptions(); + int value = 2; + opt.setMaxSubcompactions(value); + assertThat(opt.maxSubcompactions()).isEqualTo(value); + value = 3; + opt.setMaxSubcompactions(value); + assertThat(opt.maxSubcompactions()).isEqualTo(value); + } +} diff -Nru rocksdb-5.15.10/Makefile rocksdb-5.17.2/Makefile --- rocksdb-5.15.10/Makefile 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/Makefile 2018-11-12 19:57:32.000000000 +0000 @@ -382,6 +382,8 @@ BENCHTOOLOBJECTS = $(BENCH_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) +ANALYZETOOLOBJECTS = $(ANALYZER_LIB_SOURCES:.cc=.o) + EXPOBJECTS = $(EXP_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) TESTS = \ @@ -437,6 +439,7 @@ table_properties_collector_test \ arena_test \ block_test \ + data_block_hash_index_test \ cache_test \ corruption_test \ slice_transform_test \ @@ -529,6 +532,8 @@ write_prepared_transaction_test \ write_unprepared_transaction_test \ db_universal_compaction_test \ + trace_analyzer_test \ + repeatable_thread_test \ PARALLEL_TEST = \ backupable_db_test \ @@ -572,12 +577,13 @@ rocksdb_dump \ rocksdb_undump \ blob_dump \ + trace_analyzer \ TEST_LIBS = \ librocksdb_env_basic_test.a # TODO: add back forward_iterator_bench, after making it build in all environemnts. -BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench column_aware_encoding_exp persistent_cache_bench +BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench column_aware_encoding_exp persistent_cache_bench range_del_aggregator_bench # if user didn't config LIBNAME, set the default ifeq ($(LIBNAME),) @@ -665,7 +671,7 @@ endif # PLATFORM_SHARED_EXT .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \ - release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ + release tags tags0 valgrind_check whitebox_crash_test format static_lib shared_lib all \ dbg rocksdbjavastatic rocksdbjava install install-static install-shared uninstall \ analyze tools tools_lib @@ -997,8 +1003,10 @@ $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ unity.o + +TOOLLIBOBJECTS = $(TOOL_LIB_SOURCES:.cc=.o) # try compiling db_test with unity -unity_test: db/db_test.o db/db_test_util.o $(TESTHARNESS) unity.a +unity_test: db/db_test.o db/db_test_util.o $(TESTHARNESS) $(TOOLLIBOBJECTS) unity.a $(AM_LINK) ./unity_test @@ -1018,6 +1026,13 @@ cscope -b `$(FIND) . -name '*.cc'` `$(FIND) . -name '*.h'` `$(FIND) . -name '*.c'` ctags -e -R -o etags * +tags0: + ctags -R . + cscope -b `$(FIND) . -name '*.cc' -and ! -name '*_test.cc'` \ + `$(FIND) . -name '*.c' -and ! -name '*_test.c'` \ + `$(FIND) . -name '*.h' -and ! 
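The deprecated compactRange() overloads above point callers at the variant taking a CompactRangeOptions, which the new CompactRangeOptionsTest exercises setter by setter. A minimal sketch of the equivalent C++ call, with option values picked purely for illustration:

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    // Sketch: run a full-range manual compaction driven by CompactRangeOptions,
    // the struct that replaces the old reduce_level/target_level/target_path_id
    // arguments.
    rocksdb::Status CompactWholeDb(rocksdb::DB* db) {
      rocksdb::CompactRangeOptions opts;
      opts.exclusive_manual_compaction = true;   // block other manual compactions
      opts.change_level = true;                  // old "reduce_level"
      opts.target_level = 1;                     // old "target_level"
      opts.bottommost_level_compaction =
          rocksdb::BottommostLevelCompaction::kForce;
      // nullptr begin/end means the whole key range of the default CF.
      return db->CompactRange(opts, nullptr, nullptr);
    }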
-name '*_test.h'` + ctags -e -R -o etags * + format: build_tools/format-diff.sh @@ -1031,7 +1046,7 @@ $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIBOBJECTS) -$(TOOLS_LIBRARY): $(BENCH_LIB_SOURCES:.cc=.o) $(TOOL_LIB_SOURCES:.cc=.o) $(LIB_SOURCES:.cc=.o) $(TESTUTIL) +$(TOOLS_LIBRARY): $(BENCH_LIB_SOURCES:.cc=.o) $(TOOL_LIB_SOURCES:.cc=.o) $(LIB_SOURCES:.cc=.o) $(TESTUTIL) $(ANALYZER_LIB_SOURCES:.cc=.o) $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ @@ -1042,6 +1057,9 @@ db_bench: tools/db_bench.o $(BENCHTOOLOBJECTS) $(AM_LINK) +trace_analyzer: tools/trace_analyzer.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) + $(AM_LINK) + cache_bench: cache/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) @@ -1350,6 +1368,9 @@ block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +data_block_hash_index_test: table/data_block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + inlineskiplist_test: memtable/inlineskiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1446,6 +1467,9 @@ db_bench_tool_test: tools/db_bench_tool_test.o $(BENCHTOOLOBJECTS) $(TESTHARNESS) $(AM_LINK) +trace_analyzer_test: tools/trace_analyzer_test.o $(LIBOBJECTS) $(ANALYZETOOLOBJECTS) $(TESTHARNESS) + $(AM_LINK) + event_logger_test: util/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1527,9 +1551,15 @@ range_del_aggregator_test: db/range_del_aggregator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +range_del_aggregator_bench: db/range_del_aggregator_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(AM_LINK) + blob_db_test: utilities/blob_db/blob_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +repeatable_thread_test: util/repeatable_thread_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + #------------------------------------------------- # make install related stuff INSTALL_PATH ?= /usr/local @@ -1760,20 +1790,26 @@ cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class -rocksdbjavastaticreleasedocker: rocksdbjavastatic - DOCKER_LINUX_X64_CONTAINER=`docker ps -aqf name=rocksdb_linux_x64-be`; \ - if [ -z "$$DOCKER_LINUX_X64_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x64-be evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_x64-be +rocksdbjavastaticreleasedocker: rocksdbjavastatic rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 + cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md + cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib + cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + +rocksdbjavastaticdockerx86: + mkdir -p java/target DOCKER_LINUX_X86_CONTAINER=`docker ps -aqf name=rocksdb_linux_x86-be`; \ if [ -z "$$DOCKER_LINUX_X86_CONTAINER" ]; then \ docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x86-be evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ fi docker start -a rocksdb_linux_x86-be - cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md - cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib - cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + 
+rocksdbjavastaticdockerx86_64: + mkdir -p java/target + DOCKER_LINUX_X64_CONTAINER=`docker ps -aqf name=rocksdb_linux_x64-be`; \ + if [ -z "$$DOCKER_LINUX_X64_CONTAINER" ]; then \ + docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x64-be evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ + fi + docker start -a rocksdb_linux_x64-be rocksdbjavastaticdockerppc64le: mkdir -p java/target @@ -1898,7 +1934,7 @@ # Source files dependencies detection # --------------------------------------------------------------------------- -all_sources = $(LIB_SOURCES) $(MAIN_SOURCES) $(MOCK_LIB_SOURCES) $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(TEST_LIB_SOURCES) $(EXP_LIB_SOURCES) +all_sources = $(LIB_SOURCES) $(MAIN_SOURCES) $(MOCK_LIB_SOURCES) $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(TEST_LIB_SOURCES) $(EXP_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) DEPFILES = $(all_sources:.cc=.cc.d) # Add proper dependency support so changing a .h file forces a .cc file to diff -Nru rocksdb-5.15.10/memtable/hash_cuckoo_rep.cc rocksdb-5.17.2/memtable/hash_cuckoo_rep.cc --- rocksdb-5.15.10/memtable/hash_cuckoo_rep.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/memtable/hash_cuckoo_rep.cc 2018-11-12 19:57:32.000000000 +0000 @@ -408,6 +408,7 @@ const auto bucket_user_key = UserKey(stored_key); if (bucket_user_key.compare(user_key) == 0) { cuckoo_bucket_id = bucket_ids[hid]; + assert(cuckoo_bucket_id != -1); break; } } diff -Nru rocksdb-5.15.10/monitoring/histogram_windowing.cc rocksdb-5.17.2/monitoring/histogram_windowing.cc --- rocksdb-5.15.10/monitoring/histogram_windowing.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/monitoring/histogram_windowing.cc 2018-11-12 19:57:32.000000000 +0000 @@ -17,7 +17,7 @@ HistogramWindowingImpl::HistogramWindowingImpl() { env_ = Env::Default(); - window_stats_.reset(new HistogramStat[num_windows_]); + window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); Clear(); } @@ -29,7 +29,7 @@ micros_per_window_(micros_per_window), min_num_per_window_(min_num_per_window) { env_ = Env::Default(); - window_stats_.reset(new HistogramStat[num_windows_]); + window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); Clear(); } diff -Nru rocksdb-5.15.10/options/options.cc rocksdb-5.17.2/options/options.cc --- rocksdb-5.15.10/options/options.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/options/options.cc 2018-11-12 19:57:32.000000000 +0000 @@ -479,6 +479,9 @@ prefix_extractor.reset(NewNoopTransform()); BlockBasedTableOptions block_based_options; block_based_options.index_type = BlockBasedTableOptions::kHashSearch; + block_based_options.data_block_index_type = + BlockBasedTableOptions::kDataBlockBinaryAndHash; + block_based_options.data_block_hash_table_util_ratio = 0.75; block_based_options.filter_policy.reset(NewBloomFilterPolicy(10)); block_based_options.block_cache = NewLRUCache(static_cast(block_cache_size_mb * 1024 * 1024)); diff -Nru rocksdb-5.15.10/options/options_helper.cc rocksdb-5.17.2/options/options_helper.cc --- rocksdb-5.15.10/options/options_helper.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/options/options_helper.cc 2018-11-12 19:57:32.000000000 +0000 @@ -494,6 +494,11 @@ return ParseEnum( block_base_table_index_type_string_map, value, reinterpret_cast(opt_address)); + case OptionType::kBlockBasedTableDataBlockIndexType: + return ParseEnum( + block_base_table_data_block_index_type_string_map, value, + 
reinterpret_cast( + opt_address)); case OptionType::kEncodingType: return ParseEnum( encoding_type_string_map, value, @@ -673,6 +678,12 @@ *reinterpret_cast( opt_address), value); + case OptionType::kBlockBasedTableDataBlockIndexType: + return SerializeEnum( + block_base_table_data_block_index_type_string_map, + *reinterpret_cast( + opt_address), + value); case OptionType::kFlushBlockPolicyFactory: { const auto* ptr = reinterpret_cast*>( @@ -1552,6 +1563,13 @@ {"kTwoLevelIndexSearch", BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}}; +std::unordered_map + OptionsHelper::block_base_table_data_block_index_type_string_map = { + {"kDataBlockBinarySearch", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch}, + {"kDataBlockBinaryAndHash", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}}; + std::unordered_map OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain}, {"kPrefix", kPrefix}}; diff -Nru rocksdb-5.15.10/options/options_helper.h rocksdb-5.17.2/options/options_helper.h --- rocksdb-5.15.10/options/options_helper.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/options/options_helper.h 2018-11-12 19:57:32.000000000 +0000 @@ -69,6 +69,7 @@ kMergeOperator, kMemTableRepFactory, kBlockBasedTableIndexType, + kBlockBasedTableDataBlockIndexType, kFilterPolicy, kFlushBlockPolicyFactory, kChecksumType, @@ -163,6 +164,9 @@ lru_cache_options_type_info; static std::unordered_map block_base_table_index_type_string_map; + static std::unordered_map + block_base_table_data_block_index_type_string_map; static std::unordered_map encoding_type_string_map; static std::unordered_map compaction_style_string_map; @@ -203,6 +207,8 @@ OptionsHelper::compression_type_string_map; static auto& block_base_table_index_type_string_map = OptionsHelper::block_base_table_index_type_string_map; +static auto& block_base_table_data_block_index_type_string_map = + OptionsHelper::block_base_table_data_block_index_type_string_map; static auto& encoding_type_string_map = OptionsHelper::encoding_type_string_map; static auto& compaction_style_string_map = OptionsHelper::compaction_style_string_map; diff -Nru rocksdb-5.15.10/options/options_parser.cc rocksdb-5.17.2/options/options_parser.cc --- rocksdb-5.15.10/options/options_parser.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/options/options_parser.cc 2018-11-12 19:57:32.000000000 +0000 @@ -49,7 +49,7 @@ return s; } unique_ptr writable; - writable.reset(new WritableFileWriter(std::move(wf), EnvOptions(), + writable.reset(new WritableFileWriter(std::move(wf), file_name, EnvOptions(), nullptr /* statistics */)); std::string options_file_content; @@ -200,45 +200,6 @@ return Status::OK(); } -namespace { -bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file, - std::string* output, bool* has_data, Status* result) { - const int kBufferSize = 8192; - char buffer[kBufferSize + 1]; - Slice input_slice; - - std::string line; - bool has_complete_line = false; - while (!has_complete_line) { - if (std::getline(*iss, line)) { - has_complete_line = !iss->eof(); - } else { - has_complete_line = false; - } - if (!has_complete_line) { - // if we're not sure whether we have a complete line, - // further read from the file. 
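The options_helper hunks above teach the options parser about kDataBlockBinarySearch and kDataBlockBinaryAndHash, and the earlier options.cc hunk has OptimizeForPointLookup() turn the hash index on with a 0.75 utilization ratio. A minimal sketch of enabling the data-block hash index explicitly on a block-based table factory; the two option values are the ones this patch uses, the surrounding setup is illustrative:

    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    // Sketch: opt a column family into the point-lookup data block hash index.
    rocksdb::Options MakePointLookupOptions() {
      rocksdb::BlockBasedTableOptions table_options;
      // Keep binary search as the fallback; add an in-block hash index for gets.
      table_options.data_block_index_type =
          rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;
      // Hash table utilization ratio; must be > 0 per the new ValidateOptions().
      table_options.data_block_hash_table_util_ratio = 0.75;

      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }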
- if (*has_data) { - *result = seq_file->Read(kBufferSize, &input_slice, buffer); - } - if (input_slice.size() == 0) { - // meaning we have read all the data - *has_data = false; - break; - } else { - iss->str(line + input_slice.ToString()); - // reset the internal state of iss so that we can keep reading it. - iss->clear(); - *has_data = (input_slice.size() == kBufferSize); - continue; - } - } - } - *output = line; - return *has_data || has_complete_line; -} -} // namespace - Status RocksDBOptionsParser::Parse(const std::string& file_name, Env* env, bool ignore_unknown_options) { Reset(); @@ -592,6 +553,12 @@ *reinterpret_cast( offset1) == *reinterpret_cast(offset2)); + case OptionType::kBlockBasedTableDataBlockIndexType: + return ( + *reinterpret_cast( + offset1) == + *reinterpret_cast( + offset2)); case OptionType::kWALRecoveryMode: return (*reinterpret_cast(offset1) == *reinterpret_cast(offset2)); diff -Nru rocksdb-5.15.10/options/options_settable_test.cc rocksdb-5.17.2/options/options_settable_test.cc --- rocksdb-5.15.10/options/options_settable_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/options/options_settable_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -142,6 +142,8 @@ "pin_l0_filter_and_index_blocks_in_cache=1;" "pin_top_level_index_and_filter=1;" "index_type=kHashSearch;" + "data_block_index_type=kDataBlockBinaryAndHash;" + "data_block_hash_table_util_ratio=0.75;" "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" "block_cache=1M;block_cache_compressed=1k;block_size=1024;" "block_size_deviation=8;block_restart_interval=4; " diff -Nru rocksdb-5.15.10/port/dirent.h rocksdb-5.17.2/port/dirent.h --- rocksdb-5.15.10/port/dirent.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/dirent.h 2018-11-12 19:57:32.000000000 +0000 @@ -9,8 +9,7 @@ // // See port_example.h for documentation for the following types/functions. -#ifndef STORAGE_LEVELDB_PORT_DIRENT_H_ -#define STORAGE_LEVELDB_PORT_DIRENT_H_ +#pragma once #ifdef ROCKSDB_PLATFORM_POSIX #include @@ -43,5 +42,3 @@ } // namespace rocksdb #endif // OS_WIN - -#endif // STORAGE_LEVELDB_PORT_DIRENT_H_ diff -Nru rocksdb-5.15.10/port/likely.h rocksdb-5.17.2/port/likely.h --- rocksdb-5.15.10/port/likely.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/likely.h 2018-11-12 19:57:32.000000000 +0000 @@ -7,8 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef PORT_LIKELY_H_ -#define PORT_LIKELY_H_ +#pragma once #if defined(__GNUC__) && __GNUC__ >= 4 #define LIKELY(x) (__builtin_expect((x), 1)) @@ -17,5 +16,3 @@ #define LIKELY(x) (x) #define UNLIKELY(x) (x) #endif - -#endif // PORT_LIKELY_H_ diff -Nru rocksdb-5.15.10/port/port_example.h rocksdb-5.17.2/port/port_example.h --- rocksdb-5.15.10/port/port_example.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/port_example.h 2018-11-12 19:57:32.000000000 +0000 @@ -12,8 +12,7 @@ // specific port_.h file. Use this file as a reference for // how to port this package to a new platform. 
-#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ -#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ +#pragma once namespace rocksdb { namespace port { @@ -100,5 +99,3 @@ } // namespace port } // namespace rocksdb - -#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ diff -Nru rocksdb-5.15.10/port/sys_time.h rocksdb-5.17.2/port/sys_time.h --- rocksdb-5.15.10/port/sys_time.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/sys_time.h 2018-11-12 19:57:32.000000000 +0000 @@ -10,8 +10,7 @@ // This file is a portable substitute for sys/time.h which does not exist on // Windows -#ifndef STORAGE_LEVELDB_PORT_SYS_TIME_H_ -#define STORAGE_LEVELDB_PORT_SYS_TIME_H_ +#pragma once #if defined(OS_WIN) && defined(_MSC_VER) @@ -44,5 +43,3 @@ #include #include #endif - -#endif // STORAGE_LEVELDB_PORT_SYS_TIME_H_ diff -Nru rocksdb-5.15.10/port/util_logger.h rocksdb-5.17.2/port/util_logger.h --- rocksdb-5.15.10/port/util_logger.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/util_logger.h 2018-11-12 19:57:32.000000000 +0000 @@ -7,8 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_LEVELDB_PORT_UTIL_LOGGER_H_ -#define STORAGE_LEVELDB_PORT_UTIL_LOGGER_H_ +#pragma once // Include the appropriate platform specific file below. If you are // porting to a new platform, see "port_example.h" for documentation @@ -19,5 +18,3 @@ #elif defined(OS_WIN) #include "port/win/win_logger.h" #endif - -#endif // STORAGE_LEVELDB_PORT_UTIL_LOGGER_H_ diff -Nru rocksdb-5.15.10/port/win/env_win.cc rocksdb-5.17.2/port/win/env_win.cc --- rocksdb-5.15.10/port/win/env_win.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/win/env_win.cc 2018-11-12 19:57:32.000000000 +0000 @@ -235,7 +235,7 @@ MapViewOfFileEx(hMap, FILE_MAP_READ, 0, // High DWORD of access start 0, // Low DWORD - fileSize, + static_cast(fileSize), NULL); // Let the OS choose the mapping if (!mapped_region) { @@ -246,7 +246,7 @@ } result->reset(new WinMmapReadableFile(fname, hFile, hMap, mapped_region, - fileSize)); + static_cast(fileSize))); mapGuard.release(); fileGuard.release(); @@ -448,7 +448,7 @@ void* base = MapViewOfFileEx(hMap, FILE_MAP_WRITE, 0, // High DWORD of access start 0, // Low DWORD - fileSize, + static_cast(fileSize), NULL); // Let the OS choose the mapping if (!base) { @@ -706,6 +706,9 @@ if (!CreateHardLinkA(target.c_str(), src.c_str(), NULL)) { DWORD lastError = GetLastError(); + if (lastError == ERROR_NOT_SAME_DEVICE) { + return Status::NotSupported("No cross FS links allowed"); + } std::string text("Failed to link: "); text.append(src).append(" to: ").append(target); @@ -716,6 +719,31 @@ return result; } +Status WinEnvIO::NumFileLinks(const std::string& fname, uint64_t* count) { + Status s; + HANDLE handle = ::CreateFileA( + fname.c_str(), 0, FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); + + if (INVALID_HANDLE_VALUE == handle) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("NumFileLinks: " + fname, lastError); + return s; + } + UniqueCloseHandlePtr handle_guard(handle, CloseHandleFunc); + FILE_STANDARD_INFO standard_info; + if (0 != GetFileInformationByHandleEx(handle, FileStandardInfo, + &standard_info, + sizeof(standard_info))) { + *count = standard_info.NumberOfLinks; + } else { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("GetFileInformationByHandleEx: " + fname, + lastError); + } + return s; 
+} + Status WinEnvIO::AreFilesSame(const std::string& first, const std::string& second, bool* res) { // For MinGW builds @@ -1325,6 +1353,10 @@ return winenv_io_.LinkFile(src, target); } +Status WinEnv::NumFileLinks(const std::string& fname, uint64_t* count) { + return winenv_io_.NumFileLinks(fname, count); +} + Status WinEnv::AreFilesSame(const std::string& first, const std::string& second, bool* res) { return winenv_io_.AreFilesSame(first, second, res); diff -Nru rocksdb-5.15.10/port/win/env_win.h rocksdb-5.17.2/port/win/env_win.h --- rocksdb-5.15.10/port/win/env_win.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/win/env_win.h 2018-11-12 19:57:32.000000000 +0000 @@ -144,6 +144,9 @@ virtual Status LinkFile(const std::string& src, const std::string& target); + virtual Status NumFileLinks(const std::string& /*fname*/, + uint64_t* /*count*/); + virtual Status AreFilesSame(const std::string& first, const std::string& second, bool* res); @@ -268,6 +271,8 @@ Status LinkFile(const std::string& src, const std::string& target) override; + Status NumFileLinks(const std::string& fname, uint64_t* count) override; + Status AreFilesSame(const std::string& first, const std::string& second, bool* res) override; diff -Nru rocksdb-5.15.10/port/win/io_win.cc rocksdb-5.17.2/port/win/io_win.cc --- rocksdb-5.15.10/port/win/io_win.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/win/io_win.cc 2018-11-12 19:57:32.000000000 +0000 @@ -260,7 +260,7 @@ *result = Slice(); return IOError(filename_, EINVAL); } else if (offset + n > length_) { - n = length_ - offset; + n = length_ - static_cast(offset); } *result = Slice(reinterpret_cast(mapped_region_)+offset, n); @@ -317,7 +317,7 @@ assert(mapped_begin_ == nullptr); - size_t minDiskSize = file_offset_ + view_size_; + size_t minDiskSize = static_cast(file_offset_) + view_size_; if (minDiskSize > reserved_size_) { status = Allocate(file_offset_, view_size_); @@ -579,7 +579,7 @@ // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(offset + len, view_size_); + size_t spaceToReserve = Roundup(static_cast(offset + len), view_size_); // Nothing to do if (spaceToReserve <= reserved_size_) { return status; @@ -656,14 +656,14 @@ return Status::NotSupported("This function is only used for direct_io"); } - if (!IsSectorAligned(offset) || + if (!IsSectorAligned(static_cast(offset)) || !IsSectorAligned(n)) { return Status::InvalidArgument( "WinSequentialFile::PositionedRead: offset is not properly aligned"); } size_t bytes_read = 0; // out param - s = PositionedReadInternal(scratch, n, offset, bytes_read); + s = PositionedReadInternal(scratch, static_cast(n), offset, bytes_read); *result = Slice(scratch, bytes_read); return s; } @@ -721,7 +721,7 @@ // Check buffer alignment if (file_base_->use_direct_io()) { - if (!IsSectorAligned(offset) || + if (!IsSectorAligned(static_cast(offset)) || !IsAligned(alignment_, scratch)) { return Status::InvalidArgument( "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned"); @@ -818,7 +818,7 @@ // to the end of the file assert(IsSectorAligned(next_write_offset_)); if (!IsSectorAligned(data.size()) || - !IsAligned(GetAlignement(), data.data())) { + !IsAligned(static_cast(GetAlignement()), data.data())) { s = Status::InvalidArgument( "WriteData must be page aligned, size must be sector aligned"); } else { @@ -857,9 +857,9 @@ Status 
WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) { if(file_data_->use_direct_io()) { - if (!IsSectorAligned(offset) || + if (!IsSectorAligned(static_cast(offset)) || !IsSectorAligned(data.size()) || - !IsAligned(GetAlignement(), data.data())) { + !IsAligned(static_cast(GetAlignement()), data.data())) { return Status::InvalidArgument( "Data and offset must be page aligned, size must be sector aligned"); } @@ -944,7 +944,7 @@ // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(offset + len, alignment_); + size_t spaceToReserve = Roundup(static_cast(offset + len), static_cast(alignment_)); // Nothing to do if (spaceToReserve <= reservedsize_) { return status; @@ -977,7 +977,7 @@ bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); } size_t WinWritableFile::GetRequiredBufferAlignment() const { - return GetAlignement(); + return static_cast(GetAlignement()); } Status WinWritableFile::Append(const Slice& data) { @@ -1037,7 +1037,7 @@ bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); } size_t WinRandomRWFile::GetRequiredBufferAlignment() const { - return GetAlignement(); + return static_cast(GetAlignement()); } Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) { diff -Nru rocksdb-5.15.10/port/win/port_win.h rocksdb-5.17.2/port/win/port_win.h --- rocksdb-5.15.10/port/win/port_win.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/win/port_win.h 2018-11-12 19:57:32.000000000 +0000 @@ -9,8 +9,7 @@ // // See port_example.h for documentation for the following types/functions. -#ifndef STORAGE_LEVELDB_PORT_PORT_WIN_H_ -#define STORAGE_LEVELDB_PORT_PORT_WIN_H_ +#pragma once // Always want minimum headers #ifndef WIN32_LEAN_AND_MEAN @@ -341,5 +340,3 @@ using port::truncate; } // namespace rocksdb - -#endif // STORAGE_LEVELDB_PORT_PORT_WIN_H_ diff -Nru rocksdb-5.15.10/port/win/win_jemalloc.cc rocksdb-5.17.2/port/win/win_jemalloc.cc --- rocksdb-5.15.10/port/win/win_jemalloc.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/win/win_jemalloc.cc 2018-11-12 19:57:32.000000000 +0000 @@ -43,8 +43,8 @@ return je_aligned_alloc(alignment, size); } void jemalloc_aligned_free(void* p) ROCKSDB_NOEXCEPT { je_free(p); } -} // port -} // rocksdb +} // namespace port +} // namespace rocksdb void* operator new(size_t size) { void* p = je_malloc(size); diff -Nru rocksdb-5.15.10/src.mk rocksdb-5.17.2/src.mk --- rocksdb-5.15.10/src.mk 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/src.mk 2018-11-12 19:57:32.000000000 +0000 @@ -103,6 +103,8 @@ table/cuckoo_table_builder.cc \ table/cuckoo_table_factory.cc \ table/cuckoo_table_reader.cc \ + table/data_block_hash_index.cc \ + table/data_block_footer.cc \ table/flush_block_policy.cc \ table/format.cc \ table/full_filter_block.cc \ @@ -147,18 +149,19 @@ util/slice.cc \ util/sst_file_manager_impl.cc \ util/status.cc \ - util/status_message.cc \ util/string_util.cc \ util/sync_point.cc \ util/sync_point_impl.cc \ util/thread_local.cc \ util/threadpool_imp.cc \ + util/trace_replay.cc \ util/transaction_test_util.cc \ util/xxhash.cc \ utilities/backupable/backupable_db.cc \ utilities/blob_db/blob_compaction_filter.cc \ utilities/blob_db/blob_db.cc \ utilities/blob_db/blob_db_impl.cc \ + utilities/blob_db/blob_db_impl_filesnapshot.cc \ utilities/blob_db/blob_file.cc \ utilities/blob_db/blob_log_format.cc 
\ utilities/blob_db/blob_log_reader.cc \ @@ -197,6 +200,7 @@ utilities/simulator_cache/sim_cache.cc \ utilities/spatialdb/spatial_db.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ + utilities/trace/file_trace_reader_writer.cc \ utilities/transactions/optimistic_transaction.cc \ utilities/transactions/optimistic_transaction_db_impl.cc \ utilities/transactions/pessimistic_transaction.cc \ @@ -230,6 +234,9 @@ tools/sst_dump_tool.cc \ utilities/blob_db/blob_dump_tool.cc \ +ANALYZER_LIB_SOURCES = \ + tools/trace_analyzer_tool.cc \ + MOCK_LIB_SOURCES = \ table/mock_table.cc \ util/fault_injection_test_env.cc @@ -321,6 +328,7 @@ db/redis_test.cc \ db/repair_test.cc \ db/range_del_aggregator_test.cc \ + db/range_del_aggregator_bench.cc \ db/table_properties_collector_test.cc \ db/util_merge_operators_test.cc \ db/version_builder_test.cc \ @@ -346,6 +354,7 @@ table/cleanable_test.cc \ table/cuckoo_table_builder_test.cc \ table/cuckoo_table_reader_test.cc \ + table/data_block_hash_index_test.cc \ table/full_filter_block_test.cc \ table/merger_test.cc \ table/table_reader_bench.cc \ @@ -357,6 +366,7 @@ tools/ldb_cmd_test.cc \ tools/reduce_levels_test.cc \ tools/sst_dump_test.cc \ + tools/trace_analyzer_test.cc \ util/arena_test.cc \ util/auto_roll_logger_test.cc \ util/autovector_test.cc \ @@ -368,6 +378,7 @@ util/filelock_test.cc \ util/log_write_bench.cc \ util/rate_limiter_test.cc \ + util/repeatable_thread_test.cc \ util/slice_transform_test.cc \ util/timer_queue_test.cc \ util/thread_list_test.cc \ @@ -411,6 +422,7 @@ java/rocksjni/compaction_filter.cc \ java/rocksjni/compaction_filter_factory.cc \ java/rocksjni/compaction_filter_factory_jnicallback.cc \ + java/rocksjni/compact_range_options.cc \ java/rocksjni/compaction_options_fifo.cc \ java/rocksjni/compaction_options_universal.cc \ java/rocksjni/comparator.cc \ diff -Nru rocksdb-5.15.10/table/block_based_table_builder.cc rocksdb-5.17.2/table/block_based_table_builder.cc --- rocksdb-5.15.10/table/block_based_table_builder.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_based_table_builder.cc 2018-11-12 19:57:32.000000000 +0000 @@ -39,11 +39,11 @@ #include "table/full_filter_block.h" #include "table/table_builder.h" -#include "util/string_util.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" #include "util/stop_watch.h" +#include "util/string_util.h" #include "util/xxhash.h" #include "table/index_builder.h" @@ -63,6 +63,7 @@ FilterBlockBuilder* CreateFilterBlockBuilder( const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt, const BlockBasedTableOptions& table_opt, + const bool use_delta_encoding_for_index_values, PartitionedIndexBuilder* const p_index_builder) { if (table_opt.filter_policy == nullptr) return nullptr; @@ -85,7 +86,7 @@ return new PartitionedFilterBlockBuilder( mopt.prefix_extractor.get(), table_opt.whole_key_filtering, filter_bits_builder, table_opt.index_block_restart_interval, - p_index_builder, partition_size); + use_delta_encoding_for_index_values, p_index_builder, partition_size); } else { return new FullFilterBlockBuilder(mopt.prefix_extractor.get(), table_opt.whole_key_filtering, @@ -266,6 +267,7 @@ TableProperties props; bool closed = false; // Either Finish() or Abandon() has been called. + const bool use_delta_encoding_for_index_values; std::unique_ptr filter_builder; char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size; @@ -301,11 +303,19 @@ ? 
std::min(table_options.block_size, kDefaultPageSize) : 0), data_block(table_options.block_restart_interval, - table_options.use_delta_encoding), + table_options.use_delta_encoding, + false /* use_value_delta_encoding */, + icomparator.user_comparator() + ->CanKeysWithDifferentByteContentsBeEqual() + ? BlockBasedTableOptions::kDataBlockBinarySearch + : table_options.data_block_index_type, + table_options.data_block_hash_table_util_ratio), range_del_block(1 /* block_restart_interval */), internal_prefix_transform(_moptions.prefix_extractor.get()), compression_dict(_compression_dict), compression_ctx(_compression_type, _compression_opts), + use_delta_encoding_for_index_values(table_opt.format_version >= 4 && + !table_opt.block_align), compressed_cache_key_prefix_size(0), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( @@ -317,18 +327,21 @@ if (table_options.index_type == BlockBasedTableOptions::kTwoLevelIndexSearch) { p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( - &internal_comparator, table_options); + &internal_comparator, use_delta_encoding_for_index_values, + table_options); index_builder.reset(p_index_builder_); } else { index_builder.reset(IndexBuilder::CreateIndexBuilder( table_options.index_type, &internal_comparator, - &this->internal_prefix_transform, table_options)); + &this->internal_prefix_transform, use_delta_encoding_for_index_values, + table_options)); } if (skip_filters) { filter_builder = nullptr; } else { filter_builder.reset(CreateFilterBlockBuilder( - _ioptions, _moptions, table_options, p_index_builder_)); + _ioptions, _moptions, table_options, + use_delta_encoding_for_index_values, p_index_builder_)); } for (auto& collector_factories : *int_tbl_prop_collector_factories) { @@ -675,7 +688,8 @@ if (ok() && !empty_filter_block) { Status s = Status::Incomplete(); while (ok() && s.IsIncomplete()) { - Slice filter_content = rep_->filter_builder->Finish(filter_block_handle, &s); + Slice filter_content = + rep_->filter_builder->Finish(filter_block_handle, &s); assert(s.ok() || s.IsIncomplete()); rep_->props.filter_size += filter_content.size(); WriteRawBlock(filter_content, kNoCompression, &filter_block_handle); @@ -752,22 +766,25 @@ PropertyBlockBuilder property_block_builder; rep_->props.column_family_id = rep_->column_family_id; rep_->props.column_family_name = rep_->column_family_name; - rep_->props.filter_policy_name = rep_->table_options.filter_policy != nullptr - ? rep_->table_options.filter_policy->Name() - : ""; + rep_->props.filter_policy_name = + rep_->table_options.filter_policy != nullptr + ? rep_->table_options.filter_policy->Name() + : ""; rep_->props.index_size = rep_->index_builder->IndexSize() + kBlockTrailerSize; rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr - ? rep_->ioptions.user_comparator->Name() - : "nullptr"; - rep_->props.merge_operator_name = rep_->ioptions.merge_operator != nullptr - ? rep_->ioptions.merge_operator->Name() - : "nullptr"; + ? rep_->ioptions.user_comparator->Name() + : "nullptr"; + rep_->props.merge_operator_name = + rep_->ioptions.merge_operator != nullptr + ? rep_->ioptions.merge_operator->Name() + : "nullptr"; rep_->props.compression_name = CompressionTypeToString(rep_->compression_ctx.type()); - rep_->props.prefix_extractor_name = rep_->moptions.prefix_extractor != nullptr - ? rep_->moptions.prefix_extractor->Name() - : "nullptr"; + rep_->props.prefix_extractor_name = + rep_->moptions.prefix_extractor != nullptr + ? 
rep_->moptions.prefix_extractor->Name() + : "nullptr"; std::string property_collectors_names = "["; for (size_t i = 0; @@ -789,6 +806,8 @@ } rep_->props.index_key_is_user_key = !rep_->index_builder->seperator_is_key_plus_seq(); + rep_->props.index_value_is_delta_encoded = + rep_->use_delta_encoding_for_index_values; rep_->props.creation_time = rep_->creation_time; rep_->props.oldest_key_time = rep_->oldest_key_time; diff -Nru rocksdb-5.15.10/table/block_based_table_factory.cc rocksdb-5.17.2/table/block_based_table_factory.cc --- rocksdb-5.15.10/table/block_based_table_factory.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_based_table_factory.cc 2018-11-12 19:57:32.000000000 +0000 @@ -27,10 +27,141 @@ #include "table/block_based_table_builder.h" #include "table/block_based_table_reader.h" #include "table/format.h" +#include "util/mutexlock.h" #include "util/string_util.h" namespace rocksdb { +void TailPrefetchStats::RecordEffectiveSize(size_t len) { + MutexLock l(&mutex_); + if (num_records_ < kNumTracked) { + num_records_++; + } + records_[next_++] = len; + if (next_ == kNumTracked) { + next_ = 0; + } +} + +size_t TailPrefetchStats::GetSuggestedPrefetchSize() { + std::vector sorted; + { + MutexLock l(&mutex_); + + if (num_records_ == 0) { + return 0; + } + sorted.assign(records_, records_ + num_records_); + } + + // Of the historic size, we find the maximum one that satisifis the condtiion + // that if prefetching all, less than 1/8 will be wasted. + std::sort(sorted.begin(), sorted.end()); + + // Assuming we have 5 data points, and after sorting it looks like this: + // + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // +---+ | | | | + // | | | | | | + // +---+ | | | | | | + // | | | | | | | | + // +---+ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // and we use every of the value as a candidate, and estimate how much we + // wasted, compared to read. For example, when we use the 3rd record + // as candiate. This area is what we read: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ***+ *** *** *** *** ** + // * | | | | | | + // +---+ | | | | | * + // * | | | | | | | | + // +---+ | | | | | | | * + // * | | | | X | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // *** *** ***-*** ***--*** ***--*** +**** + // which is (size of the record) X (number of records). + // + // While wasted is this area: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ****---+ | | | | + // * * | | | | | + // * *-*** *** | | | | | + // * * | | | | | | | + // *--** *** | | | | | | | + // | | | | | X | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // Which can be calculated iteratively. 
+ // The difference between wasted using 4st and 3rd record, will + // be following area: + // +---+ + // +--+ +-+ ++ +-+ +-+ +---+ | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // | xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // +-+ +-+ +-+ ++ +---+ +--+ | | | + // | | | | | | | + // +---+ ++ | | | | | | + // | | | | | | X | | | + // +---+ ++ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // which will be the size difference between 4st and 3rd record, + // times 3, which is number of records before the 4st. + // Here we assume that all data within the prefetch range will be useful. In + // reality, it may not be the case when a partial block is inside the range, + // or there are data in the middle that is not read. We ignore those cases + // for simplicity. + assert(!sorted.empty()); + size_t prev_size = sorted[0]; + size_t max_qualified_size = sorted[0]; + size_t wasted = 0; + for (size_t i = 1; i < sorted.size(); i++) { + size_t read = sorted[i] * sorted.size(); + wasted += (sorted[i] - prev_size) * i; + if (wasted <= read / 8) { + max_qualified_size = sorted[i]; + } + prev_size = sorted[i]; + } + const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB + return std::min(kMaxPrefetchSize, max_qualified_size); +} + BlockBasedTableFactory::BlockBasedTableFactory( const BlockBasedTableOptions& _table_options) : table_options_(_table_options) { @@ -71,7 +202,8 @@ table_options_, table_reader_options.internal_comparator, std::move(file), file_size, table_reader, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, - table_reader_options.level, table_reader_options.immortal); + table_reader_options.level, table_reader_options.immortal, + table_reader_options.largest_seqno, &tail_prefetch_stats_); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( @@ -127,6 +259,13 @@ return Status::InvalidArgument( "Block alignment requested but block size is not a power of 2"); } + if (table_options_.data_block_index_type == + BlockBasedTableOptions::kDataBlockBinaryAndHash && + table_options_.data_block_hash_table_util_ratio <= 0) { + return Status::InvalidArgument( + "data_block_hash_table_util_ratio should be greater than 0 when " + "data_block_index_type is set to kDataBlockBinaryAndHash"); + } return Status::OK(); } diff -Nru rocksdb-5.15.10/table/block_based_table_factory.h rocksdb-5.17.2/table/block_based_table_factory.h --- rocksdb-5.15.10/table/block_based_table_factory.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_based_table_factory.h 2018-11-12 19:57:32.000000000 +0000 @@ -26,6 +26,22 @@ using std::unique_ptr; class BlockBasedTableBuilder; +// A class used to track actual bytes written from the tail in the recent SST +// file opens, and provide a suggestion for following open. +class TailPrefetchStats { + public: + void RecordEffectiveSize(size_t len); + // 0 indicates no information to determine. 
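The comment above walks through how GetSuggestedPrefetchSize() picks the largest recorded tail size whose accumulated waste stays within 1/8 of the bytes read. A standalone sketch of that selection loop, rewritten outside RocksDB with invented sizes so the heuristic can be tried in isolation:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Sketch of the tail-prefetch heuristic: pick the largest recorded size
    // such that, if every open prefetched that much, the over-read ("wasted")
    // bytes would stay at or below 1/8 of the bytes read.
    size_t SuggestPrefetchSize(std::vector<size_t> sizes) {
      if (sizes.empty()) return 0;
      std::sort(sizes.begin(), sizes.end());
      size_t prev_size = sizes[0];
      size_t max_qualified_size = sizes[0];
      size_t wasted = 0;
      for (size_t i = 1; i < sizes.size(); i++) {
        size_t read = sizes[i] * sizes.size();
        // Extra bytes the i smaller opens would fetch at this candidate size.
        wasted += (sizes[i] - prev_size) * i;
        if (wasted <= read / 8) {
          max_qualified_size = sizes[i];
        }
        prev_size = sizes[i];
      }
      const size_t kMaxPrefetchSize = 512 * 1024;  // cap, as in the patch
      return std::min(kMaxPrefetchSize, max_qualified_size);
    }

    int main() {
      // Five invented tail sizes (bytes) from recent SST file opens.
      std::cout << SuggestPrefetchSize({4096, 4608, 5120, 65536, 262144}) << "\n";
      return 0;
    }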
+ size_t GetSuggestedPrefetchSize(); + + private: + const static size_t kNumTracked = 32; + size_t records_[kNumTracked]; + port::Mutex mutex_; + size_t next_ = 0; + size_t num_records_ = 0; +}; + class BlockBasedTableFactory : public TableFactory { public: explicit BlockBasedTableFactory( @@ -64,6 +80,7 @@ private: BlockBasedTableOptions table_options_; + mutable TailPrefetchStats tail_prefetch_stats_; }; extern const std::string kHashIndexPrefixesBlock; @@ -106,6 +123,14 @@ {"hash_index_allow_collision", {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"data_block_index_type", + {offsetof(struct BlockBasedTableOptions, data_block_index_type), + OptionType::kBlockBasedTableDataBlockIndexType, + OptionVerificationType::kNormal, false, 0}}, + {"data_block_hash_table_util_ratio", + {offsetof(struct BlockBasedTableOptions, + data_block_hash_table_util_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, false, 0}}, {"checksum", {offsetof(struct BlockBasedTableOptions, checksum), OptionType::kChecksumType, OptionVerificationType::kNormal, false, diff -Nru rocksdb-5.15.10/table/block_based_table_reader.cc rocksdb-5.17.2/table/block_based_table_reader.cc --- rocksdb-5.15.10/table/block_based_table_reader.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_based_table_reader.cc 2018-11-12 19:57:32.000000000 +0000 @@ -9,6 +9,7 @@ #include "table/block_based_table_reader.h" #include +#include #include #include #include @@ -137,6 +138,8 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, Tickers block_cache_miss_ticker, Tickers block_cache_hit_ticker, + uint64_t* block_cache_miss_stats, + uint64_t* block_cache_hit_stats, Statistics* statistics, GetContext* get_context) { auto cache_handle = block_cache->Lookup(key, statistics); @@ -144,12 +147,12 @@ PERF_COUNTER_ADD(block_cache_hit_count, 1); if (get_context != nullptr) { // overall cache hit - get_context->RecordCounters(BLOCK_CACHE_HIT, 1); + get_context->get_context_stats_.num_cache_hit++; // total bytes read from cache - get_context->RecordCounters(BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(cache_handle)); + get_context->get_context_stats_.num_cache_bytes_read += + block_cache->GetUsage(cache_handle); // block-type specific cache hit - get_context->RecordCounters(block_cache_hit_ticker, 1); + (*block_cache_hit_stats)++; } else { // overall cache hit RecordTick(statistics, BLOCK_CACHE_HIT); @@ -161,9 +164,9 @@ } else { if (get_context != nullptr) { // overall cache miss - get_context->RecordCounters(BLOCK_CACHE_MISS, 1); + get_context->get_context_stats_.num_cache_miss++; // block-type specific cache miss - get_context->RecordCounters(block_cache_miss_ticker, 1); + (*block_cache_miss_stats)++; } else { RecordTick(statistics, BLOCK_CACHE_MISS); RecordTick(statistics, block_cache_miss_ticker); @@ -211,7 +214,8 @@ const InternalKeyComparator* icomparator, IndexReader** index_reader, const PersistentCacheOptions& cache_options, - const int level, const bool index_key_includes_seq) { + const int level, const bool index_key_includes_seq, + const bool index_value_is_full) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -222,36 +226,37 @@ if (s.ok()) { *index_reader = new PartitionIndexReader( table, icomparator, std::move(index_block), ioptions.statistics, - level, index_key_includes_seq); + level, index_key_includes_seq, 
index_value_is_full); } return s; } // return a two-level iterator: first level is on the partition index - virtual InternalIterator* NewIterator(IndexBlockIter* /*iter*/ = nullptr, - bool /*dont_care*/ = true, - bool fill_cache = true) override { + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* /*iter*/ = nullptr, bool /*dont_care*/ = true, + bool fill_cache = true) override { Statistics* kNullStats = nullptr; // Filters are already checked before seeking the index if (!partition_map_.empty()) { return NewTwoLevelIterator( new BlockBasedTable::PartitionedIndexIteratorState( - table_, &partition_map_, index_key_includes_seq_), + table_, &partition_map_, index_key_includes_seq_, + index_value_is_full_), index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_)); + kNullStats, true, index_key_includes_seq_, index_value_is_full_)); } else { auto ro = ReadOptions(); ro.fill_cache = fill_cache; bool kIsIndex = true; - return new BlockBasedTableIterator( + return new BlockBasedTableIterator( table_, ro, *icomparator_, index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_), + kNullStats, true, index_key_includes_seq_, index_value_is_full_), false, true, /* prefix_extractor */ nullptr, kIsIndex, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); } // TODO(myabandeh): Update TwoLevelIterator to be able to make use of // on-stack BlockIter while the state is on heap. Currentlly it assumes @@ -267,7 +272,7 @@ Statistics* kNullStats = nullptr; index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); @@ -275,14 +280,7 @@ // Empty index. return; } - Slice input = biter.value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read first index partition"); - return; - } + handle = biter.value(); uint64_t prefetch_off = handle.offset(); // Read the last block's offset @@ -291,36 +289,21 @@ // Empty index. 
return; } - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read last index partition"); - return; - } + handle = biter.value(); uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; auto& file = table_->rep_->file; prefetch_buffer.reset(new FilePrefetchBuffer()); - s = prefetch_buffer->Prefetch(file.get(), prefetch_off, - static_cast(prefetch_len)); + Status s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + static_cast(prefetch_len)); // After prefetch, read the partitions one by one biter.SeekToFirst(); auto ro = ReadOptions(); Cache* block_cache = rep->table_options.block_cache.get(); for (; biter.Valid(); biter.Next()) { - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read index partition"); - continue; - } - + handle = biter.value(); BlockBasedTable::CachableEntry block; Slice compression_dict; if (rep->compression_dict_block) { @@ -371,11 +354,13 @@ PartitionIndexReader(BlockBasedTable* table, const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, Statistics* stats, - const int /*level*/, const bool index_key_includes_seq) + const int /*level*/, const bool index_key_includes_seq, + const bool index_value_is_full) : IndexReader(icomparator, stats), table_(table), index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } BlockBasedTable* table_; @@ -383,6 +368,7 @@ std::unordered_map> partition_map_; const bool index_key_includes_seq_; + const bool index_value_is_full_; }; // Index that allows binary search lookup for the first key of each block. 
@@ -401,7 +387,8 @@ const InternalKeyComparator* icomparator, IndexReader** index_reader, const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq) { + const bool index_key_includes_seq, + const bool index_value_is_full) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -412,19 +399,19 @@ if (s.ok()) { *index_reader = new BinarySearchIndexReader( icomparator, std::move(index_block), ioptions.statistics, - index_key_includes_seq); + index_key_includes_seq, index_value_is_full); } return s; } - virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr, - bool /*dont_care*/ = true, - bool /*dont_care*/ = true) override { + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool /*dont_care*/ = true, + bool /*dont_care*/ = true) override { Statistics* kNullStats = nullptr; return index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), iter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); } virtual size_t size() const override { return index_block_->size(); } @@ -446,31 +433,32 @@ private: BinarySearchIndexReader(const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, - Statistics* stats, const bool index_key_includes_seq) + Statistics* stats, const bool index_key_includes_seq, + const bool index_value_is_full) : IndexReader(icomparator, stats), index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } std::unique_ptr index_block_; const bool index_key_includes_seq_; + const bool index_value_is_full_; }; // Index that leverages an internal hash table to quicken the lookup for a given // key. class HashIndexReader : public IndexReader { public: - static Status Create(const SliceTransform* hash_key_extractor, - const Footer& footer, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, - const BlockHandle& index_handle, - InternalIterator* meta_index_iter, - IndexReader** index_reader, - bool /*hash_index_allow_collision*/, - const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq) { + static Status Create( + const SliceTransform* hash_key_extractor, const Footer& footer, + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icomparator, const BlockHandle& index_handle, + InternalIterator* meta_index_iter, IndexReader** index_reader, + bool /*hash_index_allow_collision*/, + const PersistentCacheOptions& cache_options, + const bool index_key_includes_seq, const bool index_value_is_full) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -486,9 +474,9 @@ // hard error. We can still fall back to the original binary search index. // So, Create will succeed regardless, from this point on. 
- auto new_index_reader = - new HashIndexReader(icomparator, std::move(index_block), - ioptions.statistics, index_key_includes_seq); + auto new_index_reader = new HashIndexReader( + icomparator, std::move(index_block), ioptions.statistics, + index_key_includes_seq, index_value_is_full); *index_reader = new_index_reader; // Get prefixes block @@ -542,13 +530,14 @@ return Status::OK(); } - virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr, - bool total_order_seek = true, - bool /*dont_care*/ = true) override { + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool total_order_seek = true, + bool /*dont_care*/ = true) override { Statistics* kNullStats = nullptr; return index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), iter, kNullStats, - total_order_seek, index_key_includes_seq_, prefix_index_.get()); + total_order_seek, index_key_includes_seq_, index_value_is_full_, + prefix_index_.get()); } virtual size_t size() const override { return index_block_->size(); } @@ -574,10 +563,12 @@ private: HashIndexReader(const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, Statistics* stats, - const bool index_key_includes_seq) + const bool index_key_includes_seq, + const bool index_value_is_full) : IndexReader(icomparator, stats), index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } @@ -588,6 +579,7 @@ std::unique_ptr prefix_index_; BlockContents prefixes_contents_; const bool index_key_includes_seq_; + const bool index_value_is_full_; }; // Helper function to setup the cache key's prefix for the Table. @@ -661,51 +653,71 @@ return true; } -SequenceNumber GetGlobalSequenceNumber(const TableProperties& table_properties, - Logger* info_log) { - auto& props = table_properties.user_collected_properties; - - auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); - auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); +// Caller has to ensure seqno is not nullptr. +Status GetGlobalSequenceNumber(const TableProperties& table_properties, + SequenceNumber largest_seqno, + SequenceNumber* seqno) { + const auto& props = table_properties.user_collected_properties; + const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); + const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); + *seqno = kDisableGlobalSequenceNumber; if (version_pos == props.end()) { if (seqno_pos != props.end()) { + std::array msg_buf; // This is not an external sst file, global_seqno is not supported. - assert(false); - ROCKS_LOG_ERROR( - info_log, + snprintf( + msg_buf.data(), msg_buf.max_size(), "A non-external sst file have global seqno property with value %s", seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); } - return kDisableGlobalSequenceNumber; + return Status::OK(); } uint32_t version = DecodeFixed32(version_pos->second.c_str()); if (version < 2) { if (seqno_pos != props.end() || version != 1) { + std::array msg_buf; // This is a v1 external sst file, global_seqno is not supported. 
- assert(false); - ROCKS_LOG_ERROR( - info_log, - "An external sst file with version %u have global seqno property " - "with value %s", - version, seqno_pos->second.c_str()); + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno " + "property with value %s", + version, seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); } - return kDisableGlobalSequenceNumber; + return Status::OK(); } - SequenceNumber global_seqno = DecodeFixed64(seqno_pos->second.c_str()); + // Since we have a plan to deprecate global_seqno, we do not return failure + // if seqno_pos == props.end(). We rely on version_pos to detect whether the + // SST is external. + SequenceNumber global_seqno(0); + if (seqno_pos != props.end()) { + global_seqno = DecodeFixed64(seqno_pos->second.c_str()); + } + if (global_seqno != 0 && global_seqno != largest_seqno) { + std::array msg_buf; + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %s, while largest seqno in the file is %llu", + version, seqno_pos->second.c_str(), + static_cast(largest_seqno)); + return Status::Corruption(msg_buf.data()); + } + global_seqno = largest_seqno; + *seqno = largest_seqno; if (global_seqno > kMaxSequenceNumber) { - assert(false); - ROCKS_LOG_ERROR( - info_log, - "An external sst file with version %u have global seqno property " - "with value %llu, which is greater than kMaxSequenceNumber", - version, global_seqno); + std::array msg_buf; + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %llu, which is greater than kMaxSequenceNumber", + version, static_cast(global_seqno)); + return Status::Corruption(msg_buf.data()); } - return global_seqno; + return Status::OK(); } } // namespace @@ -731,7 +743,9 @@ const SliceTransform* prefix_extractor, const bool prefetch_index_and_filter_in_cache, const bool skip_filters, const int level, - const bool immortal_table) { + const bool immortal_table, + const SequenceNumber largest_seqno, + TailPrefetchStats* tail_prefetch_stats) { table_reader->reset(); Footer footer; @@ -741,29 +755,40 @@ // prefetch both index and filters, down to all partitions const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; const bool preload_all = !table_options.cache_index_and_filter_blocks; - // Before read footer, readahead backwards to prefetch data. Do more readahead - // if we're going to read index/filter. - // TODO: This may incorrectly select small readahead in case partitioned - // index/filter is enabled and top-level partition pinning is enabled. That's - // because we need to issue readahead before we read the properties, at which - // point we don't yet know the index type. - const size_t kTailPrefetchSize = - prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + + size_t tail_prefetch_size = 0; + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes. + tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. + // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. 
+ // That's because we need to issue readahead before we read the properties, + // at which point we don't yet know the index type. + tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + } size_t prefetch_off; size_t prefetch_len; - if (file_size < kTailPrefetchSize) { + if (file_size < tail_prefetch_size) { prefetch_off = 0; prefetch_len = static_cast(file_size); } else { - prefetch_off = static_cast(file_size - kTailPrefetchSize); - prefetch_len = kTailPrefetchSize; + prefetch_off = static_cast(file_size - tail_prefetch_size); + prefetch_len = tail_prefetch_size; } + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", + &tail_prefetch_size); Status s; // TODO should not have this special logic in the future. if (!file->use_direct_io()) { + prefetch_buffer.reset(new FilePrefetchBuffer(nullptr, 0, 0, false, true)); s = file->Prefetch(prefetch_off, prefetch_len); } else { - prefetch_buffer.reset(new FilePrefetchBuffer()); + prefetch_buffer.reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true)); s = prefetch_buffer->Prefetch(file.get(), prefetch_off, prefetch_len); } s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, @@ -922,8 +947,12 @@ *(rep->table_properties), BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log); - rep->global_seqno = GetGlobalSequenceNumber(*(rep->table_properties), - rep->ioptions.info_log); + s = GetGlobalSequenceNumber(*(rep->table_properties), largest_seqno, + &(rep->global_seqno)); + if (!s.ok()) { + ROCKS_LOG_ERROR(rep->ioptions.info_log, "%s", s.ToString().c_str()); + return s; + } } // Read the range del meta block @@ -990,8 +1019,9 @@ bool disable_prefix_seek = rep->index_type == BlockBasedTableOptions::kHashSearch && need_upper_bound_check; - unique_ptr iter(new_table->NewIndexIterator( - ReadOptions(), disable_prefix_seek, nullptr, &index_entry)); + unique_ptr> iter( + new_table->NewIndexIterator(ReadOptions(), disable_prefix_seek, + nullptr, &index_entry)); s = iter->status(); if (s.ok()) { // This is the first call to NewIndexIterator() since we're in Open(). @@ -1060,6 +1090,12 @@ } if (s.ok()) { + assert(prefetch_buffer.get() != nullptr); + if (tail_prefetch_stats != nullptr) { + assert(prefetch_buffer->min_offset_read() < file_size); + tail_prefetch_stats->RecordEffectiveSize( + static_cast(file_size) - prefetch_buffer->min_offset_read()); + } *table_reader = std::move(new_table); } @@ -1148,8 +1184,16 @@ block->cache_handle = GetEntryFromCache( block_cache, block_cache_key, is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS, - is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, statistics, - get_context); + is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, + get_context + ? (is_index ? &get_context->get_context_stats_.num_cache_index_miss + : &get_context->get_context_stats_.num_cache_data_miss) + : nullptr, + get_context + ? (is_index ? 
&get_context->get_context_stats_.num_cache_index_hit + : &get_context->get_context_stats_.num_cache_data_hit) + : nullptr, + statistics, get_context); if (block->cache_handle != nullptr) { block->value = reinterpret_cast(block_cache->Value(block->cache_handle)); @@ -1204,24 +1248,26 @@ block_cache->TEST_mark_as_data_block(block_cache_key, charge); if (s.ok()) { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, charge); + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += charge; } else { RecordTick(statistics, BLOCK_CACHE_ADD); RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); } if (is_index) { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_INDEX_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_INDEX_BYTES_INSERT, charge); + get_context->get_context_stats_.num_cache_index_add++; + get_context->get_context_stats_.num_cache_index_bytes_insert += + charge; } else { RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); } } else { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_DATA_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_DATA_BYTES_INSERT, charge); + get_context->get_context_stats_.num_cache_data_add++; + get_context->get_context_stats_.num_cache_data_bytes_insert += + charge; } else { RecordTick(statistics, BLOCK_CACHE_DATA_ADD); RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); @@ -1303,24 +1349,25 @@ if (s.ok()) { assert(block->cache_handle != nullptr); if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, charge); + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += charge; } else { RecordTick(statistics, BLOCK_CACHE_ADD); RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); } if (is_index) { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_INDEX_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_INDEX_BYTES_INSERT, charge); + get_context->get_context_stats_.num_cache_index_add++; + get_context->get_context_stats_.num_cache_index_bytes_insert += + charge; } else { RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); } } else { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_DATA_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_DATA_BYTES_INSERT, charge); + get_context->get_context_stats_.num_cache_data_add++; + get_context->get_context_stats_.num_cache_data_bytes_insert += charge; } else { RecordTick(statistics, BLOCK_CACHE_DATA_ADD); RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); @@ -1378,7 +1425,9 @@ rep->whole_key_filtering, std::move(block), nullptr, rep->ioptions.statistics, rep->internal_comparator, this, rep_->table_properties == nullptr || - !rep_->table_properties->index_key_is_user_key); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } case Rep::FilterType::kBlockFilter: @@ -1445,9 +1494,13 @@ filter_blk_handle, cache_key); Statistics* statistics = rep_->ioptions.statistics; - auto cache_handle = - GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS, - BLOCK_CACHE_FILTER_HIT, statistics, get_context); + auto cache_handle = 
GetEntryFromCache( + block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, + get_context ? &get_context->get_context_stats_.num_cache_filter_miss + : nullptr, + get_context ? &get_context->get_context_stats_.num_cache_filter_hit + : nullptr, + statistics, get_context); FilterBlockReader* filter = nullptr; if (cache_handle != nullptr) { @@ -1468,10 +1521,11 @@ : Cache::Priority::LOW); if (s.ok()) { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, usage); - get_context->RecordCounters(BLOCK_CACHE_FILTER_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_FILTER_BYTES_INSERT, usage); + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += usage; + get_context->get_context_stats_.num_cache_filter_add++; + get_context->get_context_stats_.num_cache_filter_bytes_insert += + usage; } else { RecordTick(statistics, BLOCK_CACHE_ADD); RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); @@ -1491,7 +1545,7 @@ // disable_prefix_seek should be set to true when prefix_extractor found in SST // differs from the one in mutable_cf_options and index type is HashBasedIndex -InternalIterator* BlockBasedTable::NewIndexIterator( +InternalIteratorBase* BlockBasedTable::NewIndexIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* input_iter, CachableEntry* index_entry, GetContext* get_context) { @@ -1517,16 +1571,21 @@ GetCacheKeyFromOffset(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->dummy_index_reader_offset, cache_key); Statistics* statistics = rep_->ioptions.statistics; - auto cache_handle = - GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS, - BLOCK_CACHE_INDEX_HIT, statistics, get_context); + auto cache_handle = GetEntryFromCache( + block_cache, key, BLOCK_CACHE_INDEX_MISS, BLOCK_CACHE_INDEX_HIT, + get_context ? &get_context->get_context_stats_.num_cache_index_miss + : nullptr, + get_context ? &get_context->get_context_stats_.num_cache_index_hit + : nullptr, + statistics, get_context); if (cache_handle == nullptr && no_io) { if (input_iter != nullptr) { input_iter->Invalidate(Status::Incomplete("no blocking io")); return input_iter; } else { - return NewErrorInternalIterator(Status::Incomplete("no blocking io")); + return NewErrorInternalIterator( + Status::Incomplete("no blocking io")); } } @@ -1555,8 +1614,8 @@ if (s.ok()) { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, charge); + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += charge; } else { RecordTick(statistics, BLOCK_CACHE_ADD); RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); @@ -1573,7 +1632,7 @@ input_iter->Invalidate(s); return input_iter; } else { - return NewErrorInternalIterator(s); + return NewErrorInternalIterator(s); } } @@ -1594,21 +1653,6 @@ return iter; } -template -TBlockIter* BlockBasedTable::NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const Slice& index_value, - TBlockIter* input_iter, bool is_index, bool key_includes_seq, - GetContext* get_context, FilePrefetchBuffer* prefetch_buffer) { - BlockHandle handle; - Slice input = index_value; - // We intentionally allow extra stuff in index_value so that we - // can add more features in the future. 
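The overload being removed here used to parse a BlockHandle out of the raw index value; callers now receive a decoded BlockHandle directly from IndexBlockIter::value(). For reference, a handle is simply two varint64s (offset, size) at the front of the value, with any trailing bytes reserved for future extensions. A self-contained sketch of that decoding, with DecodeVarint64 standing in for the GetVarint64Ptr helper in util/coding.h:

    #include <cstddef>
    #include <cstdint>

    // LEB128-style varint64 decode; returns bytes consumed, or 0 on truncation.
    inline size_t DecodeVarint64(const char* p, const char* limit, uint64_t* out) {
      uint64_t result = 0;
      for (uint32_t shift = 0; shift <= 63; shift += 7) {
        if (p + shift / 7 >= limit) return 0;
        uint8_t byte = static_cast<uint8_t>(p[shift / 7]);
        result |= static_cast<uint64_t>(byte & 0x7f) << shift;
        if ((byte & 0x80) == 0) {
          *out = result;
          return shift / 7 + 1;
        }
      }
      return 0;  // malformed: more than 10 bytes with continuation bits set
    }

    struct SimpleHandle {
      uint64_t offset = 0;
      uint64_t size = 0;
    };

    // Decode (offset, size) from the start of an index value; extra trailing
    // bytes are intentionally ignored so the format can grow new fields.
    inline bool DecodeHandle(const char* p, size_t len, SimpleHandle* h) {
      const char* limit = p + len;
      size_t n = DecodeVarint64(p, limit, &h->offset);
      if (n == 0) return false;
      size_t m = DecodeVarint64(p + n, limit, &h->size);
      return m != 0;
    }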
- Status s = handle.DecodeFrom(&input); - return NewDataBlockIterator(rep, ro, handle, input_iter, is_index, - key_includes_seq, get_context, s, - prefetch_buffer); -} - // Convert an index iterator value (i.e., an encoded BlockHandle) // into an iterator over the contents of the corresponding block. // If input_iter is null, new a iterator @@ -1617,7 +1661,8 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, bool is_index, bool key_includes_seq, - GetContext* get_context, Status s, FilePrefetchBuffer* prefetch_buffer) { + bool index_key_is_full, GetContext* get_context, Status s, + FilePrefetchBuffer* prefetch_buffer) { PERF_TIMER_GUARD(new_table_block_iter_nanos); const bool no_io = (ro.read_tier == kBlockCacheTier); @@ -1667,7 +1712,8 @@ const bool kTotalOrderSeek = true; iter = block.value->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), - iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq); + iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, + index_key_is_full); if (block.cache_handle != nullptr) { iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, block.cache_handle); @@ -1782,22 +1828,20 @@ BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( BlockBasedTable* table, std::unordered_map>* block_map, - bool index_key_includes_seq) + bool index_key_includes_seq, bool index_key_is_full) : table_(table), block_map_(block_map), - index_key_includes_seq_(index_key_includes_seq) {} + index_key_includes_seq_(index_key_includes_seq), + index_key_is_full_(index_key_is_full) {} -template -const size_t BlockBasedTableIterator::kMaxReadaheadSize = +template +const size_t BlockBasedTableIterator::kMaxReadaheadSize = 256 * 1024; -InternalIterator* +InternalIteratorBase* BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( - const Slice& index_value) { + const BlockHandle& handle) { // Return a block iterator on the index partition - BlockHandle handle; - Slice input = index_value; - Status s = handle.DecodeFrom(&input); auto rep = table_->get_rep(); auto block = block_map_->find(handle.offset()); // This is a possible scenario since block cache might not have had space @@ -1813,10 +1857,10 @@ Statistics* kNullStats = nullptr; return block->second.value->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq_); + nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); } // Create an empty iterator - return new DataBlockIter(); + return new IndexBlockIter(); } // This will be broken if the user specifies an unusual implementation @@ -1889,7 +1933,7 @@ // Then, try find it within each block // we already know prefix_extractor and prefix_extractor_name must match // because `CheckPrefixMayMatch` first checks `check_filter_ == true` - unique_ptr iiter( + unique_ptr> iiter( NewIndexIterator(no_io_read_options, /* need_upper_bound_check */ false)); iiter->Seek(internal_prefix); @@ -1922,10 +1966,7 @@ // after the data block corresponding to iiter->key() cannot // possibly contain the key. Thus, the corresponding data block // is the only on could potentially contain the prefix. 
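The BLOCK_CACHE_* counter changes scattered through the hunks above follow one pattern: when a GetContext is available, the read path bumps plain per-read fields on get_context_stats_ and defers the shared ticker updates, instead of touching the Statistics object on every block access. A rough sketch of that batching idea (the struct and field names below are illustrative, not the RocksDB API):

    #include <atomic>
    #include <cstdint>

    struct SharedTickers {
      std::atomic<uint64_t> block_cache_index_hit{0};
      std::atomic<uint64_t> block_cache_index_miss{0};
    };

    struct PerReadStats {
      uint64_t num_cache_index_hit = 0;
      uint64_t num_cache_index_miss = 0;

      // One relaxed atomic add per counter per read, instead of one per lookup.
      void FlushTo(SharedTickers* tickers) const {
        tickers->block_cache_index_hit.fetch_add(num_cache_index_hit,
                                                 std::memory_order_relaxed);
        tickers->block_cache_index_miss.fetch_add(num_cache_index_miss,
                                                  std::memory_order_relaxed);
      }
    };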
- Slice handle_value = iiter->value(); - BlockHandle handle; - s = handle.DecodeFrom(&handle_value); - assert(s.ok()); + BlockHandle handle = iiter->value(); may_match = filter->PrefixMayMatch(prefix, prefix_extractor, handle.offset()); } @@ -1949,8 +1990,8 @@ return may_match; } -template -void BlockBasedTableIterator::Seek(const Slice& target) { +template +void BlockBasedTableIterator::Seek(const Slice& target) { is_out_of_bound_ = false; if (!CheckPrefixMayMatch(target)) { ResetDataIter(); @@ -1979,8 +2020,9 @@ block_iter_.key()) <= 0)); } -template -void BlockBasedTableIterator::SeekForPrev(const Slice& target) { +template +void BlockBasedTableIterator::SeekForPrev( + const Slice& target) { is_out_of_bound_ = false; if (!CheckPrefixMayMatch(target)) { ResetDataIter(); @@ -2022,8 +2064,8 @@ icomp_.Compare(target, block_iter_.key()) >= 0); } -template -void BlockBasedTableIterator::SeekToFirst() { +template +void BlockBasedTableIterator::SeekToFirst() { is_out_of_bound_ = false; SavePrevIndexValue(); index_iter_->SeekToFirst(); @@ -2036,8 +2078,8 @@ FindKeyForward(); } -template -void BlockBasedTableIterator::SeekToLast() { +template +void BlockBasedTableIterator::SeekToLast() { is_out_of_bound_ = false; SavePrevIndexValue(); index_iter_->SeekToLast(); @@ -2050,32 +2092,30 @@ FindKeyBackward(); } -template -void BlockBasedTableIterator::Next() { +template +void BlockBasedTableIterator::Next() { assert(block_iter_points_to_real_block_); block_iter_.Next(); FindKeyForward(); } -template -void BlockBasedTableIterator::Prev() { +template +void BlockBasedTableIterator::Prev() { assert(block_iter_points_to_real_block_); block_iter_.Prev(); FindKeyBackward(); } -template -void BlockBasedTableIterator::InitDataBlock() { - BlockHandle data_block_handle; - Slice handle_slice = index_iter_->value(); +template +void BlockBasedTableIterator::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value(); if (!block_iter_points_to_real_block_ || - handle_slice.compare(prev_index_value_) != 0 || + data_block_handle.offset() != prev_index_value_.offset() || // if previous attempt of reading the block missed cache, try again block_iter_.status().IsIncomplete()) { if (block_iter_points_to_real_block_) { ResetDataIter(); } - Status s = data_block_handle.DecodeFrom(&handle_slice); auto* rep = table_->get_rep(); // Automatically prefetch additional data when a range scan (iterator) does @@ -2107,16 +2147,17 @@ } } + Status s; BlockBasedTable::NewDataBlockIterator( rep, read_options_, data_block_handle, &block_iter_, is_index_, - key_includes_seq_, + key_includes_seq_, index_key_is_full_, /* get_context */ nullptr, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; } } -template -void BlockBasedTableIterator::FindKeyForward() { +template +void BlockBasedTableIterator::FindKeyForward() { assert(!is_out_of_bound_); // TODO the while loop inherits from two-level-iterator. We don't know // whether a block can be empty so it can be replaced by an "if". @@ -2155,8 +2196,8 @@ } } -template -void BlockBasedTableIterator::FindKeyBackward() { +template +void BlockBasedTableIterator::FindKeyBackward() { assert(!is_out_of_bound_); while (!block_iter_.Valid()) { if (!block_iter_.status().ok()) { @@ -2231,11 +2272,10 @@ return iter; } } - std::string str; - rep_->range_del_handle.EncodeTo(&str); // The meta-block exists but isn't in uncompressed block cache (maybe // because it is disabled), so go through the full lookup process. 
- return NewDataBlockIterator(rep_, read_options, Slice(str)); + return NewDataBlockIterator(rep_, read_options, + rep_->range_del_handle); } bool BlockBasedTable::FullFilterKeyMayMatch( @@ -2298,7 +2338,7 @@ auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, /* index_entry */ nullptr, get_context); - std::unique_ptr iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); } @@ -2306,12 +2346,10 @@ bool matched = false; // if such user key mathced a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - Slice handle_value = iiter->value(); + BlockHandle handle = iiter->value(); - BlockHandle handle; bool not_exist_in_filter = filter != nullptr && filter->IsBlockBased() == true && - handle.DecodeFrom(&handle_value).ok() && !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor, handle.offset(), no_io); @@ -2340,8 +2378,17 @@ break; } + bool may_exist = biter.SeekForGet(key); + if (!may_exist) { + // HashSeek cannot find the key this block and the the iter is not + // the end of the block, i.e. cannot be in the following blocks + // either. In this case, the seek_key cannot be found, so we break + // from the top level for-loop. + break; + } + // Call the *saver function on each entry/block until it returns false - for (biter.Seek(key); biter.Valid(); biter.Next()) { + for (; biter.Valid(); biter.Next()) { ParsedInternalKey parsed_key; if (!ParseInternalKey(biter.key(), &parsed_key)) { s = Status::Corruption(Slice()); @@ -2389,9 +2436,10 @@ IndexBlockIter iiter_on_stack; auto iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); - std::unique_ptr iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = std::unique_ptr(iiter); + iiter_unique_ptr = + std::unique_ptr>(iiter); } if (!iiter->status().ok()) { @@ -2404,7 +2452,7 @@ for (begin ? 
iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) { - Slice block_handle = iiter->value(); + BlockHandle block_handle = iiter->value(); const bool is_user_key = rep_->table_properties && rep_->table_properties->index_key_is_user_key > 0; if (end && @@ -2450,11 +2498,12 @@ } // Check Data blocks IndexBlockIter iiter_on_stack; - InternalIterator* iiter = + InternalIteratorBase* iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); - std::unique_ptr iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = std::unique_ptr(iiter); + iiter_unique_ptr = + std::unique_ptr>(iiter); } if (!iiter->status().ok()) { // error opening index iterator @@ -2464,19 +2513,41 @@ return s; } -Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) { +Status BlockBasedTable::VerifyChecksumInBlocks( + InternalIteratorBase* index_iter) { Status s; for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { s = index_iter->status(); if (!s.ok()) { break; } - BlockHandle handle; - Slice input = index_iter->value(); - s = handle.DecodeFrom(&input); + BlockHandle handle = index_iter->value(); + BlockContents contents; + Slice dummy_comp_dict; + BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */, + rep_->footer, ReadOptions(), handle, &contents, + rep_->ioptions, false /* decompress */, + dummy_comp_dict /*compression dict*/, + rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + break; + } + } + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks( + InternalIteratorBase* index_iter) { + Status s; + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); if (!s.ok()) { break; } + BlockHandle handle; + Slice input = index_iter->value(); + s = handle.DecodeFrom(&input); BlockContents contents; Slice dummy_comp_dict; BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */, @@ -2494,15 +2565,13 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { - std::unique_ptr iiter(NewIndexIterator(options)); + std::unique_ptr> iiter( + NewIndexIterator(options)); iiter->Seek(key); assert(iiter->Valid()); CachableEntry block; - BlockHandle handle; - Slice input = iiter->value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); + BlockHandle handle = iiter->value(); Cache* block_cache = rep_->table_options.block_cache.get(); assert(block_cache != nullptr); @@ -2512,6 +2581,7 @@ cache_key_storage); Slice ckey; + Status s; s = GetDataBlockFromCache( cache_key, ckey, block_cache, nullptr, rep_->ioptions, options, &block, rep_->table_options.format_version, @@ -2572,14 +2642,18 @@ rep_->ioptions, icomparator, index_reader, rep_->persistent_cache_options, level, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } case BlockBasedTableOptions::kBinarySearch: { return BinarySearchIndexReader::Create( file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, icomparator, index_reader, rep_->persistent_cache_options, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + 
rep_->table_properties->index_value_is_delta_encoded == 0); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; @@ -2599,7 +2673,9 @@ rep_->ioptions, icomparator, index_reader, rep_->persistent_cache_options, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } meta_index_iter = meta_iter_guard.get(); } @@ -2610,7 +2686,9 @@ index_reader, rep_->hash_index_allow_collision, rep_->persistent_cache_options, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } default: { std::string error_message = @@ -2621,22 +2699,14 @@ } uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { - unique_ptr index_iter(NewIndexIterator(ReadOptions())); + unique_ptr> index_iter( + NewIndexIterator(ReadOptions())); index_iter->Seek(key); uint64_t result; if (index_iter->Valid()) { - BlockHandle handle; - Slice input = index_iter->value(); - Status s = handle.DecodeFrom(&input); - if (s.ok()) { - result = handle.offset(); - } else { - // Strange: we can't decode the block handle in the index block. - // We'll just return the offset of the metaindex block, which is - // close to the whole file size for this case. - result = rep_->footer.metaindex_handle().offset(); - } + BlockHandle handle = index_iter->value(); + result = handle.offset(); } else { // key is past the last key in the file. If table_properties is not // available, approximate the offset by returning the offset of the @@ -2663,7 +2733,7 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( std::vector* kv_pair_blocks) { - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); @@ -2770,32 +2840,32 @@ " "); out_file->Append(table_properties->ToString("\n ", ": ").c_str()); out_file->Append("\n"); - } - // Output Filter blocks - if (!rep_->filter && !table_properties->filter_policy_name.empty()) { - // Support only BloomFilter as off now - rocksdb::BlockBasedTableOptions table_options; - table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); - if (table_properties->filter_policy_name.compare( - table_options.filter_policy->Name()) == 0) { - std::string filter_block_key = kFilterBlockPrefix; - filter_block_key.append(table_properties->filter_policy_name); - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { - BlockContents block; - Slice dummy_comp_dict; - BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, - ReadOptions(), handle, &block, rep_->ioptions, false /*decompress*/, - dummy_comp_dict /*compression dict*/, - rep_->persistent_cache_options); - s = block_fetcher.ReadBlockContents(); - if (!s.ok()) { - rep_->filter.reset(new BlockBasedFilterBlockReader( - prefix_extractor, table_options, - table_options.whole_key_filtering, std::move(block), - rep_->ioptions.statistics)); + // Output Filter blocks + if (!rep_->filter && !table_properties->filter_policy_name.empty()) { + // Support only BloomFilter as off now + rocksdb::BlockBasedTableOptions table_options; + table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); + if 
(table_properties->filter_policy_name.compare( + table_options.filter_policy->Name()) == 0) { + std::string filter_block_key = kFilterBlockPrefix; + filter_block_key.append(table_properties->filter_policy_name); + BlockHandle handle; + if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { + BlockContents block; + Slice dummy_comp_dict; + BlockFetcher block_fetcher( + rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, + ReadOptions(), handle, &block, rep_->ioptions, + false /*decompress*/, dummy_comp_dict /*compression dict*/, + rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + rep_->filter.reset(new BlockBasedFilterBlockReader( + prefix_extractor, table_options, + table_options.whole_key_filtering, std::move(block), + rep_->ioptions.statistics)); + } } } } @@ -2878,7 +2948,7 @@ out_file->Append( "Index Details:\n" "--------------------------------------\n"); - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); if (!s.ok()) { @@ -2927,7 +2997,7 @@ } Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); if (!s.ok()) { @@ -2947,9 +3017,7 @@ break; } - Slice bh_val = blockhandles_iter->value(); - BlockHandle bh; - bh.DecodeFrom(&bh_val); + BlockHandle bh = blockhandles_iter->value(); uint64_t datablock_size = bh.size(); datablock_size_min = std::min(datablock_size_min, datablock_size); datablock_size_max = std::max(datablock_size_max, datablock_size); diff -Nru rocksdb-5.15.10/table/block_based_table_reader.h rocksdb-5.17.2/table/block_based_table_reader.h --- rocksdb-5.15.10/table/block_based_table_reader.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_based_table_reader.h 2018-11-12 19:57:32.000000000 +0000 @@ -23,6 +23,7 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" #include "table/block.h" +#include "table/block_based_table_factory.h" #include "table/filter_block.h" #include "table/format.h" #include "table/persistent_cache_helper.h" @@ -50,7 +51,6 @@ struct EnvOptions; struct ReadOptions; class GetContext; -class InternalIterator; using std::unique_ptr; @@ -93,7 +93,9 @@ const SliceTransform* prefix_extractor = nullptr, bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false, int level = -1, - const bool immortal_table = false); + const bool immortal_table = false, + const SequenceNumber largest_seqno = 0, + TailPrefetchStats* tail_prefetch_stats = nullptr); bool PrefixMayMatch(const Slice& internal_key, const ReadOptions& read_options, @@ -175,9 +177,9 @@ // to // a different object then iter and the callee has the ownership of the // returned object. - virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr, - bool total_order_seek = true, - bool fill_cache = true) = 0; + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool total_order_seek = true, + bool fill_cache = true) = 0; // The size of the index. 
virtual size_t size() const = 0; @@ -221,14 +223,16 @@ static TBlockIter* NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const Slice& index_value, TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, GetContext* get_context = nullptr, + bool key_includes_seq = true, bool index_key_is_full = true, + GetContext* get_context = nullptr, FilePrefetchBuffer* prefetch_buffer = nullptr); template static TBlockIter* NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, GetContext* get_context = nullptr, - Status s = Status(), FilePrefetchBuffer* prefetch_buffer = nullptr); + bool key_includes_seq = true, bool index_key_is_full = true, + GetContext* get_context = nullptr, Status s = Status(), + FilePrefetchBuffer* prefetch_buffer = nullptr); class PartitionedIndexIteratorState; @@ -281,7 +285,7 @@ // 2. index is not present in block cache. // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier - InternalIterator* NewIndexIterator( + InternalIteratorBase* NewIndexIterator( const ReadOptions& read_options, bool need_upper_bound_check = false, IndexBlockIter* input_iter = nullptr, CachableEntry* index_entry = nullptr, @@ -350,7 +354,8 @@ std::unique_ptr* meta_block, std::unique_ptr* iter); - Status VerifyChecksumInBlocks(InternalIterator* index_iter); + Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); // Create the filter from the filter block. virtual FilterBlockReader* ReadFilter( @@ -387,14 +392,16 @@ PartitionedIndexIteratorState( BlockBasedTable* table, std::unordered_map>* block_map, - const bool index_key_includes_seq); - InternalIterator* NewSecondaryIterator(const Slice& index_value) override; + const bool index_key_includes_seq, const bool index_key_is_full); + InternalIteratorBase* NewSecondaryIterator( + const BlockHandle& index_value) override; private: // Don't own table_ BlockBasedTable* table_; std::unordered_map>* block_map_; bool index_key_includes_seq_; + bool index_key_is_full_; }; // CachableEntry represents the entries that *may* be fetched from block cache. @@ -518,16 +525,17 @@ const bool immortal_table; }; -template -class BlockBasedTableIterator : public InternalIterator { +template +class BlockBasedTableIterator : public InternalIteratorBase { public: BlockBasedTableIterator(BlockBasedTable* table, const ReadOptions& read_options, const InternalKeyComparator& icomp, - InternalIterator* index_iter, bool check_filter, - bool need_upper_bound_check, + InternalIteratorBase* index_iter, + bool check_filter, bool need_upper_bound_check, const SliceTransform* prefix_extractor, bool is_index, bool key_includes_seq = true, + bool index_key_is_full = true, bool for_compaction = false) : table_(table), read_options_(read_options), @@ -540,6 +548,7 @@ prefix_extractor_(prefix_extractor), is_index_(is_index), key_includes_seq_(key_includes_seq), + index_key_is_full_(index_key_is_full), for_compaction_(for_compaction) {} ~BlockBasedTableIterator() { delete index_iter_; } @@ -558,7 +567,7 @@ assert(Valid()); return block_iter_.key(); } - Slice value() const override { + TValue value() const override { assert(Valid()); return block_iter_.value(); } @@ -615,8 +624,7 @@ if (block_iter_points_to_real_block_) { // Reseek. If they end up with the same data block, we shouldn't re-fetch // the same data block. 
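The iterator now keeps prev_index_value_ as a BlockHandle (see the hunks around SavePrevIndexValue and InitDataBlock) and compares block offsets rather than a copied string of the encoded handle. A small sketch of the reseek check this enables, with illustrative names:

    #include <cstdint>

    struct HandleLite {
      uint64_t offset = 0;
      uint64_t size = 0;
    };

    class DataBlockCursor {
     public:
      // Rebuild the data-block iterator only if the index now points at a
      // different block, or the previous attempt missed the cache.
      bool NeedsNewBlock(const HandleLite& current, bool prev_read_incomplete) const {
        return !has_block_ || current.offset != prev_.offset || prev_read_incomplete;
      }
      void OnBlockOpened(const HandleLite& h) {
        prev_ = h;
        has_block_ = true;
      }
     private:
      HandleLite prev_;
      bool has_block_ = false;
    };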
- Slice v = index_iter_->value(); - prev_index_value_.assign(v.data(), v.size()); + prev_index_value_ = index_iter_->value(); } } @@ -628,7 +636,7 @@ BlockBasedTable* table_; const ReadOptions read_options_; const InternalKeyComparator& icomp_; - InternalIterator* index_iter_; + InternalIteratorBase* index_iter_; PinnedIteratorsManager* pinned_iters_mgr_; TBlockIter block_iter_; bool block_iter_points_to_real_block_; @@ -641,10 +649,10 @@ bool is_index_; // If the keys in the blocks over which we iterate include 8 byte sequence bool key_includes_seq_; + bool index_key_is_full_; // If this iterator is created for compaction bool for_compaction_; - // TODO use block offset instead - std::string prev_index_value_; + BlockHandle prev_index_value_; static const size_t kInitReadaheadSize = 8 * 1024; // Found that 256 KB readahead size provides the best performance, based on diff -Nru rocksdb-5.15.10/table/block_builder.cc rocksdb-5.17.2/table/block_builder.cc --- rocksdb-5.15.10/table/block_builder.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_builder.cc 2018-11-12 19:57:32.000000000 +0000 @@ -33,20 +33,36 @@ #include "table/block_builder.h" -#include #include -#include "rocksdb/comparator.h" +#include #include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "table/data_block_footer.h" #include "util/coding.h" namespace rocksdb { -BlockBuilder::BlockBuilder(int block_restart_interval, bool use_delta_encoding) +BlockBuilder::BlockBuilder( + int block_restart_interval, bool use_delta_encoding, + bool use_value_delta_encoding, + BlockBasedTableOptions::DataBlockIndexType index_type, + double data_block_hash_table_util_ratio) : block_restart_interval_(block_restart_interval), use_delta_encoding_(use_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), restarts_(), counter_(0), finished_(false) { + switch (index_type) { + case BlockBasedTableOptions::kDataBlockBinarySearch: + break; + case BlockBasedTableOptions::kDataBlockBinaryAndHash: + data_block_hash_index_builder_.Initialize( + data_block_hash_table_util_ratio); + break; + default: + assert(0); + } assert(block_restart_interval_ >= 1); restarts_.push_back(0); // First restart point is at offset 0 estimate_ = sizeof(uint32_t) + sizeof(uint32_t); @@ -60,19 +76,35 @@ counter_ = 0; finished_ = false; last_key_.clear(); + if (data_block_hash_index_builder_.Valid()) { + data_block_hash_index_builder_.Reset(); + } } size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value) const { size_t estimate = CurrentSizeEstimate(); - estimate += key.size() + value.size(); + // Note: this is an imprecise estimate as it accounts for the whole key size + // instead of non-shared key size. + estimate += key.size(); + // In value delta encoding we estimate the value delta size as half the full + // value size since only the size field of block handle is encoded. + estimate += + !use_value_delta_encoding_ || (counter_ >= block_restart_interval_) + ? value.size() + : value.size() / 2; + if (counter_ >= block_restart_interval_) { estimate += sizeof(uint32_t); // a new restart entry. } estimate += sizeof(int32_t); // varint for shared prefix length. + // Note: this is an imprecise estimate as we will have to encoded size, one + // for shared key and one for non-shared key. estimate += VarintLength(key.size()); // varint for key length. - estimate += VarintLength(value.size()); // varint for value length. 
+ if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) { + estimate += VarintLength(value.size()); // varint for value length. + } return estimate; } @@ -82,14 +114,29 @@ for (size_t i = 0; i < restarts_.size(); i++) { PutFixed32(&buffer_, restarts_[i]); } - PutFixed32(&buffer_, static_cast(restarts_.size())); + + uint32_t num_restarts = static_cast(restarts_.size()); + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch; + if (data_block_hash_index_builder_.Valid() && + CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) { + data_block_hash_index_builder_.Finish(buffer_); + index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } + + // footer is a packed format of data_block_index_type and num_restarts + uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts); + + PutFixed32(&buffer_, block_footer); finished_ = true; return Slice(buffer_); } -void BlockBuilder::Add(const Slice& key, const Slice& value) { +void BlockBuilder::Add(const Slice& key, const Slice& value, + const Slice* const delta_value) { assert(!finished_); assert(counter_ <= block_restart_interval_); + assert(!use_value_delta_encoding_ || delta_value); size_t shared = 0; // number of bytes shared with prev key if (counter_ >= block_restart_interval_) { // Restart compression @@ -115,14 +162,32 @@ const size_t non_shared = key.size() - shared; const size_t curr_size = buffer_.size(); - // Add "" to buffer_ - PutVarint32Varint32Varint32(&buffer_, static_cast(shared), - static_cast(non_shared), - static_cast(value.size())); + if (use_value_delta_encoding_) { + // Add "" to buffer_ + PutVarint32Varint32(&buffer_, static_cast(shared), + static_cast(non_shared)); + } else { + // Add "" to buffer_ + PutVarint32Varint32Varint32(&buffer_, static_cast(shared), + static_cast(non_shared), + static_cast(value.size())); + } // Add string delta to buffer_ followed by value buffer_.append(key.data() + shared, non_shared); - buffer_.append(value.data(), value.size()); + // Use value delta encoding only when the key has shared bytes. This would + // simplify the decoding, where it can figure which decoding to use simply by + // looking at the shared bytes size. + if (shared != 0 && use_value_delta_encoding_) { + buffer_.append(delta_value->data(), delta_value->size()); + } else { + buffer_.append(value.data(), value.size()); + } + + if (data_block_hash_index_builder_.Valid()) { + data_block_hash_index_builder_.Add(ExtractUserKey(key), + restarts_.size() - 1); + } counter_++; estimate_ += buffer_.size() - curr_size; diff -Nru rocksdb-5.15.10/table/block_builder.h rocksdb-5.17.2/table/block_builder.h --- rocksdb-5.15.10/table/block_builder.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_builder.h 2018-11-12 19:57:32.000000000 +0000 @@ -12,6 +12,8 @@ #include #include "rocksdb/slice.h" +#include "rocksdb/table.h" +#include "table/data_block_hash_index.h" namespace rocksdb { @@ -21,14 +23,19 @@ void operator=(const BlockBuilder&) = delete; explicit BlockBuilder(int block_restart_interval, - bool use_delta_encoding = true); + bool use_delta_encoding = true, + bool use_value_delta_encoding = false, + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch, + double data_block_hash_table_util_ratio = 0.75); // Reset the contents as if the BlockBuilder was just constructed. void Reset(); // REQUIRES: Finish() has not been called since the last call to Reset(). 
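For contrast with the Add() changes above, this is what a classic (non-delta-value) data-block entry looks like when appended: the varint triple (shared, non_shared, value_size), followed by the unshared key bytes and the value. PutVarint32 below is a simplified stand-in for the util/coding.h helper:

    #include <cstdint>
    #include <string>

    inline void PutVarint32(std::string* dst, uint32_t v) {
      while (v >= 0x80) {
        dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
        v >>= 7;
      }
      dst->push_back(static_cast<char>(v));
    }

    // Append one entry in the classic format:
    // <shared><non_shared><value_size> <key delta> <value>
    inline void AppendClassicEntry(std::string* buf, uint32_t shared,
                                   const std::string& key_delta,
                                   const std::string& value) {
      PutVarint32(buf, shared);
      PutVarint32(buf, static_cast<uint32_t>(key_delta.size()));
      PutVarint32(buf, static_cast<uint32_t>(value.size()));
      buf->append(key_delta);
      buf->append(value);
    }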
// REQUIRES: key is larger than any previously added key - void Add(const Slice& key, const Slice& value); + void Add(const Slice& key, const Slice& value, + const Slice* const delta_value = nullptr); // Finish building the block and return a slice that refers to the // block contents. The returned slice will remain valid for the @@ -37,7 +44,11 @@ // Returns an estimate of the current (uncompressed) size of the block // we are building. - inline size_t CurrentSizeEstimate() const { return estimate_; } + inline size_t CurrentSizeEstimate() const { + return estimate_ + (data_block_hash_index_builder_.Valid() + ? data_block_hash_index_builder_.EstimateSize() + : 0); + } // Returns an estimated block size after appending key and value. size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const; @@ -49,7 +60,10 @@ private: const int block_restart_interval_; + // TODO(myabandeh): put it into a separate IndexBlockBuilder const bool use_delta_encoding_; + // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values + const bool use_value_delta_encoding_; std::string buffer_; // Destination buffer std::vector restarts_; // Restart points @@ -57,6 +71,7 @@ int counter_; // Number of entries emitted since restart bool finished_; // Has Finish() been called? std::string last_key_; + DataBlockHashIndexBuilder data_block_hash_index_builder_; }; } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/block.cc rocksdb-5.17.2/table/block.cc --- rocksdb-5.15.10/table/block.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block.cc 2018-11-12 19:57:32.000000000 +0000 @@ -20,6 +20,7 @@ #include "port/stack_trace.h" #include "rocksdb/comparator.h" #include "table/block_prefix_index.h" +#include "table/data_block_footer.h" #include "table/format.h" #include "util/coding.h" #include "util/logging.h" @@ -33,28 +34,65 @@ // // If any errors are detected, returns nullptr. Otherwise, returns a // pointer to the key delta (just past the three decoded values). -static inline const char* DecodeEntry(const char* p, const char* limit, - uint32_t* shared, - uint32_t* non_shared, - uint32_t* value_length) { - if (limit - p < 3) return nullptr; - *shared = reinterpret_cast(p)[0]; - *non_shared = reinterpret_cast(p)[1]; - *value_length = reinterpret_cast(p)[2]; - if ((*shared | *non_shared | *value_length) < 128) { - // Fast path: all three values are encoded in one byte each - p += 3; - } else { - if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; - if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; - if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr; - } +struct DecodeEntry { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. 
+ assert(limit - p >= 3); + *shared = reinterpret_cast(p)[0]; + *non_shared = reinterpret_cast(p)[1]; + *value_length = reinterpret_cast(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { + return nullptr; + } + } - if (static_cast(limit - p) < (*non_shared + *value_length)) { - return nullptr; + // Using an assert in place of "return null" since we should not pay the + // cost of checking for corruption on every single key decoding + assert(!(static_cast(limit - p) < (*non_shared + *value_length))); + return p; + } +}; + +struct DecodeKey { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + uint32_t value_length; + return DecodeEntry()(p, limit, shared, non_shared, &value_length); + } +}; + +// In format_version 4, which is used by index blocks, the value size is not +// encoded before the entry, as the value is known to be the handle with the +// known size. +struct DecodeKeyV4 { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + if (limit - p < 3) return nullptr; + *shared = reinterpret_cast(p)[0]; + *non_shared = reinterpret_cast(p)[1]; + if ((*shared | *non_shared) < 128) { + // Fast path: all three values are encoded in one byte each + p += 2; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + } + return p; } - return p; -} +}; void DataBlockIter::Next() { assert(Valid()); @@ -170,7 +208,8 @@ return; } uint32_t index = 0; - bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, comparator_); + bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + comparator_); if (!ok) { return; @@ -185,6 +224,123 @@ } } +// Optimized Seek for point lookup for an internal key `target` +// target = "seek_user_key @ type | seqno". +// +// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// or kTypeBlobIndex, this function behaves identically as Seek(). +// +// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// or kTypeBlobIndex: +// +// If the return value is FALSE, iter location is undefined, and it means: +// 1) there is no key in this block falling into the range: +// ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"], +// inclusive; AND +// 2) the last key of this block has a greater user_key from seek_user_key +// +// If the return value is TRUE, iter location has two possibilies: +// 1) If iter is valid, it is set to a location as if set by BinarySeek. In +// this case, it points to the first key_ with a larger user_key or a +// matching user_key with a seqno no greater than the seeking seqno. +// 2) If the iter is invalid, it means that either all the user_key is less +// than the seek_user_key, or the block ends with a matching user_key but +// with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno +// but larger type). 
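SeekForGetImpl() below consults the block's hash index: each user key hashes to a byte-sized bucket holding either a restart-interval number, kNoEntry, or kCollision, and the point lookup then scans only that restart interval. A toy illustration of the bucket map follows; RocksDB's DataBlockHashIndex uses its own hash function and serialized layout, and the sentinel values here are assumptions matching the kNoEntry/kCollision names used in the patch:

    #include <cstdint>
    #include <functional>
    #include <string>
    #include <vector>

    constexpr uint8_t kNoEntry = 255;    // assumed sentinel values
    constexpr uint8_t kCollision = 254;

    class TinyDataBlockHashIndex {
     public:
      explicit TinyDataBlockHashIndex(size_t num_buckets)
          : buckets_(num_buckets, kNoEntry) {}

      void Add(const std::string& user_key, uint8_t restart_index) {
        uint8_t& slot = buckets_[Bucket(user_key)];
        if (slot == kNoEntry) {
          slot = restart_index;          // first key mapped to this bucket
        } else if (slot != restart_index) {
          slot = kCollision;             // keys from different intervals collide
        }
      }

      // Returns a restart interval to scan, kNoEntry, or kCollision (caller
      // falls back to a regular Seek on collision).
      uint8_t Lookup(const std::string& user_key) const {
        return buckets_[Bucket(user_key)];
      }

     private:
      size_t Bucket(const std::string& k) const {
        return std::hash<std::string>{}(k) % buckets_.size();
      }
      std::vector<uint8_t> buckets_;
    };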
+bool DataBlockIter::SeekForGetImpl(const Slice& target) { + Slice user_key = ExtractUserKey(target); + uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t); + uint8_t entry = data_block_hash_index_->Lookup(data_, map_offset, user_key); + + if (entry == kCollision) { + // HashSeek not effective, falling back + Seek(target); + return true; + } + + if (entry == kNoEntry) { + // Even if we cannot find the user_key in this block, the result may + // exist in the next block. Consider this exmpale: + // + // Block N: [aab@100, ... , app@120] + // bounary key: axy@50 (we make minimal assumption about a boundary key) + // Block N+1: [axy@10, ... ] + // + // If seek_key = axy@60, the search will starts from Block N. + // Even if the user_key is not found in the hash map, the caller still + // have to conntinue searching the next block. + // + // In this case, we pretend the key is the the last restart interval. + // The while-loop below will search the last restart interval for the + // key. It will stop at the first key that is larger than the seek_key, + // or to the end of the block if no one is larger. + entry = static_cast(num_restarts_ - 1); + } + + uint32_t restart_index = entry; + + // check if the key is in the restart_interval + assert(restart_index < num_restarts_); + SeekToRestartPoint(restart_index); + + const char* limit = nullptr; + if (restart_index_ + 1 < num_restarts_) { + limit = data_ + GetRestartPoint(restart_index_ + 1); + } else { + limit = data_ + restarts_; + } + + while (true) { + // Here we only linear seek the target key inside the restart interval. + // If a key does not exist inside a restart interval, we avoid + // further searching the block content accross restart interval boundary. + // + // TODO(fwu): check the left and write boundary of the restart interval + // to avoid linear seek a target key that is out of range. + if (!ParseNextDataKey(limit) || Compare(key_, target) >= 0) { + // we stop at the first potential matching user key. + break; + } + } + + if (current_ == restarts_) { + // Search reaches to the end of the block. There are three possibilites: + // 1) there is only one user_key match in the block (otherwise collsion). + // the matching user_key resides in the last restart interval, and it + // is the last key of the restart interval and of the block as well. + // ParseNextDataKey() skiped it as its [ type | seqno ] is smaller. + // + // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry, + // AND all existing user_keys in the restart interval are smaller than + // seek_user_key. + // + // 3) The seek_key is a false positive and happens to be hashed to the + // last restart interval, AND all existing user_keys in the restart + // interval are smaller than seek_user_key. + // + // The result may exist in the next block each case, so we return true. + return true; + } + + if (user_comparator_->Compare(key_.GetUserKey(), user_key) != 0) { + // the key is not in this block and cannot be at the next block either. + return false; + } + + // Here we are conservative and only support a limited set of cases + ValueType value_type = ExtractValueType(key_.GetKey()); + if (value_type != ValueType::kTypeValue && + value_type != ValueType::kTypeDeletion && + value_type != ValueType::kTypeSingleDeletion && + value_type != ValueType::kTypeBlobIndex) { + Seek(target); + return true; + } + + // Result found, and the iter is correctly set. 
+ return true; +} + void IndexBlockIter::Seek(const Slice& target) { Slice seek_key = target; if (!key_includes_seq_) { @@ -198,8 +354,12 @@ bool ok = false; if (prefix_index_) { ok = PrefixSeek(target, &index); + } else if (value_delta_encoded_) { + ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + comparator_); } else { - ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, active_comparator_); + ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + comparator_); } if (!ok) { @@ -222,7 +382,8 @@ return; } uint32_t index = 0; - bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, comparator_); + bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + comparator_); if (!ok) { return; @@ -277,7 +438,8 @@ } } -void BlockIter::CorruptionError() { +template +void BlockIter::CorruptionError() { current_ = restarts_; restart_index_ = num_restarts_; status_ = Status::Corruption("bad entry in block"); @@ -285,10 +447,13 @@ value_.clear(); } -bool DataBlockIter::ParseNextDataKey() { +bool DataBlockIter::ParseNextDataKey(const char* limit) { current_ = NextEntryOffset(); const char* p = data_ + current_; - const char* limit = data_ + restarts_; // Restarts come right after data + if (!limit) { + limit = data_ + restarts_; // Restarts come right after data + } + if (p >= limit) { // No more entries to return. Mark as invalid. current_ = restarts_; @@ -298,7 +463,7 @@ // Decode next entry uint32_t shared, non_shared, value_length; - p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); if (p == nullptr || key_.Size() < shared) { CorruptionError(); return false; @@ -340,10 +505,14 @@ } value_ = Slice(p + non_shared, value_length); - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed return true; } } @@ -361,7 +530,12 @@ // Decode next entry uint32_t shared, non_shared, value_length; - p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + if (value_delta_encoded_) { + p = DecodeKeyV4()(p, limit, &shared, &non_shared); + value_length = 0; + } else { + p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); + } if (p == nullptr || key_.Size() < shared) { CorruptionError(); return false; @@ -377,27 +551,71 @@ key_pinned_ = false; } value_ = Slice(p + non_shared, value_length); - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed + if (value_delta_encoded_) { + assert(value_length == 0); + DecodeCurrentValue(shared); } return true; } +// The format: +// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// ... +// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// where, k is key, v is value, and its encoding is in parenthesis. 
+// The format of each key is (shared_size, non_shared_size, shared, non_shared) +// The format of each value, i.e., block hanlde, is (offset, size) whenever the +// shared_size is 0, which included the first entry in each restart point. +// Otherwise the format is delta-size = block handle size - size of last block +// handle. +void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { + assert(value_delta_encoded_); + const char* limit = data_ + restarts_; + if (shared == 0) { + uint64_t o, s; + const char* newp = GetVarint64Ptr(value_.data(), limit, &o); + assert(newp); + newp = GetVarint64Ptr(newp, limit, &s); + assert(newp); + decoded_value_ = BlockHandle(o, s); + value_ = Slice(value_.data(), newp - value_.data()); + } else { + uint64_t next_value_base = + decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize; + int64_t delta; + const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta); + decoded_value_ = + BlockHandle(next_value_base, decoded_value_.size() + delta); + value_ = Slice(value_.data(), newp - value_.data()); + } +} + // Binary search in restart array to find the first restart point that // is either the last restart point with a key less than target, // which means the key of next restart point is larger than target, or // the first restart point with a key = target -bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index, const Comparator* comp) { +template +template +bool BlockIter::BinarySeek(const Slice& target, uint32_t left, + uint32_t right, uint32_t* index, + const Comparator* comp) { assert(left <= right); while (left < right) { uint32_t mid = (left + right + 1) / 2; uint32_t region_offset = GetRestartPoint(mid); - uint32_t shared, non_shared, value_length; - const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_, - &shared, &non_shared, &value_length); + uint32_t shared, non_shared; + const char* key_ptr = DecodeKeyFunc()( + data_ + region_offset, data_ + restarts_, &shared, &non_shared); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return false; @@ -425,9 +643,13 @@ // Return -1 if error. int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { uint32_t region_offset = GetRestartPoint(block_index); - uint32_t shared, non_shared, value_length; - const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_, - &shared, &non_shared, &value_length); + uint32_t shared, non_shared; + const char* key_ptr = + value_delta_encoded_ + ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared) + : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return 1; // Return target is smaller @@ -507,7 +729,43 @@ uint32_t Block::NumRestarts() const { assert(size_ >= 2*sizeof(uint32_t)); - return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // In BlockBuilder, we have ensured a block with HashIndex is less than + // kMaxBlockSizeSupportedByHashIndex (64KiB). + // + // Therefore, if we encounter a block with a size > 64KiB, the block + // cannot have HashIndex. So the footer will directly interpreted as + // num_restarts. + // + // Such check is for backward compatibility. We can ensure legacy block + // with a vary large num_restarts i.e. 
>= 0x80000000 can be interpreted + // correctly as no HashIndex even if the MSB of num_restarts is set. + return num_restarts; + } + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return num_restarts; +} + +BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const { + assert(size_ >= 2 * sizeof(uint32_t)); + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // The check is for the same reason as that in NumRestarts() + return BlockBasedTableOptions::kDataBlockBinarySearch; + } + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return index_type; +} + +Block::~Block() { + // This sync point can be re-enabled if RocksDB can control the + // initialization order of any/all static options created by the user. + // TEST_SYNC_POINT("Block::~Block"); } Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, @@ -518,18 +776,49 @@ restart_offset_(0), num_restarts_(0), global_seqno_(_global_seqno) { + TEST_SYNC_POINT("Block::Block:0"); if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { // Should only decode restart points for uncompressed blocks if (compression_type() == kNoCompression) { num_restarts_ = NumRestarts(); - restart_offset_ = - static_cast(size_) - (1 + num_restarts_) * sizeof(uint32_t); - if (restart_offset_ > size_ - sizeof(uint32_t)) { - // The size is too small for NumRestarts() and therefore - // restart_offset_ wrapped around. - size_ = 0; + switch (IndexType()) { + case BlockBasedTableOptions::kDataBlockBinarySearch: + restart_offset_ = static_cast(size_) - + (1 + num_restarts_) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. + size_ = 0; + } + break; + case BlockBasedTableOptions::kDataBlockBinaryAndHash: + if (size_ < sizeof(uint32_t) /* block footer */ + + sizeof(uint16_t) /* NUM_BUCK */) { + size_ = 0; + break; + } + + uint16_t map_offset; + data_block_hash_index_.Initialize( + contents.data.data(), + static_cast(contents.data.size() - + sizeof(uint32_t)), /*chop off + NUM_RESTARTS*/ + &map_offset); + + restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); + + if (restart_offset_ > map_offset) { + // map_offset is too small for NumRestarts() and + // therefore restart_offset_ wrapped around. + size_ = 0; + break; + } + break; + default: + size_ = 0; // Error marker } } } @@ -544,6 +833,7 @@ DataBlockIter* iter, Statistics* stats, bool /*total_order_seek*/, bool /*key_includes_seq*/, + bool /*value_is_full*/, BlockPrefixIndex* /*prefix_index*/) { DataBlockIter* ret_iter; if (iter != nullptr) { @@ -560,8 +850,10 @@ ret_iter->Invalidate(Status::OK()); return ret_iter; } else { - ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, - global_seqno_, read_amp_bitmap_.get(), cachable()); + ret_iter->Initialize( + cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_, + read_amp_bitmap_.get(), cachable(), + data_block_hash_index_.Valid() ? 
&data_block_hash_index_ : nullptr); if (read_amp_bitmap_) { if (read_amp_bitmap_->GetStatistics() != stats) { // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ @@ -577,7 +869,7 @@ IndexBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, - bool key_includes_seq, + bool key_includes_seq, bool value_is_full, BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { @@ -597,7 +889,8 @@ BlockPrefixIndex* prefix_index_ptr = total_order_seek ? nullptr : prefix_index; ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, - prefix_index_ptr, key_includes_seq, cachable()); + prefix_index_ptr, key_includes_seq, value_is_full, + cachable(), nullptr /* data_block_hash_index */); } return ret_iter; diff -Nru rocksdb-5.15.10/table/block_fetcher.cc rocksdb-5.17.2/table/block_fetcher.cc --- rocksdb-5.15.10/table/block_fetcher.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_fetcher.cc 2018-11-12 19:57:32.000000000 +0000 @@ -169,6 +169,7 @@ // page can be either uncompressed or compressed, the buffer either stack // or heap provided. Refer to https://github.com/facebook/rocksdb/pull/4096 if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) { + assert(used_buf_ != heap_buf_.get()); heap_buf_.reset(new char[block_size_ + kBlockTrailerSize]); memcpy(heap_buf_.get(), used_buf_, block_size_ + kBlockTrailerSize); } diff -Nru rocksdb-5.15.10/table/block.h rocksdb-5.17.2/table/block.h --- rocksdb-5.15.10/table/block.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block.h 2018-11-12 19:57:32.000000000 +0000 @@ -22,19 +22,22 @@ #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" +#include "format.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" +#include "rocksdb/table.h" #include "table/block_prefix_index.h" +#include "table/data_block_hash_index.h" #include "table/internal_iterator.h" #include "util/random.h" #include "util/sync_point.h" -#include "format.h" namespace rocksdb { struct BlockContents; class Comparator; +template class BlockIter; class DataBlockIter; class IndexBlockIter; @@ -146,7 +149,7 @@ size_t read_amp_bytes_per_bit = 0, Statistics* statistics = nullptr); - ~Block() = default; + ~Block(); size_t size() const { return size_; } const char* data() const { return data_; } @@ -154,6 +157,7 @@ // The additional memory space taken by the block data. size_t usable_size() const { return contents_.usable_size(); } uint32_t NumRestarts() const; + BlockBasedTableOptions::DataBlockIndexType IndexType() const; CompressionType compression_type() const { return contents_.compression_type; } @@ -164,6 +168,11 @@ // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* // + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default ture, means that no delta encoding is + // applied to values. + // // NewIterator // Same as above but also updates read_amp_bitmap_ if it is not nullptr. // @@ -175,13 +184,11 @@ // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. 
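As a quick illustration of the widened iterator API documented above, the sketch below walks an index block whose values were written with delta encoding. `DumpIndexBlock`, `reader` and `cmp` are placeholder names; NewIterator, IndexBlockIter and BlockHandle are the types this patch introduces or extends, and the same calling pattern appears in the ValueDeltaEncodingTest added to block_test.cc further down in this patch.

#include "table/block.h"

// Sketch: iterate an index block whose values are delta-encoded BlockHandles.
// `reader` is assumed to be a Block built from such an index block.
void DumpIndexBlock(rocksdb::Block& reader, const rocksdb::Comparator* cmp) {
  const bool kTotalOrderSeek = true;
  const bool kKeyIncludesSeq = true;   // keys are internal keys
  const bool kValueIsFull = false;     // values are delta-encoded, not full handles
  rocksdb::IndexBlockIter* iter = reader.NewIterator<rocksdb::IndexBlockIter>(
      cmp, cmp, nullptr /*iter*/, nullptr /*stats*/, kTotalOrderSeek,
      kKeyIncludesSeq, kValueIsFull);
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    rocksdb::BlockHandle handle = iter->value();  // decoded handle, not a raw Slice
    (void)handle;
  }
  delete iter;
}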
template - TBlockIter* NewIterator(const Comparator* comparator, - const Comparator* user_comparator, - TBlockIter* iter = nullptr, - Statistics* stats = nullptr, - bool total_order_seek = true, - bool key_includes_seq = true, - BlockPrefixIndex* prefix_index = nullptr); + TBlockIter* NewIterator( + const Comparator* comparator, const Comparator* user_comparator, + TBlockIter* iter = nullptr, Statistics* stats = nullptr, + bool total_order_seek = true, bool key_includes_seq = true, + bool value_is_full = true, BlockPrefixIndex* prefix_index = nullptr); // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; @@ -199,12 +206,15 @@ // the encoded value (kDisableGlobalSequenceNumber means disabled) const SequenceNumber global_seqno_; + DataBlockHashIndex data_block_hash_index_; + // No copying allowed Block(const Block&) = delete; void operator=(const Block&) = delete; }; -class BlockIter : public InternalIterator { +template +class BlockIter : public InternalIteratorBase { public: void InitializeBase(const Comparator* comparator, const char* data, uint32_t restarts, uint32_t num_restarts, @@ -243,10 +253,6 @@ assert(Valid()); return key_.GetKey(); } - virtual Slice value() const override { - assert(Valid()); - return value_; - } #ifndef NDEBUG virtual ~BlockIter() { @@ -280,7 +286,8 @@ const char* data_; // underlying block contents uint32_t num_restarts_; // Number of uint32_t entries in restart array - uint32_t restart_index_; // Index of restart block in which current_ falls + // Index of restart block in which current_ or current_-1 falls + uint32_t restart_index_; uint32_t restarts_; // Offset of restart array (list of fixed32) // current_ is offset in data_ of current entry. >= restarts_ if !Valid uint32_t current_; @@ -316,33 +323,39 @@ void CorruptionError(); - bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index, const Comparator* comp); + template + inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, + uint32_t* index, const Comparator* comp); }; -class DataBlockIter final : public BlockIter { +class DataBlockIter final : public BlockIter { public: DataBlockIter() : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} DataBlockIter(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, - BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned) + BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) : DataBlockIter() { Initialize(comparator, user_comparator, data, restarts, num_restarts, - global_seqno, read_amp_bitmap, block_contents_pinned); + global_seqno, read_amp_bitmap, block_contents_pinned, + data_block_hash_index); } void Initialize(const Comparator* comparator, - const Comparator* /*user_comparator*/, const char* data, + const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap, - bool block_contents_pinned) { + bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) { InitializeBase(comparator, data, restarts, num_restarts, global_seqno, block_contents_pinned); + user_comparator_ = user_comparator; key_.SetIsUserKey(false); read_amp_bitmap_ = read_amp_bitmap; last_bitmap_offset_ = current_ + 1; + data_block_hash_index_ = data_block_hash_index; } virtual Slice value() const override { 
@@ -358,6 +371,15 @@ virtual void Seek(const Slice& target) override; + inline bool SeekForGet(const Slice& target) { + if (!data_block_hash_index_) { + Seek(target); + return true; + } + + return SeekForGetImpl(target); + } + virtual void SeekForPrev(const Slice& target) override; virtual void Prev() override; @@ -405,14 +427,19 @@ std::vector prev_entries_; int32_t prev_entries_idx_ = -1; - bool ParseNextDataKey(); + DataBlockHashIndex* data_block_hash_index_; + const Comparator* user_comparator_; + + inline bool ParseNextDataKey(const char* limit = nullptr); inline int Compare(const IterKey& ikey, const Slice& b) const { return comparator_->Compare(ikey.GetInternalKey(), b); } + + bool SeekForGetImpl(const Slice& target); }; -class IndexBlockIter final : public BlockIter { +class IndexBlockIter final : public BlockIter { public: IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} @@ -420,27 +447,47 @@ assert(Valid()); return key_.GetKey(); } + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default ture, means that no delta encoding is + // applied to values. IndexBlockIter(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool block_contents_pinned) + bool value_is_full, bool block_contents_pinned) : IndexBlockIter() { Initialize(comparator, user_comparator, data, restarts, num_restarts, - prefix_index, key_includes_seq, block_contents_pinned); + prefix_index, key_includes_seq, block_contents_pinned, + value_is_full, nullptr /* data_block_hash_index */); } void Initialize(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool block_contents_pinned) { - InitializeBase(comparator, data, restarts, num_restarts, - kDisableGlobalSequenceNumber, block_contents_pinned); + bool value_is_full, bool block_contents_pinned, + DataBlockHashIndex* /*data_block_hash_index*/) { + InitializeBase(key_includes_seq ? comparator : user_comparator, data, + restarts, num_restarts, kDisableGlobalSequenceNumber, + block_contents_pinned); key_includes_seq_ = key_includes_seq; - active_comparator_ = key_includes_seq_ ? comparator_ : user_comparator; key_.SetIsUserKey(!key_includes_seq_); prefix_index_ = prefix_index; + value_delta_encoded_ = !value_is_full; + } + + virtual BlockHandle value() const override { + assert(Valid()); + if (value_delta_encoded_) { + return decoded_value_; + } else { + BlockHandle handle; + Slice v = value_; + Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v); + assert(decode_s.ok()); + return handle; + } } virtual void Seek(const Slice& target) override; @@ -467,27 +514,37 @@ void Invalidate(Status s) { InvalidateBase(s); } private: + // Key is in InternalKey format + bool key_includes_seq_; + bool value_delta_encoded_; + BlockPrefixIndex* prefix_index_; + // Whether the value is delta encoded. In that case the value is assumed to be + // BlockHandle. The first value in each restart interval is the full encoded + // BlockHandle; the restart of encoded size part of the BlockHandle. The + // offset of delta encoded BlockHandles is computed by adding the size of + // previous delta encoded values in the same restart interval to the offset of + // the first value in that restart interval. 
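The sketch below restates the reconstruction rule from the comment above using plain integers instead of the varint-encoded bytes that DecodeCurrentValue actually parses; `Handle`, `kTrailerSize` and `ReconstructInterval` are illustrative stand-ins for BlockHandle, kBlockTrailerSize and the decoding loop.

#include <cstdint>
#include <vector>

struct Handle { uint64_t offset; uint64_t size; };
constexpr uint64_t kTrailerSize = 5;  // stand-in for kBlockTrailerSize (type byte + crc32)

// The first entry of a restart interval carries (offset, size) in full; every
// later entry carries only a signed size delta. The next offset follows from
// the previous handle because data blocks are laid out back to back on disk.
std::vector<Handle> ReconstructInterval(Handle first,
                                        const std::vector<int64_t>& size_deltas) {
  std::vector<Handle> handles{first};
  for (int64_t delta : size_deltas) {
    Handle prev = handles.back();
    Handle next;
    next.offset = prev.offset + prev.size + kTrailerSize;
    next.size = static_cast<uint64_t>(static_cast<int64_t>(prev.size) + delta);
    handles.push_back(next);
  }
  return handles;
}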
+ BlockHandle decoded_value_; + bool PrefixSeek(const Slice& target, uint32_t* index); bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, uint32_t left, uint32_t right, uint32_t* index); - int CompareBlockKey(uint32_t block_index, const Slice& target); + inline int CompareBlockKey(uint32_t block_index, const Slice& target); inline int Compare(const Slice& a, const Slice& b) const { - return active_comparator_->Compare(a, b); + return comparator_->Compare(a, b); } inline int Compare(const IterKey& ikey, const Slice& b) const { - return active_comparator_->Compare(ikey.GetKey(), b); + return comparator_->Compare(ikey.GetKey(), b); } - bool ParseNextIndexKey(); + inline bool ParseNextIndexKey(); - // Key is in InternalKey format - bool key_includes_seq_; - // key_includes_seq_ ? comparator_ : user_comparator_ - const Comparator* active_comparator_; - BlockPrefixIndex* prefix_index_; + // When value_delta_encoded_ is enabled it decodes the value which is assumed + // to be BlockHandle and put it to decoded_value_ + inline void DecodeCurrentValue(uint32_t shared); }; } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/block_test.cc rocksdb-5.17.2/table/block_test.cc --- rocksdb-5.15.10/table/block_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -68,6 +68,29 @@ } } +// Same as GenerateRandomKVs but the values are BlockHandle +void GenerateRandomKBHs(std::vector *keys, + std::vector *values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + uint64_t offset = 0; + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generate keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + uint64_t size = rnd.Uniform(1024 * 16); + BlockHandle handle(offset, size); + offset += size + kBlockTrailerSize; + values->emplace_back(handle); + } + } +} + class BlockTest : public testing::Test {}; // block test @@ -131,6 +154,84 @@ delete iter; } +TEST_F(BlockTest, ValueDeltaEncodingTest) { + Random rnd(301); + Options options = Options(); + std::unique_ptr ic; + ic.reset(new test::PlainInternalKeyComparator(options.comparator)); + + std::vector keys; + std::vector values; + const bool kUseDeltaEncoding = true; + const bool kUseValueDeltaEncoding = true; + BlockBuilder builder(16, kUseDeltaEncoding, kUseValueDeltaEncoding); + int num_records = 100; + + GenerateRandomKBHs(&keys, &values, 0, num_records); + // add a bunch of records to a block + BlockHandle last_encoded_handle; + for (int i = 0; i < num_records; i++) { + auto block_handle = values[i]; + std::string handle_encoding; + block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64(&handle_delta_encoding, + block_handle.size() - last_encoded_handle.size()); + last_encoded_handle = block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + builder.Add(keys[i], handle_encoding, &handle_delta_encoding_slice); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const bool kTotalOrderSeek = true; + const bool kIncludesSeq = true; + const bool kValueIsFull = !kUseValueDeltaEncoding; + 
IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + int count = 0; + InternalIteratorBase *iter = reader.NewIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, kIncludesSeq, kValueIsFull); + for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { + // read kv from block + Slice k = iter->key(); + BlockHandle handle = iter->value(); + + // compare with lookaside array + ASSERT_EQ(k.ToString().compare(keys[count]), 0); + + ASSERT_EQ(values[count].offset(), handle.offset()); + ASSERT_EQ(values[count].size(), handle.size()); + } + delete iter; + + // read block contents randomly + iter = reader.NewIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + BlockHandle handle = iter->value(); + ASSERT_EQ(values[index].offset(), handle.offset()); + ASSERT_EQ(values[index].size(), handle.size()); + } + delete iter; +} // return the block contents BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, diff -Nru rocksdb-5.15.10/table/cuckoo_table_builder.cc rocksdb-5.17.2/table/cuckoo_table_builder.cc --- rocksdb-5.15.10/table/cuckoo_table_builder.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/cuckoo_table_builder.cc 2018-11-12 19:57:32.000000000 +0000 @@ -164,9 +164,9 @@ Slice CuckooTableBuilder::GetKey(uint64_t idx) const { assert(closed_); if (IsDeletedKey(idx)) { - return Slice(&deleted_keys_[(idx - num_values_) * key_size_], key_size_); + return Slice(&deleted_keys_[static_cast((idx - num_values_) * key_size_)], static_cast(key_size_)); } - return Slice(&kvs_[idx * (key_size_ + value_size_)], key_size_); + return Slice(&kvs_[static_cast(idx * (key_size_ + value_size_))], static_cast(key_size_)); } Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const { @@ -177,14 +177,14 @@ Slice CuckooTableBuilder::GetValue(uint64_t idx) const { assert(closed_); if (IsDeletedKey(idx)) { - static std::string empty_value(value_size_, 'a'); + static std::string empty_value(static_cast(value_size_), 'a'); return Slice(empty_value); } - return Slice(&kvs_[idx * (key_size_ + value_size_) + key_size_], value_size_); + return Slice(&kvs_[static_cast(idx * (key_size_ + value_size_) + key_size_)], static_cast(value_size_)); } Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { - buckets->resize(hash_table_size_ + cuckoo_block_size_ - 1); + buckets->resize(static_cast(hash_table_size_ + cuckoo_block_size_ - 1)); uint32_t make_space_for_key_call_id = 0; for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) { uint64_t bucket_id = 0; @@ -200,13 +200,13 @@ // stop searching and proceed for next hash function. 
for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++hash_val) { - if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { + if ((*buckets)[static_cast(hash_val)].vector_idx == kMaxVectorIdx) { bucket_id = hash_val; bucket_found = true; break; } else { if (ucomp_->Compare(user_key, - GetUserKey((*buckets)[hash_val].vector_idx)) == 0) { + GetUserKey((*buckets)[static_cast(hash_val)].vector_idx)) == 0) { return Status::NotSupported("Same key is being inserted again."); } hash_vals.push_back(hash_val); @@ -226,7 +226,7 @@ ++num_hash_func_; for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++hash_val) { - if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { + if ((*buckets)[static_cast(hash_val)].vector_idx == kMaxVectorIdx) { bucket_found = true; bucket_id = hash_val; break; @@ -235,7 +235,7 @@ } } } - (*buckets)[bucket_id].vector_idx = vector_idx; + (*buckets)[static_cast(bucket_id)].vector_idx = vector_idx; } return Status::OK(); } @@ -295,7 +295,7 @@ reinterpret_cast(&value_size_), sizeof(value_size_)); uint64_t bucket_size = key_size_ + value_size_; - unused_bucket.resize(bucket_size, 'a'); + unused_bucket.resize(static_cast(bucket_size), 'a'); // Write the table. uint32_t num_added = 0; for (auto& bucket : buckets) { @@ -320,7 +320,7 @@ uint64_t offset = buckets.size() * bucket_size; properties_.data_size = offset; - unused_bucket.resize(properties_.fixed_key_len); + unused_bucket.resize(static_cast(properties_.fixed_key_len)); properties_.user_collected_properties[ CuckooTablePropertyNames::kEmptyKey] = unused_bucket; properties_.user_collected_properties[ @@ -456,7 +456,7 @@ // no. of times this will be called is <= max_num_hash_func_ + num_entries_. for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { uint64_t bid = hash_vals[hash_cnt]; - (*buckets)[bid].make_space_for_key_call_id = make_space_for_key_call_id; + (*buckets)[static_cast(bid)].make_space_for_key_call_id = make_space_for_key_call_id; tree.push_back(CuckooNode(bid, 0, 0)); } bool null_found = false; @@ -467,7 +467,7 @@ if (curr_depth >= max_search_depth_) { break; } - CuckooBucket& curr_bucket = (*buckets)[curr_node.bucket_id]; + CuckooBucket& curr_bucket = (*buckets)[static_cast(curr_node.bucket_id)]; for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) { uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx), @@ -476,15 +476,15 @@ // Iterate inside Cuckoo Block. 
for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++child_bucket_id) { - if ((*buckets)[child_bucket_id].make_space_for_key_call_id == + if ((*buckets)[static_cast(child_bucket_id)].make_space_for_key_call_id == make_space_for_key_call_id) { continue; } - (*buckets)[child_bucket_id].make_space_for_key_call_id = + (*buckets)[static_cast(child_bucket_id)].make_space_for_key_call_id = make_space_for_key_call_id; tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, curr_pos)); - if ((*buckets)[child_bucket_id].vector_idx == kMaxVectorIdx) { + if ((*buckets)[static_cast(child_bucket_id)].vector_idx == kMaxVectorIdx) { null_found = true; break; } @@ -502,8 +502,8 @@ uint32_t bucket_to_replace_pos = static_cast(tree.size()) - 1; while (bucket_to_replace_pos >= num_hash_func_) { CuckooNode& curr_node = tree[bucket_to_replace_pos]; - (*buckets)[curr_node.bucket_id] = - (*buckets)[tree[curr_node.parent_pos].bucket_id]; + (*buckets)[static_cast(curr_node.bucket_id)] = + (*buckets)[static_cast(tree[curr_node.parent_pos].bucket_id)]; bucket_to_replace_pos = curr_node.parent_pos; } *bucket_id = tree[bucket_to_replace_pos].bucket_id; diff -Nru rocksdb-5.15.10/table/cuckoo_table_builder_test.cc rocksdb-5.17.2/table/cuckoo_table_builder_test.cc --- rocksdb-5.15.10/table/cuckoo_table_builder_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/cuckoo_table_builder_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -156,7 +156,7 @@ fname = test::PerThreadDBPath("EmptyFile"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -192,7 +192,7 @@ fname = test::PerThreadDBPath("NoCollisionFullKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -240,7 +240,7 @@ fname = test::PerThreadDBPath("WithCollisionFullKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -289,7 +289,7 @@ fname = test::PerThreadDBPath("WithCollisionFullKey2"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder( file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, false, false, GetSliceHash, @@ -342,7 +342,7 @@ fname = test::PerThreadDBPath("WithCollisionPathFullKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), 
EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -392,7 +392,7 @@ fname = test::PerThreadDBPath("WithCollisionPathFullKeyAndCuckooBlock"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash, 0 /* column_family_id */, @@ -435,7 +435,7 @@ fname = test::PerThreadDBPath("NoCollisionUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -479,7 +479,7 @@ fname = test::PerThreadDBPath("WithCollisionUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -525,7 +525,7 @@ fname = test::PerThreadDBPath("WithCollisionPathUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -570,7 +570,7 @@ fname = test::PerThreadDBPath("WithCollisionPathUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -598,7 +598,7 @@ fname = test::PerThreadDBPath("FailWhenSameKeyInserted"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, diff -Nru rocksdb-5.15.10/table/cuckoo_table_reader.cc rocksdb-5.17.2/table/cuckoo_table_reader.cc --- rocksdb-5.15.10/table/cuckoo_table_reader.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/cuckoo_table_reader.cc 2018-11-12 19:57:32.000000000 +0000 @@ -136,7 +136,7 @@ cuckoo_block_size_ = *reinterpret_cast( cuckoo_block_size->second.data()); cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; - status_ = 
file_->Read(0, file_size, &file_data_, nullptr); + status_ = file_->Read(0, static_cast(file_size), &file_data_, nullptr); } Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, @@ -268,7 +268,7 @@ if (initialized_) { return; } - sorted_bucket_ids_.reserve(reader_->GetTableProperties()->num_entries); + sorted_bucket_ids_.reserve(static_cast(reader_->GetTableProperties()->num_entries)); uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; assert(num_buckets < kInvalidIndex); const char* bucket = reader_->file_data_.data(); @@ -374,15 +374,12 @@ return curr_value_; } -extern InternalIterator* NewErrorInternalIterator(const Status& status, - Arena* arena); - InternalIterator* CuckooTableReader::NewIterator( const ReadOptions& /*read_options*/, const SliceTransform* /* prefix_extractor */, Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/) { if (!status().ok()) { - return NewErrorInternalIterator( + return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); } CuckooTableIterator* iter; diff -Nru rocksdb-5.15.10/table/cuckoo_table_reader.h rocksdb-5.17.2/table/cuckoo_table_reader.h --- rocksdb-5.15.10/table/cuckoo_table_reader.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/cuckoo_table_reader.h 2018-11-12 19:57:32.000000000 +0000 @@ -25,7 +25,6 @@ class Arena; class TableReader; -class InternalIterator; class CuckooTableReader: public TableReader { public: diff -Nru rocksdb-5.15.10/table/cuckoo_table_reader_test.cc rocksdb-5.17.2/table/cuckoo_table_reader_test.cc --- rocksdb-5.15.10/table/cuckoo_table_reader_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/cuckoo_table_reader_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -96,7 +96,7 @@ std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), env_options)); + new WritableFileWriter(std::move(writable_file), fname, env_options)); CuckooTableBuilder builder( file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, false, @@ -412,7 +412,7 @@ std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), env_options)); + new WritableFileWriter(std::move(writable_file), fname, env_options)); CuckooTableBuilder builder( file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5, false, FLAGS_identity_as_first_hash, nullptr, 0 /* column_family_id */, diff -Nru rocksdb-5.15.10/table/data_block_footer.cc rocksdb-5.17.2/table/data_block_footer.cc --- rocksdb-5.15.10/table/data_block_footer.cc 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/table/data_block_footer.cc 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "data_block_footer.h" + +#include "rocksdb/table.h" + +namespace rocksdb { + +const int kDataBlockIndexTypeBitShift = 31; + +// 0x7FFFFFFF +const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u; + +// 0x7FFFFFFF +const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u; + +uint32_t PackIndexTypeAndNumRestarts( + BlockBasedTableOptions::DataBlockIndexType index_type, + uint32_t num_restarts) { + if (num_restarts > kMaxNumRestarts) { + assert(0); // mute travis "unused" warning + } + + uint32_t block_footer = num_restarts; + if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) { + block_footer |= 1u << kDataBlockIndexTypeBitShift; + } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) { + assert(0); + } + + return block_footer; +} + +void UnPackIndexTypeAndNumRestarts( + uint32_t block_footer, + BlockBasedTableOptions::DataBlockIndexType* index_type, + uint32_t* num_restarts) { + if (index_type) { + if (block_footer & 1u << kDataBlockIndexTypeBitShift) { + *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } else { + *index_type = BlockBasedTableOptions::kDataBlockBinarySearch; + } + } + + if (num_restarts) { + *num_restarts = block_footer & kNumRestartsMask; + assert(*num_restarts <= kMaxNumRestarts); + } +} + +} // namespace rocksdb diff -Nru rocksdb-5.15.10/table/data_block_footer.h rocksdb-5.17.2/table/data_block_footer.h --- rocksdb-5.15.10/table/data_block_footer.h 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/table/data_block_footer.h 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/table.h" + +namespace rocksdb { + +uint32_t PackIndexTypeAndNumRestarts( + BlockBasedTableOptions::DataBlockIndexType index_type, + uint32_t num_restarts); + +void UnPackIndexTypeAndNumRestarts( + uint32_t block_footer, + BlockBasedTableOptions::DataBlockIndexType* index_type, + uint32_t* num_restarts); + +} // namespace rocksdb diff -Nru rocksdb-5.15.10/table/data_block_hash_index.cc rocksdb-5.17.2/table/data_block_hash_index.cc --- rocksdb-5.15.10/table/data_block_hash_index.cc 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/table/data_block_hash_index.cc 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,93 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
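A short round-trip over the footer helpers defined above; `FooterRoundTrip` and the value 42 are only illustrative, while the two functions and the bit layout (MSB = index-type flag, low 31 bits = num_restarts) come from the data_block_footer.{h,cc} files added by this patch.

#include <cstdint>
#include "rocksdb/table.h"
#include "table/data_block_footer.h"

void FooterRoundTrip() {
  using rocksdb::BlockBasedTableOptions;
  // Pack: set the MSB for kDataBlockBinaryAndHash, keep num_restarts in the low bits.
  uint32_t footer = rocksdb::PackIndexTypeAndNumRestarts(
      BlockBasedTableOptions::kDataBlockBinaryAndHash, 42);
  // footer == (1u << 31) | 42

  BlockBasedTableOptions::DataBlockIndexType index_type;
  uint32_t num_restarts;
  rocksdb::UnPackIndexTypeAndNumRestarts(footer, &index_type, &num_restarts);
  // index_type == kDataBlockBinaryAndHash, num_restarts == 42
}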
+#include +#include + +#include "rocksdb/slice.h" +#include "table/data_block_hash_index.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace rocksdb { + +void DataBlockHashIndexBuilder::Add(const Slice& key, + const size_t restart_index) { + assert(Valid()); + if (restart_index > kMaxRestartSupportedByHashIndex) { + valid_ = false; + return; + } + + uint32_t hash_value = GetSliceHash(key); + hash_and_restart_pairs_.emplace_back(hash_value, + static_cast(restart_index)); + estimated_num_buckets_ += bucket_per_key_; +} + +void DataBlockHashIndexBuilder::Finish(std::string& buffer) { + assert(Valid()); + uint16_t num_buckets = static_cast(estimated_num_buckets_); + + if (num_buckets == 0) { + num_buckets = 1; // sanity check + } + + // The build-in hash cannot well distribute strings when into different + // buckets when num_buckets is power of two, resulting in high hash + // collision. + // We made the num_buckets to be odd to avoid this issue. + num_buckets |= 1; + + std::vector buckets(num_buckets, kNoEntry); + // write the restart_index array + for (auto& entry : hash_and_restart_pairs_) { + uint32_t hash_value = entry.first; + uint8_t restart_index = entry.second; + uint16_t buck_idx = static_cast(hash_value % num_buckets); + if (buckets[buck_idx] == kNoEntry) { + buckets[buck_idx] = restart_index; + } else if (buckets[buck_idx] != restart_index) { + // same bucket cannot store two different restart_index, mark collision + buckets[buck_idx] = kCollision; + } + } + + for (uint8_t restart_index : buckets) { + buffer.append( + const_cast(reinterpret_cast(&restart_index)), + sizeof(restart_index)); + } + + // write NUM_BUCK + PutFixed16(&buffer, num_buckets); + + assert(buffer.size() <= kMaxBlockSizeSupportedByHashIndex); +} + +void DataBlockHashIndexBuilder::Reset() { + estimated_num_buckets_ = 0; + valid_ = true; + hash_and_restart_pairs_.clear(); +} + +void DataBlockHashIndex::Initialize(const char* data, uint16_t size, + uint16_t* map_offset) { + assert(size >= sizeof(uint16_t)); // NUM_BUCKETS + num_buckets_ = DecodeFixed16(data + size - sizeof(uint16_t)); + assert(num_buckets_ > 0); + assert(size > num_buckets_ * sizeof(uint8_t)); + *map_offset = static_cast(size - sizeof(uint16_t) - + num_buckets_ * sizeof(uint8_t)); +} + +uint8_t DataBlockHashIndex::Lookup(const char* data, uint32_t map_offset, + const Slice& key) const { + uint32_t hash_value = GetSliceHash(key); + uint16_t idx = static_cast(hash_value % num_buckets_); + const char* bucket_table = data + map_offset; + return static_cast(*(bucket_table + idx * sizeof(uint8_t))); +} + +} // namespace rocksdb diff -Nru rocksdb-5.15.10/table/data_block_hash_index.h rocksdb-5.17.2/table/data_block_hash_index.h --- rocksdb-5.15.10/table/data_block_hash_index.h 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/table/data_block_hash_index.h 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,136 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/slice.h" + +namespace rocksdb { +// This is an experimental feature aiming to reduce the CPU utilization of +// point-lookup within a data-block. It is only used in data blocks, and not +// in meta-data blocks or per-table index blocks. +// +// It only used to support BlockBasedTable::Get(). 
+// +// A serialized hash index is appended to the data-block. The new block data +// format is as follows: +// +// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER] +// +// RI: Restart Interval (the same as the default data-block format) +// RI_IDX: Restart Interval index (the same as the default data-block format) +// HASH_IDX: The new data-block hash index feature. +// FOOTER: A 32bit block footer, which is the NUM_RESTARTS with the MSB as +// the flag indicating if this hash index is in use. Note that +// given a data block < 32KB, the MSB is never used. So we can +// borrow the MSB as the hash index flag. Therefore, this format is +// compatible with the legacy data-blocks with num_restarts < 32768, +// as the MSB is 0. +// +// The format of the data-block hash index is as follows: +// +// HASH_IDX: [B B B ... B NUM_BUCK] +// +// B: bucket, an array of restart index. Each buckets is uint8_t. +// NUM_BUCK: Number of buckets, which is the length of the bucket array. +// +// We reserve two special flag: +// kNoEntry=255, +// kCollision=254. +// +// Therefore, the max number of restarts this hash index can supoport is 253. +// +// Buckets are initialized to be kNoEntry. +// +// When storing a key in the hash index, the key is first hashed to a bucket. +// If there the bucket is empty (kNoEntry), the restart index is stored in +// the bucket. If there is already a restart index there, we will update the +// existing restart index to a collision marker (kCollision). If the +// the bucket is already marked as collision, we do not store the restart +// index either. +// +// During query process, a key is first hashed to a bucket. Then we examine if +// the buckets store nothing (kNoEntry) or the bucket had a collision +// (kCollision). If either of those happens, we get the restart index of +// the key and will directly go to the restart interval to search the key. +// +// Note that we only support blocks with #restart_interval < 254. If a block +// has more restart interval than that, hash index will not be create for it. + +const uint8_t kNoEntry = 255; +const uint8_t kCollision = 254; +const uint8_t kMaxRestartSupportedByHashIndex = 253; + +// Because we use uint16_t address, we only support block no more than 64KB +const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16; +const double kDefaultUtilRatio = 0.75; + +class DataBlockHashIndexBuilder { + public: + DataBlockHashIndexBuilder() + : bucket_per_key_(-1 /*uninitialized marker*/), + estimated_num_buckets_(0), + valid_(false) {} + + void Initialize(double util_ratio) { + if (util_ratio <= 0) { + util_ratio = kDefaultUtilRatio; // sanity check + } + bucket_per_key_ = 1 / util_ratio; + valid_ = true; + } + + inline bool Valid() const { return valid_ && bucket_per_key_ > 0; } + void Add(const Slice& key, const size_t restart_index); + void Finish(std::string& buffer); + void Reset(); + inline size_t EstimateSize() const { + uint16_t estimated_num_buckets = + static_cast(estimated_num_buckets_); + + // Maching the num_buckets number in DataBlockHashIndexBuilder::Finish. + estimated_num_buckets |= 1; + + return sizeof(uint16_t) + + static_cast(estimated_num_buckets * sizeof(uint8_t)); + } + + private: + double bucket_per_key_; // is the multiplicative inverse of util_ratio_ + double estimated_num_buckets_; + + // Now the only usage for `valid_` is to mark false when the inserted + // restart_index is larger than supported. In this case HashIndex is not + // appended to the block content. 
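For orientation, the helper below spells out how a point lookup can interpret the bucket value returned by DataBlockHashIndex::Lookup under the kNoEntry/kCollision convention described above. `HashLookupOutcome` and `Interpret` are standalone illustrations rather than part of the patch; per the comment above and the SearchForOffset test helper later in this diff, a collision means the lookup falls back to the ordinary in-block seek.

#include <cstdint>

// Mirrors the reserved markers above: 255 = empty bucket, 254 = collision.
enum class HashLookupOutcome {
  kKeyNotInThisBlock,      // empty bucket: the key was never added to this block
  kFallBackToRegularSeek,  // bucket shared by more than one restart interval
  kScanRestartInterval     // bucket holds the restart index to scan linearly
};

HashLookupOutcome Interpret(uint8_t bucket_value) {
  constexpr uint8_t kNoEntry = 255;
  constexpr uint8_t kCollision = 254;
  if (bucket_value == kNoEntry) return HashLookupOutcome::kKeyNotInThisBlock;
  if (bucket_value == kCollision) return HashLookupOutcome::kFallBackToRegularSeek;
  return HashLookupOutcome::kScanRestartInterval;
}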
+ bool valid_; + + std::vector> hash_and_restart_pairs_; + friend class DataBlockHashIndex_DataBlockHashTestSmall_Test; +}; + +class DataBlockHashIndex { + public: + DataBlockHashIndex() : num_buckets_(0) {} + + void Initialize(const char* data, uint16_t size, uint16_t* map_offset); + + uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const; + + inline bool Valid() { return num_buckets_ != 0; } + + private: + // To make the serialized hash index compact and to save the space overhead, + // here all the data fields persisted in the block are in uint16 format. + // We find that a uint16 is large enough to index every offset of a 64KiB + // block. + // So in other words, DataBlockHashIndex does not support block size equal + // or greater then 64KiB. + uint16_t num_buckets_; +}; + +} // namespace rocksdb diff -Nru rocksdb-5.15.10/table/data_block_hash_index_test.cc rocksdb-5.17.2/table/data_block_hash_index_test.cc --- rocksdb-5.15.10/table/data_block_hash_index_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/table/data_block_hash_index_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,728 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include + +#include "rocksdb/slice.h" +#include "table/block.h" +#include "table/block_based_table_reader.h" +#include "table/block_builder.h" +#include "table/data_block_hash_index.h" +#include "table/get_context.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +bool SearchForOffset(DataBlockHashIndex& index, const char* data, + uint16_t map_offset, const Slice& key, + uint8_t& restart_point) { + uint8_t entry = index.Lookup(data, map_offset, key); + if (entry == kCollision) { + return true; + } + + if (entry == kNoEntry) { + return false; + } + + return entry == restart_point; +} + +// Random KV generator similer to block_test +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} +std::string GenerateKey(int primary_key, int secondary_key, int padding_size, + Random* rnd) { + char buf[50]; + char* p = &buf[0]; + snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); + std::string k(p); + if (padding_size) { + k += RandomString(rnd, padding_size); + } + + return k; +} + +// Generate random key value pairs. +// The generated key will be sorted. You can tune the parameters to generated +// different kinds of test key/value pairs for different scenario. 
+void GenerateRandomKVs(std::vector* keys, + std::vector* values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generating keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + // 100 bytes values + values->emplace_back(RandomString(&rnd, 100)); + } + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestSmall) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + for (int j = 0; j < 5; j++) { + for (uint8_t i = 0; i < 2 + j; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("fake"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 2; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } + builder.Reset(); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTest) { + // bucket_num = 200, #keys = 100. 50% utilization + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("fake content"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestCollision) { + // bucket_num = 2. 
There will be intense hash collisions + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("some other fake content to take up space"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestLarge) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + std::unordered_map m; + + for (uint8_t i = 0; i < 100; i++) { + if (i % 2) { + continue; // leave half of the keys out + } + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + m[key] = restart_point; + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("filling stuff"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + if (m.count(key)) { + ASSERT_TRUE(m[key] == restart_point); + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } else { + // we allow false positve, so don't test the nonexisting keys. + // when false positive happens, the search will continue to the + // restart intervals to see if the key really exist. + } + } +} + +TEST(DataBlockHashIndex, RestartIndexExceedMax) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + std::unordered_map m; + + for (uint8_t i = 0; i <= 253; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + ASSERT_TRUE(builder.Valid()); + + builder.Reset(); + + for (uint8_t i = 0; i <= 254; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + ASSERT_FALSE(builder.Valid()); + + builder.Reset(); + ASSERT_TRUE(builder.Valid()); +} + +TEST(DataBlockHashIndex, BlockRestartIndexExceedMax) { + Options options = Options(); + + BlockBuilder builder(1 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + // #restarts <= 253. 
HashIndex is valid + for (int i = 0; i <= 253; i++) { + std::string ukey = "key" + std::to_string(i); + InternalKey ikey(ukey, 0, kTypeValue); + builder.Add(ikey.Encode().ToString(), "value"); + } + + { + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinaryAndHash); + } + + builder.Reset(); + + // #restarts > 253. HashIndex is not used + for (int i = 0; i <= 254; i++) { + std::string ukey = "key" + std::to_string(i); + InternalKey ikey(ukey, 0, kTypeValue); + builder.Add(ikey.Encode().ToString(), "value"); + } + + { + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinarySearch); + } +} + +TEST(DataBlockHashIndex, BlockSizeExceedMax) { + Options options = Options(); + std::string ukey(10, 'k'); + InternalKey ikey(ukey, 0, kTypeValue); + + BlockBuilder builder(1 /* block_restart_interval */, + false /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + { + // insert a large value. The block size plus HashIndex is 65536. + std::string value(65502, 'v'); + + builder.Add(ikey.Encode().ToString(), value); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex); + std::cerr << "block size: " << rawblock.size() << std::endl; + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinaryAndHash); + } + + builder.Reset(); + + { + // insert a large value. The block size plus HashIndex would be 65537. + // This excceed the max block size supported by HashIndex (65536). + // So when build finishes HashIndex will not be created for the block. + std::string value(65503, 'v'); + + builder.Add(ikey.Encode().ToString(), value); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex); + std::cerr << "block size: " << rawblock.size() << std::endl; + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + // the index type have fallen back to binary when build finish. 
+ ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinarySearch); + } +} + +TEST(DataBlockHashIndex, BlockTestSingleKey) { + Options options = Options(); + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + std::string ukey("gopher"); + std::string value("gold"); + InternalKey ikey(ukey, 10, kTypeValue); + builder.Add(ikey.Encode().ToString(), value /*value*/); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const InternalKeyComparator icmp(BytewiseComparator()); + auto iter = reader.NewIterator(&icmp, icmp.user_comparator()); + bool may_exist; + // search in block for the key just inserted + { + InternalKey seek_ikey(ukey, 10, kValueTypeForSeek); + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ( + options.comparator->Compare(iter->key(), ikey.Encode().ToString()), 0); + ASSERT_EQ(iter->value(), value); + } + + // search in block for the existing ukey, but with higher seqno + { + InternalKey seek_ikey(ukey, 20, kValueTypeForSeek); + + // HashIndex should be able to set the iter correctly + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + + // user key should match + ASSERT_EQ(options.comparator->Compare(ExtractUserKey(iter->key()), ukey), + 0); + + // seek_key seqno number should be greater than that of iter result + ASSERT_GT(GetInternalKeySeqno(seek_ikey.Encode()), + GetInternalKeySeqno(iter->key())); + + ASSERT_EQ(iter->value(), value); + } + + // Search in block for the existing ukey, but with lower seqno + // in this case, hash can find the only occurrence of the user_key, but + // ParseNextDataKey() will skip it as it does not have a older seqno. + // In this case, GetForSeek() is effective to locate the user_key, and + // iter->Valid() == false indicates that we've reached to the end of + // the block and the caller should continue searching the next block. + { + InternalKey seek_ikey(ukey, 5, kValueTypeForSeek); + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_FALSE(iter->Valid()); // should have reached to the end of block + } + + delete iter; +} + +TEST(DataBlockHashIndex, BlockTestLarge) { + Random rnd(1019); + Options options = Options(); + std::vector keys; + std::vector values; + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + int num_records = 500; + + GenerateRandomKVs(&keys, &values, 0, num_records); + + // Generate keys. Adding a trailing "1" to indicate existent keys. + // Later will Seeking for keys with a trailing "0" to test seeking + // non-existent keys. 
+TEST(DataBlockHashIndex, BlockTestLarge) { + Random rnd(1019); + Options options = Options(); + std::vector keys; + std::vector values; + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + int num_records = 500; + + GenerateRandomKVs(&keys, &values, 0, num_records); + + // Generate keys. Add a trailing "1" to indicate existent keys. + // Later we will seek keys with a trailing "0" to test seeking + // non-existent keys. + for (int i = 0; i < num_records; i++) { + std::string ukey(keys[i] + "1" /* existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + builder.Add(ikey.Encode().ToString(), values[i]); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + const InternalKeyComparator icmp(BytewiseComparator()); + + // randomly seek existent keys + for (int i = 0; i < num_records; i++) { + auto iter = + reader.NewIterator(&icmp, icmp.user_comparator()); + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + std::string ukey(keys[index] + "1" /* existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + + // search in block for this key + bool may_exist = iter->SeekForGet(ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(values[index], iter->value()); + + delete iter; + } + + // randomly seek non-existent user keys + // In case A), the user_key cannot be found in the HashIndex. The key may + // exist in the next block, so the iter is invalidated to tell the + // caller to search the next block. This test exercises case A). + // + // Note that for non-existent keys, there is a possibility of a false + // positive, i.e. the key is still hashed into some restart interval. + // Two additional outcomes are possible: + // B) the restart interval is linearly searched without a match; the iter + // stops at the start of the next restart interval. The key does not exist + // anywhere. + // C) the restart interval is linearly searched without a match; the iter + // stops at the end of the block, i.e. restarts_. The key may exist in the + // next block.
+ // So these combinations are possible when searching non-existent user_key: + // + // case# may_exist iter->Valid() + // A true false + // B false true + // C true false + + for (int i = 0; i < num_records; i++) { + auto iter = + reader.NewIterator(&icmp, icmp.user_comparator()); + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + std::string ukey(keys[index] + "0" /* non-existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + + // search in block for this key + bool may_exist = iter->SeekForGet(ikey.Encode().ToString()); + if (!may_exist) { + ASSERT_TRUE(iter->Valid()); + } + if (!iter->Valid()) { + ASSERT_TRUE(may_exist); + } + + delete iter; + } +} + +// helper routine for DataBlockHashIndex.BlockBoundary +void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, + std::string& v2, InternalKey& seek_ikey, + GetContext& get_context, Options& options) { + unique_ptr file_writer; + unique_ptr file_reader; + unique_ptr table_reader; + int level_ = -1; + + std::vector keys; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + + EnvOptions soptions; + + soptions.use_mmap_reads = ioptions.allow_mmap_reads; + file_writer.reset( + test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + unique_ptr builder; + std::vector> + int_tbl_prop_collector_factories; + std::string column_family_name; + builder.reset(ioptions.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, internal_comparator, + &int_tbl_prop_collector_factories, + options.compression, CompressionOptions(), + nullptr /* compression_dict */, + false /* skip_filters */, column_family_name, level_), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + file_writer.get())); + + builder->Add(ik1.Encode().ToString(), v1); + builder->Add(ik2.Encode().ToString(), v2); + EXPECT_TRUE(builder->status().ok()); + + Status s = builder->Finish(); + file_writer->Flush(); + EXPECT_TRUE(s.ok()) << s.ToString(); + + EXPECT_EQ(static_cast(file_writer->writable_file()) + ->contents() + .size(), + builder->FileSize()); + + // Open the table + file_reader.reset(test::GetRandomAccessFileReader(new test::StringSource( + static_cast(file_writer->writable_file())->contents(), + 0 /*uniq_id*/, ioptions.allow_mmap_reads))); + const bool kSkipFilters = true; + const bool kImmortal = true; + ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, + internal_comparator, !kSkipFilters, !kImmortal, + level_), + std::move(file_reader), + static_cast(file_writer->writable_file()) + ->contents() + .size(), + &table_reader); + // Search using Get() + ReadOptions ro; + + ASSERT_OK(table_reader->Get(ro, seek_ikey.Encode().ToString(), &get_context, + moptions.prefix_extractor.get())); +} + +TEST(DataBlockHashIndex, BlockBoundary) { + BlockBasedTableOptions table_options; + table_options.data_block_index_type = + BlockBasedTableOptions::kDataBlockBinaryAndHash; + table_options.block_restart_interval = 1; + table_options.block_size = 4096; + + Options options; + options.comparator = BytewiseComparator(); + + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // insert two large k/v pair. Given that the block_size is 4096, one k/v + // pair will take up one block. 
+ // [ k1/v1 ][ k2/v2 ] + // [ Block N ][ Block N+1 ] + + { + // [ "aab"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("aab"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@120 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 120, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v1); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@5 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 5, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + value.Reset(); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru rocksdb-5.15.10/table/format.cc rocksdb-5.17.2/table/format.cc --- rocksdb-5.15.10/table/format.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/format.cc 2018-11-12 19:57:32.000000000 +0000 @@ -66,6 +66,18 @@ } } +Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) { + if (GetVarint64(input, &size_)) { + offset_ = _offset; + return Status::OK(); + } else { + // 
reset in case failure after partially decoding + offset_ = 0; + size_ = 0; + return Status::Corruption("bad block handle"); + } +} + // Return a string that contains the copy of handle. std::string BlockHandle::ToString(bool hex) const { std::string handle_str; diff -Nru rocksdb-5.15.10/table/format.h rocksdb-5.17.2/table/format.h --- rocksdb-5.15.10/table/format.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/format.h 2018-11-12 19:57:32.000000000 +0000 @@ -54,6 +54,7 @@ void EncodeTo(std::string* dst) const; Status DecodeFrom(Slice* input); + Status DecodeSizeFrom(uint64_t offset, Slice* input); // Return a string that contains the copy of handle. std::string ToString(bool hex = true) const; @@ -90,7 +91,7 @@ } inline bool BlockBasedTableSupportedVersion(uint32_t version) { - return version <= 3; + return version <= 4; } // Footer encapsulates the fixed information stored at the tail diff -Nru rocksdb-5.15.10/table/get_context.cc rocksdb-5.17.2/table/get_context.cc --- rocksdb-5.15.10/table/get_context.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/get_context.cc 2018-11-12 19:57:32.000000000 +0000 @@ -91,11 +91,73 @@ } } -void GetContext::RecordCounters(Tickers ticker, size_t val) { - if (ticker == Tickers::TICKER_ENUM_MAX) { - return; +void GetContext::ReportCounters() { + if (get_context_stats_.num_cache_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_HIT, get_context_stats_.num_cache_hit); + } + if (get_context_stats_.num_cache_index_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_HIT, + get_context_stats_.num_cache_index_hit); + } + if (get_context_stats_.num_cache_data_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_HIT, + get_context_stats_.num_cache_data_hit); + } + if (get_context_stats_.num_cache_filter_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_HIT, + get_context_stats_.num_cache_filter_hit); + } + if (get_context_stats_.num_cache_index_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_MISS, + get_context_stats_.num_cache_index_miss); + } + if (get_context_stats_.num_cache_filter_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_MISS, + get_context_stats_.num_cache_filter_miss); + } + if (get_context_stats_.num_cache_data_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_MISS, + get_context_stats_.num_cache_data_miss); + } + if (get_context_stats_.num_cache_bytes_read > 0) { + RecordTick(statistics_, BLOCK_CACHE_BYTES_READ, + get_context_stats_.num_cache_bytes_read); + } + if (get_context_stats_.num_cache_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_MISS, + get_context_stats_.num_cache_miss); + } + if (get_context_stats_.num_cache_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add); + } + if (get_context_stats_.num_cache_bytes_write > 0) { + RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE, + get_context_stats_.num_cache_bytes_write); + } + if (get_context_stats_.num_cache_index_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD, + get_context_stats_.num_cache_index_add); + } + if (get_context_stats_.num_cache_index_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT, + get_context_stats_.num_cache_index_bytes_insert); + } + if (get_context_stats_.num_cache_data_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_ADD, + get_context_stats_.num_cache_data_add); + } + if (get_context_stats_.num_cache_data_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT, + get_context_stats_.num_cache_data_bytes_insert); 
+ } + if (get_context_stats_.num_cache_filter_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD, + get_context_stats_.num_cache_filter_add); + } + if (get_context_stats_.num_cache_filter_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT, + get_context_stats_.num_cache_filter_bytes_insert); } - tickers_value[ticker] += static_cast(val); } bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, diff -Nru rocksdb-5.15.10/table/get_context.h rocksdb-5.17.2/table/get_context.h --- rocksdb-5.15.10/table/get_context.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/get_context.h 2018-11-12 19:57:32.000000000 +0000 @@ -17,6 +17,26 @@ class MergeContext; class PinnedIteratorsManager; +struct GetContextStats { + uint64_t num_cache_hit = 0; + uint64_t num_cache_index_hit = 0; + uint64_t num_cache_data_hit = 0; + uint64_t num_cache_filter_hit = 0; + uint64_t num_cache_index_miss = 0; + uint64_t num_cache_filter_miss = 0; + uint64_t num_cache_data_miss = 0; + uint64_t num_cache_bytes_read = 0; + uint64_t num_cache_miss = 0; + uint64_t num_cache_add = 0; + uint64_t num_cache_bytes_write = 0; + uint64_t num_cache_index_add = 0; + uint64_t num_cache_index_bytes_insert = 0; + uint64_t num_cache_data_add = 0; + uint64_t num_cache_data_bytes_insert = 0; + uint64_t num_cache_filter_add = 0; + uint64_t num_cache_filter_bytes_insert = 0; +}; + class GetContext { public: enum GetState { @@ -27,7 +47,7 @@ kMerge, // saver contains the current merge result (the operands) kBlobIndex, }; - uint64_t tickers_value[Tickers::TICKER_ENUM_MAX] = {0}; + GetContextStats get_context_stats_; GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, @@ -77,7 +97,7 @@ return true; } - void RecordCounters(Tickers ticker, size_t val); + void ReportCounters(); private: const Comparator* ucmp_; diff -Nru rocksdb-5.15.10/table/index_builder.cc rocksdb-5.17.2/table/index_builder.cc --- rocksdb-5.15.10/table/index_builder.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/index_builder.cc 2018-11-12 19:57:32.000000000 +0000 @@ -27,23 +27,26 @@ BlockBasedTableOptions::IndexType index_type, const InternalKeyComparator* comparator, const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt) { IndexBuilder* result = nullptr; switch (index_type) { case BlockBasedTableOptions::kBinarySearch: { - result = new ShortenedIndexBuilder(comparator, - table_opt.index_block_restart_interval, - table_opt.format_version); + result = new ShortenedIndexBuilder( + comparator, table_opt.index_block_restart_interval, + table_opt.format_version, use_value_delta_encoding); } break; case BlockBasedTableOptions::kHashSearch: { result = new HashIndexBuilder(comparator, int_key_slice_transform, table_opt.index_block_restart_interval, - table_opt.format_version); + table_opt.format_version, + use_value_delta_encoding); } break; case BlockBasedTableOptions::kTwoLevelIndexSearch: { - result = PartitionedIndexBuilder::CreateIndexBuilder(comparator, table_opt); + result = PartitionedIndexBuilder::CreateIndexBuilder( + comparator, use_value_delta_encoding, table_opt); } break; default: { @@ -56,18 +59,23 @@ PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder( const InternalKeyComparator* comparator, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt) { - return new PartitionedIndexBuilder(comparator, 
table_opt); + return new PartitionedIndexBuilder(comparator, table_opt, + use_value_delta_encoding); } PartitionedIndexBuilder::PartitionedIndexBuilder( const InternalKeyComparator* comparator, - const BlockBasedTableOptions& table_opt) + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding) : IndexBuilder(comparator), index_block_builder_(table_opt.index_block_restart_interval, - table_opt.format_version), + true /*use_delta_encoding*/, + use_value_delta_encoding), index_block_builder_without_seq_(table_opt.index_block_restart_interval, - table_opt.format_version), + true /*use_delta_encoding*/, + use_value_delta_encoding), sub_index_builder_(nullptr), table_opt_(table_opt), // We start by false. After each partition we revise the value based on @@ -76,7 +84,8 @@ // sub_index_builder. Otherwise, it could be set to true even one of the // sub_index_builders could not safely exclude seq from the keys, then it // wil be enforced on all sub_index_builders on ::Finish. - seperator_is_key_plus_seq_(false) {} + seperator_is_key_plus_seq_(false), + use_value_delta_encoding_(use_value_delta_encoding) {} PartitionedIndexBuilder::~PartitionedIndexBuilder() { delete sub_index_builder_; @@ -86,7 +95,7 @@ assert(sub_index_builder_ == nullptr); sub_index_builder_ = new ShortenedIndexBuilder( comparator_, table_opt_.index_block_restart_interval, - table_opt_.format_version); + table_opt_.format_version, use_value_delta_encoding_); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, // Note: this is sub-optimal since sub_index_builder_ could later reset @@ -162,10 +171,18 @@ Entry& last_entry = entries_.front(); std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); - index_block_builder_.Add(last_entry.key, handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); if (!seperator_is_key_plus_seq_) { index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key), - handle_encoding); + handle_encoding, + &handle_delta_encoding_slice); } entries_.pop_front(); } @@ -193,7 +210,5 @@ } } -size_t PartitionedIndexBuilder::NumPartitions() const { - return partition_cnt_; -} +size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; } } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/index_builder.h rocksdb-5.17.2/table/index_builder.h --- rocksdb-5.15.10/table/index_builder.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/index_builder.h 2018-11-12 19:57:32.000000000 +0000 @@ -38,6 +38,7 @@ BlockBasedTableOptions::IndexType index_type, const rocksdb::InternalKeyComparator* comparator, const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt); // Index builder will construct a set of blocks which contain: @@ -119,11 +120,16 @@ class ShortenedIndexBuilder : public IndexBuilder { public: explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator, - int index_block_restart_interval, - uint32_t format_version) + const int index_block_restart_interval, + const uint32_t format_version, + const bool use_value_delta_encoding) : 
IndexBuilder(comparator), - index_block_builder_(index_block_restart_interval), - index_block_builder_without_seq_(index_block_restart_interval) { + index_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding) { // Making the default true will disable the feature for old versions seperator_is_key_plus_seq_ = (format_version <= 2); } @@ -147,10 +153,17 @@ std::string handle_encoding; block_handle.EncodeTo(&handle_encoding); - index_block_builder_.Add(sep, handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64(&handle_delta_encoding, + block_handle.size() - last_encoded_handle_.size()); + assert(handle_delta_encoding.size() != 0); + last_encoded_handle_ = block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(sep, handle_encoding, + &handle_delta_encoding_slice); if (!seperator_is_key_plus_seq_) { - index_block_builder_without_seq_.Add(ExtractUserKey(sep), - handle_encoding); + index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding, + &handle_delta_encoding_slice); } } @@ -168,9 +181,7 @@ return Status::OK(); } - virtual size_t IndexSize() const override { - return index_size_; - } + virtual size_t IndexSize() const override { return index_size_; } virtual bool seperator_is_key_plus_seq() override { return seperator_is_key_plus_seq_; @@ -182,6 +193,7 @@ BlockBuilder index_block_builder_; BlockBuilder index_block_builder_without_seq_; bool seperator_is_key_plus_seq_; + BlockHandle last_encoded_handle_; }; // HashIndexBuilder contains a binary-searchable primary index and the @@ -216,10 +228,10 @@ explicit HashIndexBuilder(const InternalKeyComparator* comparator, const SliceTransform* hash_key_extractor, int index_block_restart_interval, - int format_version) + int format_version, bool use_value_delta_encoding) : IndexBuilder(comparator), primary_index_builder_(comparator, index_block_restart_interval, - format_version), + format_version, use_value_delta_encoding), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, @@ -322,10 +334,12 @@ public: static PartitionedIndexBuilder* CreateIndexBuilder( const rocksdb::InternalKeyComparator* comparator, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt); explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, - const BlockBasedTableOptions& table_opt); + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding); virtual ~PartitionedIndexBuilder(); @@ -337,12 +351,8 @@ IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) override; - virtual size_t IndexSize() const override { - return index_size_; - } - size_t TopLevelIndexSize(uint64_t) const { - return top_level_index_size_; - } + virtual size_t IndexSize() const override { return index_size_; } + size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; } size_t NumPartitions() const; inline bool ShouldCutFilterBlock() { @@ -364,6 +374,8 @@ return seperator_is_key_plus_seq_; } + bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } + private: // Set after ::Finish is called size_t top_level_index_size_ = 0; @@ -388,10 +400,12 @@ bool finishing_indexes = false; const BlockBasedTableOptions& table_opt_; bool seperator_is_key_plus_seq_; + bool 
use_value_delta_encoding_; // true if an external entity (such as filter partition builder) request // cutting the next partition bool partition_cut_requested_ = true; // true if it should cut the next filter partition block bool cut_filter_block = false; + BlockHandle last_encoded_handle_; }; } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/internal_iterator.h rocksdb-5.17.2/table/internal_iterator.h --- rocksdb-5.15.10/table/internal_iterator.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/internal_iterator.h 2018-11-12 19:57:32.000000000 +0000 @@ -10,15 +10,17 @@ #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" #include "rocksdb/status.h" +#include "table/format.h" namespace rocksdb { class PinnedIteratorsManager; -class InternalIterator : public Cleanable { +template +class InternalIteratorBase : public Cleanable { public: - InternalIterator() {} - virtual ~InternalIterator() {} + InternalIteratorBase() {} + virtual ~InternalIteratorBase() {} // An iterator is either positioned at a key/value pair, or // not valid. This method returns true iff the iterator is valid. @@ -66,7 +68,7 @@ // the returned slice is valid only until the next modification of // the iterator. // REQUIRES: Valid() - virtual Slice value() const = 0; + virtual TValue value() const = 0; // If an error has occurred, return it. Else return an ok status. // If non-blocking IO is requested and this operation cannot be @@ -117,14 +119,24 @@ private: // No copying allowed - InternalIterator(const InternalIterator&) = delete; - InternalIterator& operator=(const InternalIterator&) = delete; + InternalIteratorBase(const InternalIteratorBase&) = delete; + InternalIteratorBase& operator=(const InternalIteratorBase&) = delete; }; +using InternalIterator = InternalIteratorBase; + // Return an empty iterator (yields nothing). -extern InternalIterator* NewEmptyInternalIterator(); +template +extern InternalIteratorBase* NewEmptyInternalIterator(); // Return an empty iterator with the specified status. -extern InternalIterator* NewErrorInternalIterator(const Status& status); +template +extern InternalIteratorBase* NewErrorInternalIterator( + const Status& status); + +// Return an empty iterator with the specified status, allocated arena. 
+template +extern InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/iterator.cc rocksdb-5.17.2/table/iterator.cc --- rocksdb-5.15.10/table/iterator.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/iterator.cc 2018-11-12 19:57:32.000000000 +0000 @@ -131,7 +131,8 @@ Status status_; }; -class EmptyInternalIterator : public InternalIterator { +template +class EmptyInternalIterator : public InternalIteratorBase { public: explicit EmptyInternalIterator(const Status& s) : status_(s) {} virtual bool Valid() const override { return false; } @@ -145,9 +146,9 @@ assert(false); return Slice(); } - Slice value() const override { + TValue value() const override { assert(false); - return Slice(); + return TValue(); } virtual Status status() const override { return status_; } @@ -164,30 +165,48 @@ return new EmptyIterator(status); } -InternalIterator* NewEmptyInternalIterator() { - return new EmptyInternalIterator(Status::OK()); -} - -InternalIterator* NewEmptyInternalIterator(Arena* arena) { +template +InternalIteratorBase* NewErrorInternalIterator(const Status& status) { + return new EmptyInternalIterator(status); +} +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status); + +template +InternalIteratorBase* NewErrorInternalIterator(const Status& status, + Arena* arena) { if (arena == nullptr) { - return NewEmptyInternalIterator(); + return NewErrorInternalIterator(status); } else { auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); - return new (mem) EmptyInternalIterator(Status::OK()); + return new (mem) EmptyInternalIterator(status); } } - -InternalIterator* NewErrorInternalIterator(const Status& status) { - return new EmptyInternalIterator(status); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); + +template +InternalIteratorBase* NewEmptyInternalIterator() { + return new EmptyInternalIterator(Status::OK()); } +template InternalIteratorBase* NewEmptyInternalIterator(); +template InternalIteratorBase* NewEmptyInternalIterator(); -InternalIterator* NewErrorInternalIterator(const Status& status, Arena* arena) { +template +InternalIteratorBase* NewEmptyInternalIterator(Arena* arena) { if (arena == nullptr) { - return NewErrorInternalIterator(status); + return NewEmptyInternalIterator(); } else { auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); - return new (mem) EmptyInternalIterator(status); + return new (mem) EmptyInternalIterator(Status::OK()); } } +template InternalIteratorBase* NewEmptyInternalIterator( + Arena* arena); +template InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/iterator_wrapper.h rocksdb-5.17.2/table/iterator_wrapper.h --- rocksdb-5.15.10/table/iterator_wrapper.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/iterator_wrapper.h 2018-11-12 19:57:32.000000000 +0000 @@ -19,19 +19,21 @@ // the valid() and key() results for an underlying iterator. // This can help avoid virtual function calls and also gives better // cache locality. 
-class IteratorWrapper { +template +class IteratorWrapperBase { public: - IteratorWrapper() : iter_(nullptr), valid_(false) {} - explicit IteratorWrapper(InternalIterator* _iter) : iter_(nullptr) { + IteratorWrapperBase() : iter_(nullptr), valid_(false) {} + explicit IteratorWrapperBase(InternalIteratorBase* _iter) + : iter_(nullptr) { Set(_iter); } - ~IteratorWrapper() {} - InternalIterator* iter() const { return iter_; } + ~IteratorWrapperBase() {} + InternalIteratorBase* iter() const { return iter_; } // Set the underlying Iterator to _iter and return // previous underlying Iterator. - InternalIterator* Set(InternalIterator* _iter) { - InternalIterator* old_iter = iter_; + InternalIteratorBase* Set(InternalIteratorBase* _iter) { + InternalIteratorBase* old_iter = iter_; iter_ = _iter; if (iter_ == nullptr) { @@ -47,7 +49,7 @@ if (!is_arena_mode) { delete iter_; } else { - iter_->~InternalIterator(); + iter_->~InternalIteratorBase(); } } } @@ -55,7 +57,10 @@ // Iterator interface methods bool Valid() const { return valid_; } Slice key() const { assert(Valid()); return key_; } - Slice value() const { assert(Valid()); return iter_->value(); } + TValue value() const { + assert(Valid()); + return iter_->value(); + } // Methods below require iter() != nullptr Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); iter_->Next(); Update(); } @@ -91,17 +96,16 @@ } } - InternalIterator* iter_; + InternalIteratorBase* iter_; bool valid_; Slice key_; }; +using IteratorWrapper = IteratorWrapperBase; + class Arena; // Return an empty iterator (yields nothing) allocated from arena. -extern InternalIterator* NewEmptyInternalIterator(Arena* arena); - -// Return an empty iterator with the specified status, allocated arena. -extern InternalIterator* NewErrorInternalIterator(const Status& status, - Arena* arena); +template +extern InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/merging_iterator.cc rocksdb-5.17.2/table/merging_iterator.cc --- rocksdb-5.15.10/table/merging_iterator.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/merging_iterator.cc 2018-11-12 19:57:32.000000000 +0000 @@ -387,7 +387,7 @@ Arena* arena, bool prefix_seek_mode) { assert(n >= 0); if (n == 0) { - return NewEmptyInternalIterator(arena); + return NewEmptyInternalIterator(arena); } else if (n == 1) { return list[0]; } else { diff -Nru rocksdb-5.15.10/table/merging_iterator.h rocksdb-5.17.2/table/merging_iterator.h --- rocksdb-5.15.10/table/merging_iterator.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/merging_iterator.h 2018-11-12 19:57:32.000000000 +0000 @@ -15,9 +15,11 @@ namespace rocksdb { class Comparator; -class InternalIterator; class Env; class Arena; +template +class InternalIteratorBase; +using InternalIterator = InternalIteratorBase; // Return an iterator that provided the union of the data in // children[0,n-1]. 
Takes ownership of the child iterators and diff -Nru rocksdb-5.15.10/table/meta_blocks.cc rocksdb-5.17.2/table/meta_blocks.cc --- rocksdb-5.15.10/table/meta_blocks.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/meta_blocks.cc 2018-11-12 19:57:32.000000000 +0000 @@ -76,6 +76,8 @@ Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size); } Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key); + Add(TablePropertiesNames::kIndexValueIsDeltaEncoded, + props.index_value_is_delta_encoded); Add(TablePropertiesNames::kNumEntries, props.num_entries); Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); @@ -218,6 +220,8 @@ &new_table_properties->top_level_index_size}, {TablePropertiesNames::kIndexKeyIsUserKey, &new_table_properties->index_key_is_user_key}, + {TablePropertiesNames::kIndexValueIsDeltaEncoded, + &new_table_properties->index_value_is_delta_encoded}, {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size}, {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size}, {TablePropertiesNames::kRawValueSize, diff -Nru rocksdb-5.15.10/table/meta_blocks.h rocksdb-5.17.2/table/meta_blocks.h --- rocksdb-5.15.10/table/meta_blocks.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/meta_blocks.h 2018-11-12 19:57:32.000000000 +0000 @@ -27,7 +27,6 @@ class Logger; class RandomAccessFile; struct TableProperties; -class InternalIterator; class MetaIndexBuilder { public: diff -Nru rocksdb-5.15.10/table/mock_table.cc rocksdb-5.17.2/table/mock_table.cc --- rocksdb-5.15.10/table/mock_table.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/mock_table.cc 2018-11-12 19:57:32.000000000 +0000 @@ -93,7 +93,7 @@ return s; } - WritableFileWriter file_writer(std::move(file), EnvOptions()); + WritableFileWriter file_writer(std::move(file), fname, EnvOptions()); uint32_t id = GetAndWriteNextID(&file_writer); file_system_.files.insert({id, std::move(file_contents)}); diff -Nru rocksdb-5.15.10/table/partitioned_filter_block.cc rocksdb-5.17.2/table/partitioned_filter_block.cc --- rocksdb-5.15.10/table/partitioned_filter_block.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/partitioned_filter_block.cc 2018-11-12 19:57:32.000000000 +0000 @@ -26,12 +26,17 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, const uint32_t partition_size) : FullFilterBlockBuilder(prefix_extractor, whole_key_filtering, filter_bits_builder), - index_on_filter_block_builder_(index_block_restart_interval), - index_on_filter_block_builder_without_seq_(index_block_restart_interval), + index_on_filter_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_on_filter_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), p_index_builder_(p_index_builder), filters_in_partition_(0), num_added_(0) { @@ -73,10 +78,18 @@ FilterEntry& last_entry = filters.front(); std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); - index_on_filter_block_builder_.Add(last_entry.key, handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + 
&handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); if (!p_index_builder_->seperator_is_key_plus_seq()) { index_on_filter_block_builder_without_seq_.Add( - ExtractUserKey(last_entry.key), handle_encoding); + ExtractUserKey(last_entry.key), handle_encoding, + &handle_delta_encoding_slice); } filters.pop_front(); } else { @@ -109,12 +122,14 @@ const SliceTransform* prefix_extractor, bool _whole_key_filtering, BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/, Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq) + const BlockBasedTable* table, const bool index_key_includes_seq, + const bool index_value_is_full) : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), prefix_extractor_(prefix_extractor), comparator_(comparator), table_(table), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { idx_on_fltr_blk_.reset(new Block(std::move(contents), kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, stats)); @@ -134,15 +149,10 @@ Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); biter.SeekToFirst(); for (; biter.Valid(); biter.Next()) { - auto input = biter.value(); - auto s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - continue; - } + handle = biter.value(); auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, table_->rep_->cache_key_prefix_size, handle, cache_key); @@ -168,7 +178,7 @@ } bool cached = false; auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, &filter_handle, no_io, + GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, &cached, prefix_extractor); if (UNLIKELY(!filter_partition.value)) { return true; @@ -207,7 +217,7 @@ } bool cached = false; auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, &filter_handle, no_io, + GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, &cached, prefix_extractor); if (UNLIKELY(!filter_partition.value)) { return true; @@ -225,29 +235,26 @@ return res; } -Slice PartitionedFilterBlockReader::GetFilterPartitionHandle( +BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( const Slice& entry) { IndexBlockIter iter; Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { - return Slice(); + return BlockHandle(0, 0); } assert(iter.Valid()); - Slice handle_value = iter.value(); - return handle_value; + BlockHandle fltr_blk_handle = iter.value(); + return fltr_blk_handle; } BlockBasedTable::CachableEntry PartitionedFilterBlockReader::GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, Slice* handle_value, const bool no_io, - bool* cached, const SliceTransform* prefix_extractor) { - BlockHandle fltr_blk_handle; - auto s = fltr_blk_handle.DecodeFrom(handle_value); - assert(s.ok()); 
+ FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, + const bool no_io, bool* cached, const SliceTransform* prefix_extractor) { const bool is_a_filter_partition = true; auto block_cache = table_->rep_->table_options.block_cache.get(); if (LIKELY(block_cache != nullptr)) { @@ -299,39 +306,25 @@ // Before read partitions, prefetch them to avoid lots of IOs auto rep = table_->rep_; IndexBlockIter biter; - BlockHandle handle; Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); - Slice input = biter.value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read first index partition"); - return; - } + BlockHandle handle = biter.value(); uint64_t prefetch_off = handle.offset(); // Read the last block's offset biter.SeekToLast(); - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read last index partition"); - return; - } + handle = biter.value(); uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; auto& file = table_->rep_->file; prefetch_buffer.reset(new FilePrefetchBuffer()); + Status s; s = prefetch_buffer->Prefetch(file.get(), prefetch_off, static_cast(prefetch_len)); @@ -339,14 +332,7 @@ biter.SeekToFirst(); Cache* block_cache = rep->table_options.block_cache.get(); for (; biter.Valid(); biter.Next()) { - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, "Could not read index partition"); - continue; - } - + handle = biter.value(); const bool no_io = true; const bool is_a_filter_partition = true; auto filter = table_->GetFilter( diff -Nru rocksdb-5.15.10/table/partitioned_filter_block.h rocksdb-5.17.2/table/partitioned_filter_block.h --- rocksdb-5.15.10/table/partitioned_filter_block.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/partitioned_filter_block.h 2018-11-12 19:57:32.000000000 +0000 @@ -26,6 +26,7 @@ explicit PartitionedFilterBlockBuilder( const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, const uint32_t partition_size); @@ -65,6 +66,7 @@ uint32_t filters_in_partition_; // Number of keys added size_t num_added_; + BlockHandle last_encoded_handle_; }; class PartitionedFilterBlockReader : public FilterBlockReader, @@ -74,7 +76,8 @@ const SliceTransform* prefix_extractor, bool whole_key_filtering, BlockContents&& contents, FilterBitsReader* filter_bits_reader, Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq); + const BlockBasedTable* table, const bool index_key_includes_seq, + const bool index_value_is_full); virtual ~PartitionedFilterBlockReader(); virtual bool IsBlockBased() override { return false; } @@ -89,10 +92,11 @@ virtual size_t ApproximateMemoryUsage() const override; private: - Slice GetFilterPartitionHandle(const Slice& entry); + BlockHandle 
GetFilterPartitionHandle(const Slice& entry); BlockBasedTable::CachableEntry GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, Slice* handle, const bool no_io, - bool* cached, const SliceTransform* prefix_extractor = nullptr); + FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, + const bool no_io, bool* cached, + const SliceTransform* prefix_extractor = nullptr); virtual void CacheDependencies( bool bin, const SliceTransform* prefix_extractor) override; @@ -101,6 +105,7 @@ const InternalKeyComparator comparator_; const BlockBasedTable* table_; const bool index_key_includes_seq_; + const bool index_value_is_full_; std::unordered_map> filter_map_; diff -Nru rocksdb-5.15.10/table/partitioned_filter_block_test.cc rocksdb-5.17.2/table/partitioned_filter_block_test.cc --- rocksdb-5.15.10/table/partitioned_filter_block_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/partitioned_filter_block_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -50,7 +50,9 @@ } }; -class PartitionedFilterBlockTest : public testing::Test { +class PartitionedFilterBlockTest + : public testing::Test, + virtual public ::testing::WithParamInterface { public: BlockBasedTableOptions table_options_; InternalKeyComparator icomp = InternalKeyComparator(BytewiseComparator()); @@ -60,6 +62,8 @@ table_options_.no_block_cache = true; // Otherwise BlockBasedTable::Close // will access variable that are not // initialized in our mocked version + table_options_.format_version = GetParam(); + table_options_.index_block_restart_interval = 3; } std::shared_ptr cache_; @@ -100,7 +104,9 @@ } PartitionedIndexBuilder* NewIndexBuilder() { - return PartitionedIndexBuilder::CreateIndexBuilder(&icomp, table_options_); + const bool kValueDeltaEncoded = true; + return PartitionedIndexBuilder::CreateIndexBuilder( + &icomp, !kValueDeltaEncoded, table_options_); } PartitionedFilterBlockBuilder* NewBuilder( @@ -113,11 +119,12 @@ 99) / 100); partition_size = std::max(partition_size, static_cast(1)); + const bool kValueDeltaEncoded = true; return new PartitionedFilterBlockBuilder( prefix_extractor, table_options_.whole_key_filtering, table_options_.filter_policy->GetFilterBitsBuilder(), - table_options_.index_block_restart_interval, p_index_builder, - partition_size); + table_options_.index_block_restart_interval, !kValueDeltaEncoded, + p_index_builder, partition_size); } std::unique_ptr table; @@ -143,7 +150,8 @@ !kSkipFilters, !kImmortal))); auto reader = new PartitionedFilterBlockReader( prefix_extractor, true, BlockContents(slice, false, kNoCompression), - nullptr, nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq()); + nullptr, nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq(), + !pib->get_use_value_delta_encoding()); return reader; } @@ -275,14 +283,19 @@ } }; -TEST_F(PartitionedFilterBlockTest, EmptyBuilder) { +INSTANTIATE_TEST_CASE_P(FormatDef, PartitionedFilterBlockTest, + testing::Values(test::kDefaultFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatLatest, PartitionedFilterBlockTest, + testing::Values(test::kLatestFormatVersion)); + +TEST_P(PartitionedFilterBlockTest, EmptyBuilder) { std::unique_ptr pib(NewIndexBuilder()); std::unique_ptr builder(NewBuilder(pib.get())); const bool empty = true; VerifyReader(builder.get(), pib.get(), empty); } -TEST_F(PartitionedFilterBlockTest, OneBlock) { +TEST_P(PartitionedFilterBlockTest, OneBlock) { uint64_t max_index_size = MaxIndexSize(); for (uint64_t i = 1; i < max_index_size + 1; i++) { table_options_.metadata_block_size = i; @@ -290,7 +303,7 @@ 
} } -TEST_F(PartitionedFilterBlockTest, TwoBlocksPerKey) { +TEST_P(PartitionedFilterBlockTest, TwoBlocksPerKey) { uint64_t max_index_size = MaxIndexSize(); for (uint64_t i = 1; i < max_index_size + 1; i++) { table_options_.metadata_block_size = i; @@ -300,7 +313,7 @@ // This reproduces the bug that a prefix is the same among multiple consecutive // blocks but the bug would add it only to the first block. -TEST_F(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { +TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { // some small number to cause partition cuts table_options_.metadata_block_size = 1; std::unique_ptr prefix_extractor @@ -326,7 +339,7 @@ } } -TEST_F(PartitionedFilterBlockTest, OneBlockPerKey) { +TEST_P(PartitionedFilterBlockTest, OneBlockPerKey) { uint64_t max_index_size = MaxIndexSize(); for (uint64_t i = 1; i < max_index_size + 1; i++) { table_options_.metadata_block_size = i; @@ -334,7 +347,7 @@ } } -TEST_F(PartitionedFilterBlockTest, PartitionCount) { +TEST_P(PartitionedFilterBlockTest, PartitionCount) { int num_keys = sizeof(keys) / sizeof(*keys); table_options_.metadata_block_size = std::max(MaxIndexSize(), MaxFilterSize()); diff -Nru rocksdb-5.15.10/table/plain_table_reader.cc rocksdb-5.17.2/table/plain_table_reader.cc --- rocksdb-5.15.10/table/plain_table_reader.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/plain_table_reader.cc 2018-11-12 19:57:32.000000000 +0000 @@ -277,7 +277,7 @@ Status PlainTableReader::MmapDataIfNeeded() { if (file_info_.is_mmap_mode) { // Get mmapped memory. - return file_info_.file->Read(0, file_size_, &file_info_.file_data, nullptr); + return file_info_.file->Read(0, static_cast(file_size_), &file_info_.file_data, nullptr); } return Status::OK(); } diff -Nru rocksdb-5.15.10/table/plain_table_reader.h rocksdb-5.17.2/table/plain_table_reader.h --- rocksdb-5.15.10/table/plain_table_reader.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/plain_table_reader.h 2018-11-12 19:57:32.000000000 +0000 @@ -38,7 +38,6 @@ class InternalKeyComparator; class PlainTableKeyDecoder; class GetContext; -class InternalIterator; using std::unique_ptr; using std::unordered_map; diff -Nru rocksdb-5.15.10/table/sst_file_writer.cc rocksdb-5.17.2/table/sst_file_writer.cc --- rocksdb-5.15.10/table/sst_file_writer.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/sst_file_writer.cc 2018-11-12 19:57:32.000000000 +0000 @@ -238,7 +238,7 @@ nullptr /* compression_dict */, r->skip_filters, r->column_family_name, unknown_level); r->file_writer.reset( - new WritableFileWriter(std::move(sst_file), r->env_options)); + new WritableFileWriter(std::move(sst_file), file_path, r->env_options)); // TODO(tec) : If table_factory is using compressed block cache, we will // be adding the external sst file blocks into it, which is wasteful. 
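// WritableFileWriter now takes the file name as an extra constructor argument
// (several hunks in this patch add it; tests that have no real path pass
// "" /* don't care */). A sketch of the updated call pattern, assuming the
// usual Env::NewWritableFile() flow:
//
//   unique_ptr<WritableFile> file;
//   Status s = env->NewWritableFile(fname, &file, env_options);
//   if (s.ok()) {
//     unique_ptr<WritableFileWriter> writer(
//         new WritableFileWriter(std::move(file), fname, env_options));
//   }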
diff -Nru rocksdb-5.15.10/table/table_builder.h rocksdb-5.17.2/table/table_builder.h --- rocksdb-5.15.10/table/table_builder.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/table_builder.h 2018-11-12 19:57:32.000000000 +0000 @@ -13,6 +13,7 @@ #include #include #include +#include "db/dbformat.h" #include "db/table_properties_collector.h" #include "options/cf_options.h" #include "rocksdb/options.h" @@ -32,13 +33,25 @@ const InternalKeyComparator& _internal_comparator, bool _skip_filters = false, bool _immortal = false, int _level = -1) + : TableReaderOptions(_ioptions, _prefix_extractor, _env_options, + _internal_comparator, _skip_filters, _immortal, + _level, 0 /* _largest_seqno */) {} + + // @param skip_filters Disables loading/accessing the filter block + TableReaderOptions(const ImmutableCFOptions& _ioptions, + const SliceTransform* _prefix_extractor, + const EnvOptions& _env_options, + const InternalKeyComparator& _internal_comparator, + bool _skip_filters, bool _immortal, int _level, + SequenceNumber _largest_seqno) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), env_options(_env_options), internal_comparator(_internal_comparator), skip_filters(_skip_filters), immortal(_immortal), - level(_level) {} + level(_level), + largest_seqno(_largest_seqno) {} const ImmutableCFOptions& ioptions; const SliceTransform* prefix_extractor; @@ -50,6 +63,8 @@ bool immortal; // what level this table/file is on, -1 for "not set, don't know" int level; + // largest seqno in the table + SequenceNumber largest_seqno; }; struct TableBuilderOptions { diff -Nru rocksdb-5.15.10/table/table_properties.cc rocksdb-5.17.2/table/table_properties.cc --- rocksdb-5.15.10/table/table_properties.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/table_properties.cc 2018-11-12 19:57:32.000000000 +0000 @@ -94,8 +94,9 @@ AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); char index_block_size_str[80]; snprintf(index_block_size_str, sizeof(index_block_size_str), - "index block size (user-key? %d)", - static_cast(index_key_is_user_key)); + "index block size (user-key? %d, delta-value? %d)", + static_cast(index_key_is_user_key), + static_cast(index_value_is_delta_encoded)); AppendProperty(result, index_block_size_str, index_size, prop_delim, kv_delim); if (index_partitions != 0) { @@ -163,6 +164,7 @@ index_partitions += tp.index_partitions; top_level_index_size += tp.top_level_index_size; index_key_is_user_key += tp.index_key_is_user_key; + index_value_is_delta_encoded += tp.index_value_is_delta_encoded; filter_size += tp.filter_size; raw_key_size += tp.raw_key_size; raw_value_size += tp.raw_value_size; @@ -181,6 +183,8 @@ "rocksdb.top-level.index.size"; const std::string TablePropertiesNames::kIndexKeyIsUserKey = "rocksdb.index.key.is.user.key"; +const std::string TablePropertiesNames::kIndexValueIsDeltaEncoded = + "rocksdb.index.value.is.delta.encoded"; const std::string TablePropertiesNames::kFilterSize = "rocksdb.filter.size"; const std::string TablePropertiesNames::kRawKeySize = diff -Nru rocksdb-5.15.10/table/table_properties_internal.h rocksdb-5.17.2/table/table_properties_internal.h --- rocksdb-5.15.10/table/table_properties_internal.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/table_properties_internal.h 2018-11-12 19:57:32.000000000 +0000 @@ -10,7 +10,6 @@ namespace rocksdb { -class InternalIterator; class BlockHandle; // Seek to the properties block. 
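// The new table property "rocksdb.index.value.is.delta.encoded" records
// whether the index block's values are delta encoded, i.e. the format_version
// 4 layout where only a varsigned size delta is stored per entry and the
// offset is reconstructed from the previous handle (see
// BlockHandle::DecodeSizeFrom above). A sketch of reading the property back
// through the existing properties API, assuming an already-open DB:
//
//   TablePropertiesCollection all_props;
//   Status s = db->GetPropertiesOfAllTables(&all_props);
//   if (s.ok()) {
//     for (const auto& file_and_props : all_props) {
//       const TableProperties& tp = *file_and_props.second;
//       // non-zero when this SST's index values are delta encoded
//       printf("%s: %llu\n", file_and_props.first.c_str(),
//              static_cast<unsigned long long>(
//                  tp.index_value_is_delta_encoded));
//     }
//   }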
diff -Nru rocksdb-5.15.10/table/table_reader_bench.cc rocksdb-5.17.2/table/table_reader_bench.cc --- rocksdb-5.15.10/table/table_reader_bench.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/table_reader_bench.cc 2018-11-12 19:57:32.000000000 +0000 @@ -94,7 +94,8 @@ std::vector > int_tbl_prop_collector_factories; - file_writer.reset(new WritableFileWriter(std::move(file), env_options)); + file_writer.reset( + new WritableFileWriter(std::move(file), file_name, env_options)); int unknown_level = -1; tb = opts.table_factory->NewTableBuilder( TableBuilderOptions( diff -Nru rocksdb-5.15.10/table/table_reader.h rocksdb-5.17.2/table/table_reader.h --- rocksdb-5.15.10/table/table_reader.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/table_reader.h 2018-11-12 19:57:32.000000000 +0000 @@ -21,7 +21,6 @@ struct ReadOptions; struct TableProperties; class GetContext; -class InternalIterator; // A Table is a sorted map from strings to strings. Tables are // immutable and persistent. A Table may be safely accessed from diff -Nru rocksdb-5.15.10/table/table_test.cc rocksdb-5.17.2/table/table_test.cc --- rocksdb-5.15.10/table/table_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/table_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -323,7 +323,8 @@ const stl_wrappers::KVMap& kv_map) override { Reset(); soptions.use_mmap_reads = ioptions.allow_mmap_reads; - file_writer_.reset(test::GetWritableFileWriter(new test::StringSink())); + file_writer_.reset(test::GetWritableFileWriter(new test::StringSink(), + "" /* don't care */)); unique_ptr builder; std::vector> int_tbl_prop_collector_factories; @@ -364,7 +365,8 @@ TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, internal_comparator, !kSkipFilters, !kImmortal, level_), - std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); + std::move(file_reader_), TEST_GetSink()->contents().size(), + &table_reader_); } virtual InternalIterator* NewIterator( @@ -394,12 +396,11 @@ return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, *last_internal_key_), - std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); + std::move(file_reader_), TEST_GetSink()->contents().size(), + &table_reader_); } - virtual TableReader* GetTableReader() { - return table_reader_.get(); - } + virtual TableReader* GetTableReader() { return table_reader_.get(); } virtual bool AnywayDeleteIterator() const override { return convert_to_internal_key_; @@ -1075,6 +1076,7 @@ }; class PlainTableTest : public TableTest {}; class TablePropertyTest : public testing::Test {}; +class BBTTailPrefetchTest : public TableTest {}; INSTANTIATE_TEST_CASE_P(FormatDef, BlockBasedTableTest, testing::Values(test::kDefaultFormatVersion)); @@ -2549,7 +2551,7 @@ PlainTableFactory factory(plain_table_options); test::StringSink sink; unique_ptr file_writer( - test::GetWritableFileWriter(new test::StringSink())); + test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); Options options; const ImmutableCFOptions ioptions(options); const MutableCFOptions moptions(options); @@ -2987,9 +2989,13 @@ class IndexBlockRestartIntervalTest : public TableTest, - public ::testing::WithParamInterface { + public ::testing::WithParamInterface> { public: - static std::vector GetRestartValues() { return {-1, 0, 1, 8, 16, 32}; } + static std::vector> GetRestartValues() { + return {{-1, false}, {0, false}, {1, false}, {8, false}, + {16, false}, {32, 
false}, {-1, true}, {0, true}, + {1, true}, {8, true}, {16, true}, {32, true}}; + } }; INSTANTIATE_TEST_CASE_P( @@ -3001,12 +3007,16 @@ const int kKeySize = 100; const int kValSize = 500; - int index_block_restart_interval = GetParam(); + const int index_block_restart_interval = std::get<0>(GetParam()); + const bool value_delta_encoding = std::get<1>(GetParam()); Options options; BlockBasedTableOptions table_options; table_options.block_size = 64; // small block size to get big index block table_options.index_block_restart_interval = index_block_restart_interval; + if (value_delta_encoding) { + table_options.format_version = 4; + } options.table_factory.reset(new BlockBasedTableFactory(table_options)); TableConstructor c(BytewiseComparator()); @@ -3131,10 +3141,18 @@ // rocksdb still works. } -TEST_P(BlockBasedTableTest, TableWithGlobalSeqno) { +/* + * Disable TableWithGlobalSeqno since RocksDB does not store global_seqno in + * the SST file any more. Instead, RocksDB deduces global_seqno from the + * MANIFEST while reading from an SST. Therefore, it's not possible to test the + * functionality of global_seqno in a single, isolated unit test without the + * involvement of Version, VersionSet, etc. + */ +TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); test::StringSink* sink = new test::StringSink(); - unique_ptr file_writer(test::GetWritableFileWriter(sink)); + unique_ptr file_writer( + test::GetWritableFileWriter(sink, "" /* don't care */)); Options options; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); const ImmutableCFOptions ioptions(options); @@ -3315,7 +3333,8 @@ BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - unique_ptr file_writer(test::GetWritableFileWriter(sink)); + unique_ptr file_writer( + test::GetWritableFileWriter(sink, "" /* don't care */)); Options options; options.compression = kNoCompression; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); @@ -3404,7 +3423,8 @@ BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - unique_ptr file_writer(test::GetWritableFileWriter(sink)); + unique_ptr file_writer( + test::GetWritableFileWriter(sink, "" /* don't care */)); Options options; options.compression = kNoCompression; @@ -3594,6 +3614,145 @@ ASSERT_NOK(rocksdb::DB::Open(options, kDBPath, &db)); } +TEST_F(BBTTailPrefetchTest, TestTailPrefetchStats) { + TailPrefetchStats tpstats; + ASSERT_EQ(0, tpstats.GetSuggestedPrefetchSize()); + tpstats.RecordEffectiveSize(size_t{1000}); + tpstats.RecordEffectiveSize(size_t{1005}); + tpstats.RecordEffectiveSize(size_t{1002}); + ASSERT_EQ(1005, tpstats.GetSuggestedPrefetchSize()); + + // One single super large value shouldn't influence much + tpstats.RecordEffectiveSize(size_t{1002000}); + tpstats.RecordEffectiveSize(size_t{999}); + ASSERT_LE(1005, tpstats.GetSuggestedPrefetchSize()); + ASSERT_GT(1200, tpstats.GetSuggestedPrefetchSize()); + + // Only history of 32 is kept + for (int i = 0; i < 32; i++) { + tpstats.RecordEffectiveSize(size_t{100}); + } + ASSERT_EQ(100, tpstats.GetSuggestedPrefetchSize()); + + // 16 large values and 16 small values. The result should be closer + // to the small value as the algorithm. 
+ for (int i = 0; i < 16; i++) { + tpstats.RecordEffectiveSize(size_t{1000}); + } + tpstats.RecordEffectiveSize(size_t{10}); + tpstats.RecordEffectiveSize(size_t{20}); + for (int i = 0; i < 6; i++) { + tpstats.RecordEffectiveSize(size_t{100}); + } + ASSERT_LE(80, tpstats.GetSuggestedPrefetchSize()); + ASSERT_GT(200, tpstats.GetSuggestedPrefetchSize()); +} + +TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) { + TailPrefetchStats tpstats; + FilePrefetchBuffer buffer(nullptr, 0, 0, false, true); + buffer.TryReadFromCache(500, 10, nullptr); + buffer.TryReadFromCache(480, 10, nullptr); + buffer.TryReadFromCache(490, 10, nullptr); + ASSERT_EQ(480, buffer.min_offset_read()); +} + +TEST_P(BlockBasedTableTest, DataBlockHashIndex) { + const int kNumKeys = 500; + const int kKeySize = 8; + const int kValSize = 40; + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.data_block_index_type = + BlockBasedTableOptions::kDataBlockBinaryAndHash; + + Options options; + options.comparator = BytewiseComparator(); + + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + TableConstructor c(options.comparator); + + static Random rnd(1048); + for (int i = 0; i < kNumKeys; i++) { + // padding one "0" to mark existent keys. + std::string random_key(RandomString(&rnd, kKeySize - 1) + "1"); + InternalKey k(random_key, 0, kTypeValue); + c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize)); + } + + std::vector keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + + auto reader = c.GetTableReader(); + + std::unique_ptr seek_iter; + seek_iter.reset( + reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + for (int i = 0; i < 2; ++i) { + ReadOptions ro; + // for every kv, we seek using two method: Get() and Seek() + // Get() will use the SuffixIndexHash in Block. For non-existent key it + // will invalidate the iterator + // Seek() will use the default BinarySeek() in Block. So for non-existent + // key it will land at the closest key that is large than target. 
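Both table-format features exercised by the tests in this hunk, index value delta encoding via format_version 4 and the data-block hash index via kDataBlockBinaryAndHash, are configured through BlockBasedTableOptions. A minimal sketch of how an application might opt into them, assuming these options behave as the tests here suggest and using a hypothetical database path:

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    int main() {
      rocksdb::BlockBasedTableOptions table_options;
      // format_version 4 turns on value delta encoding in index blocks,
      // mirroring IndexBlockRestartIntervalTest above.
      table_options.format_version = 4;
      // Build the binary-and-hash index for data blocks, as in the
      // DataBlockHashIndex test; point lookups may then consult the hash
      // index while iterators keep using binary search.
      table_options.data_block_index_type =
          rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;

      rocksdb::Options options;
      options.create_if_missing = true;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));

      rocksdb::DB* db = nullptr;
      // "/tmp/hash_index_example" is a placeholder path for illustration.
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/hash_index_example", &db);
      if (s.ok()) {
        s = db->Put(rocksdb::WriteOptions(), "key", "value");
        delete db;
      }
      return s.ok() ? 0 : 1;
    }

Tables written with a higher format_version can only be read back by releases that understand it, so the setting is typically changed only after all readers have been upgraded.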
+ + // Search for existent keys + for (auto& kv : kvmap) { + if (i == 0) { + // Search using Seek() + seek_iter->Seek(kv.first); + ASSERT_OK(seek_iter->status()); + ASSERT_TRUE(seek_iter->Valid()); + ASSERT_EQ(seek_iter->key(), kv.first); + ASSERT_EQ(seek_iter->value(), kv.second); + } else { + // Search using Get() + PinnableSlice value; + std::string user_key = ExtractUserKey(kv.first).ToString(); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, nullptr); + ASSERT_OK(reader->Get(ro, kv.first, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, Slice(kv.second)); + value.Reset(); + } + } + + // Search for non-existent keys + for (auto& kv : kvmap) { + std::string user_key = ExtractUserKey(kv.first).ToString(); + user_key.back() = '0'; // make it non-existent key + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + if (i == 0) { // Search using Seek() + seek_iter->Seek(encoded_key); + ASSERT_OK(seek_iter->status()); + if (seek_iter->Valid()) { + ASSERT_TRUE(BytewiseComparator()->Compare( + user_key, ExtractUserKey(seek_iter->key())) < 0); + } + } else { // Search using Get() + PinnableSlice value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, nullptr); + ASSERT_OK(reader->Get(ro, encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + value.Reset(); + } + } + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff -Nru rocksdb-5.15.10/table/two_level_iterator.cc rocksdb-5.17.2/table/two_level_iterator.cc --- rocksdb-5.15.10/table/two_level_iterator.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/two_level_iterator.cc 2018-11-12 19:57:32.000000000 +0000 @@ -19,12 +19,13 @@ namespace { -class TwoLevelIterator : public InternalIterator { +class TwoLevelIndexIterator : public InternalIteratorBase { public: - explicit TwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter); + explicit TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter); - virtual ~TwoLevelIterator() { + virtual ~TwoLevelIndexIterator() { first_level_iter_.DeleteIter(false /* is_arena_mode */); second_level_iter_.DeleteIter(false /* is_arena_mode */); delete state_; @@ -42,7 +43,7 @@ assert(Valid()); return second_level_iter_.key(); } - virtual Slice value() const override { + virtual BlockHandle value() const override { assert(Valid()); return second_level_iter_.value(); } @@ -68,23 +69,24 @@ } void SkipEmptyDataBlocksForward(); void SkipEmptyDataBlocksBackward(); - void SetSecondLevelIterator(InternalIterator* iter); + void SetSecondLevelIterator(InternalIteratorBase* iter); void InitDataBlock(); TwoLevelIteratorState* state_; - IteratorWrapper first_level_iter_; - IteratorWrapper second_level_iter_; // May be nullptr + IteratorWrapperBase first_level_iter_; + IteratorWrapperBase second_level_iter_; // May be nullptr Status status_; // If second_level_iter is non-nullptr, then "data_block_handle_" holds the // "index_value" passed to block_function_ to create the second_level_iter. 
- std::string data_block_handle_; + BlockHandle data_block_handle_; }; -TwoLevelIterator::TwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter) +TwoLevelIndexIterator::TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter) : state_(state), first_level_iter_(first_level_iter) {} -void TwoLevelIterator::Seek(const Slice& target) { +void TwoLevelIndexIterator::Seek(const Slice& target) { first_level_iter_.Seek(target); InitDataBlock(); @@ -94,7 +96,7 @@ SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::SeekForPrev(const Slice& target) { +void TwoLevelIndexIterator::SeekForPrev(const Slice& target) { first_level_iter_.Seek(target); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -112,7 +114,7 @@ } } -void TwoLevelIterator::SeekToFirst() { +void TwoLevelIndexIterator::SeekToFirst() { first_level_iter_.SeekToFirst(); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -121,7 +123,7 @@ SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::SeekToLast() { +void TwoLevelIndexIterator::SeekToLast() { first_level_iter_.SeekToLast(); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -130,19 +132,19 @@ SkipEmptyDataBlocksBackward(); } -void TwoLevelIterator::Next() { +void TwoLevelIndexIterator::Next() { assert(Valid()); second_level_iter_.Next(); SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::Prev() { +void TwoLevelIndexIterator::Prev() { assert(Valid()); second_level_iter_.Prev(); SkipEmptyDataBlocksBackward(); } -void TwoLevelIterator::SkipEmptyDataBlocksForward() { +void TwoLevelIndexIterator::SkipEmptyDataBlocksForward() { while (second_level_iter_.iter() == nullptr || (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { // Move to next block @@ -158,7 +160,7 @@ } } -void TwoLevelIterator::SkipEmptyDataBlocksBackward() { +void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() { while (second_level_iter_.iter() == nullptr || (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { // Move to next block @@ -174,24 +176,26 @@ } } -void TwoLevelIterator::SetSecondLevelIterator(InternalIterator* iter) { - InternalIterator* old_iter = second_level_iter_.Set(iter); +void TwoLevelIndexIterator::SetSecondLevelIterator( + InternalIteratorBase* iter) { + InternalIteratorBase* old_iter = second_level_iter_.Set(iter); delete old_iter; } -void TwoLevelIterator::InitDataBlock() { +void TwoLevelIndexIterator::InitDataBlock() { if (!first_level_iter_.Valid()) { SetSecondLevelIterator(nullptr); } else { - Slice handle = first_level_iter_.value(); + BlockHandle handle = first_level_iter_.value(); if (second_level_iter_.iter() != nullptr && !second_level_iter_.status().IsIncomplete() && - handle.compare(data_block_handle_) == 0) { + handle.offset() == data_block_handle_.offset()) { // second_level_iter is already constructed with this iterator, so // no need to change anything } else { - InternalIterator* iter = state_->NewSecondaryIterator(handle); - data_block_handle_.assign(handle.data(), handle.size()); + InternalIteratorBase* iter = + state_->NewSecondaryIterator(handle); + data_block_handle_ = handle; SetSecondLevelIterator(iter); } } @@ -199,8 +203,9 @@ } // namespace -InternalIterator* NewTwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter) { - return new TwoLevelIterator(state, first_level_iter); +InternalIteratorBase* NewTwoLevelIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter) { + 
return new TwoLevelIndexIterator(state, first_level_iter); } } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/two_level_iterator.h rocksdb-5.17.2/table/two_level_iterator.h --- rocksdb-5.15.10/table/two_level_iterator.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/two_level_iterator.h 2018-11-12 19:57:32.000000000 +0000 @@ -22,7 +22,8 @@ TwoLevelIteratorState() {} virtual ~TwoLevelIteratorState() {} - virtual InternalIterator* NewSecondaryIterator(const Slice& handle) = 0; + virtual InternalIteratorBase* NewSecondaryIterator( + const BlockHandle& handle) = 0; }; @@ -36,7 +37,8 @@ // Uses a supplied function to convert an index_iter value into // an iterator over the contents of the corresponding block. // Note: this function expects first_level_iter was not created using the arena -extern InternalIterator* NewTwoLevelIterator( - TwoLevelIteratorState* state, InternalIterator* first_level_iter); +extern InternalIteratorBase* NewTwoLevelIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter); } // namespace rocksdb diff -Nru rocksdb-5.15.10/TARGETS rocksdb-5.17.2/TARGETS --- rocksdb-5.15.10/TARGETS 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/TARGETS 2018-11-12 19:57:32.000000000 +0000 @@ -1,3 +1,5 @@ +load("@fbcode_macros//build_defs:auto_headers.bzl", "AutoHeaders") + REPO_PATH = package_name() + "/" BUCK_BINS = "buck-out/gen/" + REPO_PATH @@ -171,6 +173,8 @@ "table/cuckoo_table_builder.cc", "table/cuckoo_table_factory.cc", "table/cuckoo_table_reader.cc", + "table/data_block_footer.cc", + "table/data_block_hash_index.cc", "table/flush_block_policy.cc", "table/format.cc", "table/full_filter_block.cc", @@ -218,18 +222,19 @@ "util/slice.cc", "util/sst_file_manager_impl.cc", "util/status.cc", - "util/status_message.cc", "util/string_util.cc", "util/sync_point.cc", "util/sync_point_impl.cc", "util/thread_local.cc", "util/threadpool_imp.cc", + "util/trace_replay.cc", "util/transaction_test_util.cc", "util/xxhash.cc", "utilities/backupable/backupable_db.cc", "utilities/blob_db/blob_compaction_filter.cc", "utilities/blob_db/blob_db.cc", "utilities/blob_db/blob_db_impl.cc", + "utilities/blob_db/blob_db_impl_filesnapshot.cc", "utilities/blob_db/blob_dump_tool.cc", "utilities/blob_db/blob_file.cc", "utilities/blob_db/blob_log_format.cc", @@ -269,6 +274,7 @@ "utilities/simulator_cache/sim_cache.cc", "utilities/spatialdb/spatial_db.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", + "utilities/trace/file_trace_reader_writer.cc", "utilities/transactions/optimistic_transaction.cc", "utilities/transactions/optimistic_transaction_db_impl.cc", "utilities/transactions/pessimistic_transaction.cc", @@ -286,7 +292,7 @@ "utilities/write_batch_with_index/write_batch_with_index.cc", "utilities/write_batch_with_index/write_batch_with_index_internal.cc", ], - headers = AutoHeaders.RECURSIVE_GLOB, + auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, preprocessor_flags = rocksdb_preprocessor_flags, @@ -299,6 +305,7 @@ srcs = [ "db/db_test_util.cc", "table/mock_table.cc", + "tools/trace_analyzer_tool.cc", "util/fault_injection_test_env.cc", "util/testharness.cc", "util/testutil.cc", @@ -307,7 +314,7 @@ "utilities/col_buf_encoder.cc", "utilities/column_aware_encoding_util.cc", ], - headers = AutoHeaders.RECURSIVE_GLOB, + auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = 
rocksdb_compiler_flags, preprocessor_flags = rocksdb_preprocessor_flags, @@ -319,9 +326,10 @@ name = "rocksdb_tools_lib", srcs = [ "tools/db_bench_tool.cc", + "tools/trace_analyzer_tool.cc", "util/testutil.cc", ], - headers = AutoHeaders.RECURSIVE_GLOB, + auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, preprocessor_flags = rocksdb_preprocessor_flags, @@ -332,7 +340,7 @@ cpp_library( name = "env_basic_test_lib", srcs = ["env/env_basic_test.cc"], - headers = AutoHeaders.RECURSIVE_GLOB, + auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, preprocessor_flags = rocksdb_preprocessor_flags, @@ -498,6 +506,11 @@ "serial", ], [ + "data_block_hash_index_test", + "table/data_block_hash_index_test.cc", + "serial", + ], + [ "date_tiered_test", "utilities/date_tiered/date_tiered_test.cc", "serial", @@ -923,6 +936,11 @@ "serial", ], [ + "repeatable_thread_test", + "util/repeatable_thread_test.cc", + "serial", + ], + [ "sim_cache_test", "utilities/simulator_cache/sim_cache_test.cc", "serial", @@ -983,6 +1001,11 @@ "serial", ], [ + "trace_analyzer_test", + "tools/trace_analyzer_test.cc", + "serial", + ], + [ "transaction_test", "utilities/transactions/transaction_test.cc", "parallel", @@ -1064,20 +1087,19 @@ ttype = "gtest" if test_cfg[2] == "parallel" else "simple" test_bin = test_name + "_bin" - cpp_binary ( - name = test_bin, - srcs = [test_cc], - deps = [":rocksdb_test_lib"], - preprocessor_flags = rocksdb_preprocessor_flags, - arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, - compiler_flags = rocksdb_compiler_flags, - external_deps = rocksdb_external_deps, + cpp_binary( + name = test_bin, + srcs = [test_cc], + arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, + compiler_flags = rocksdb_compiler_flags, + preprocessor_flags = rocksdb_preprocessor_flags, + deps = [":rocksdb_test_lib"], + external_deps = rocksdb_external_deps, ) custom_unittest( - name = test_name, - type = ttype, - deps = [":" + test_bin], - command = [TEST_RUNNER, BUCK_BINS + test_bin] + name = test_name, + command = [TEST_RUNNER, BUCK_BINS + test_bin], + type = ttype, + deps = [":" + test_bin], ) - diff -Nru rocksdb-5.15.10/third-party/fbson/FbsonDocument.h rocksdb-5.17.2/third-party/fbson/FbsonDocument.h --- rocksdb-5.15.10/third-party/fbson/FbsonDocument.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/third-party/fbson/FbsonDocument.h 2018-11-12 19:57:32.000000000 +0000 @@ -55,8 +55,7 @@ * @author Tian Xia */ -#ifndef FBSON_FBSONDOCUMENT_H -#define FBSON_FBSONDOCUMENT_H +#pragma once #include #include @@ -889,5 +888,3 @@ #pragma pack(pop) } // namespace fbson - -#endif // FBSON_FBSONDOCUMENT_H diff -Nru rocksdb-5.15.10/third-party/fbson/FbsonJsonParser.h rocksdb-5.17.2/third-party/fbson/FbsonJsonParser.h --- rocksdb-5.15.10/third-party/fbson/FbsonJsonParser.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/third-party/fbson/FbsonJsonParser.h 2018-11-12 19:57:32.000000000 +0000 @@ -47,8 +47,7 @@ * @author Tian Xia */ -#ifndef FBSON_FBSONPARSER_H -#define FBSON_FBSONPARSER_H +#pragma once #include #include @@ -741,5 +740,3 @@ typedef FbsonJsonParserT FbsonJsonParser; } // namespace fbson - -#endif // FBSON_FBSONPARSER_H diff -Nru rocksdb-5.15.10/third-party/fbson/FbsonStream.h rocksdb-5.17.2/third-party/fbson/FbsonStream.h --- rocksdb-5.15.10/third-party/fbson/FbsonStream.h 2018-09-13 17:25:20.000000000 +0000 +++ 
rocksdb-5.17.2/third-party/fbson/FbsonStream.h 2018-11-12 19:57:32.000000000 +0000 @@ -18,8 +18,7 @@ * @author Tian Xia */ -#ifndef FBSON_FBSONSTREAM_H -#define FBSON_FBSONSTREAM_H +#pragma once #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS @@ -178,5 +177,3 @@ }; } // namespace fbson - -#endif // FBSON_FBSONSTREAM_H diff -Nru rocksdb-5.15.10/third-party/fbson/FbsonUtil.h rocksdb-5.17.2/third-party/fbson/FbsonUtil.h --- rocksdb-5.15.10/third-party/fbson/FbsonUtil.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/third-party/fbson/FbsonUtil.h 2018-11-12 19:57:32.000000000 +0000 @@ -9,8 +9,7 @@ * @author Tian Xia */ -#ifndef FBSON_FBSONUTIL_H -#define FBSON_FBSONUTIL_H +#pragma once #include #include "FbsonDocument.h" @@ -159,5 +158,3 @@ }; } // namespace fbson - -#endif // FBSON_FBSONUTIL_H diff -Nru rocksdb-5.15.10/third-party/fbson/FbsonWriter.h rocksdb-5.17.2/third-party/fbson/FbsonWriter.h --- rocksdb-5.15.10/third-party/fbson/FbsonWriter.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/third-party/fbson/FbsonWriter.h 2018-11-12 19:57:32.000000000 +0000 @@ -25,8 +25,7 @@ * @author Tian Xia */ -#ifndef FBSON_FBSONWRITER_H -#define FBSON_FBSONWRITER_H +#pragma once #include #include "FbsonDocument.h" @@ -433,5 +432,3 @@ typedef FbsonWriterT FbsonWriter; } // namespace fbson - -#endif // FBSON_FBSONWRITER_H diff -Nru rocksdb-5.15.10/tools/advisor/advisor/bench_runner.py rocksdb-5.17.2/tools/advisor/advisor/bench_runner.py --- rocksdb-5.15.10/tools/advisor/advisor/bench_runner.py 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/tools/advisor/advisor/bench_runner.py 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,39 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +from abc import ABC, abstractmethod +import re + + +class BenchmarkRunner(ABC): + @staticmethod + @abstractmethod + def is_metric_better(new_metric, old_metric): + pass + + @abstractmethod + def run_experiment(self): + # should return a list of DataSource objects + pass + + @staticmethod + def get_info_log_file_name(log_dir, db_path): + # Example: DB Path = /dev/shm and OPTIONS file has option + # db_log_dir=/tmp/rocks/, then the name of the log file will be + # 'dev_shm_LOG' and its location will be /tmp/rocks. If db_log_dir is + # not specified in the OPTIONS file, then the location of the log file + # will be /dev/shm and the name of the file will be 'LOG' + file_name = '' + if log_dir: + # refer GetInfoLogPrefix() in rocksdb/util/filename.cc + # example db_path: /dev/shm/dbbench + file_name = db_path[1:] # to ignore the leading '/' character + to_be_replaced = re.compile('[^0-9a-zA-Z\-_\.]') + for character in to_be_replaced.findall(db_path): + file_name = file_name.replace(character, '_') + if not file_name.endswith('_'): + file_name += '_' + file_name += 'LOG' + return file_name diff -Nru rocksdb-5.15.10/tools/advisor/advisor/config_optimizer_example.py rocksdb-5.17.2/tools/advisor/advisor/config_optimizer_example.py --- rocksdb-5.15.10/tools/advisor/advisor/config_optimizer_example.py 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/tools/advisor/advisor/config_optimizer_example.py 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,134 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +import argparse +from advisor.db_config_optimizer import ConfigOptimizer +from advisor.db_log_parser import NO_COL_FAMILY +from advisor.db_options_parser import DatabaseOptions +from advisor.rule_parser import RulesSpec + + +CONFIG_OPT_NUM_ITER = 10 + + +def main(args): + # initialise the RulesSpec parser + rule_spec_parser = RulesSpec(args.rules_spec) + # initialise the benchmark runner + bench_runner_module = __import__( + args.benchrunner_module, fromlist=[args.benchrunner_class] + ) + bench_runner_class = getattr(bench_runner_module, args.benchrunner_class) + ods_args = {} + if args.ods_client and args.ods_entity: + ods_args['client_script'] = args.ods_client + ods_args['entity'] = args.ods_entity + if args.ods_key_prefix: + ods_args['key_prefix'] = args.ods_key_prefix + db_bench_runner = bench_runner_class(args.benchrunner_pos_args, ods_args) + # initialise the database configuration + db_options = DatabaseOptions(args.rocksdb_options, args.misc_options) + # set the frequency at which stats are dumped in the LOG file and the + # location of the LOG file. + db_log_dump_settings = { + "DBOptions.stats_dump_period_sec": { + NO_COL_FAMILY: args.stats_dump_period_sec + } + } + db_options.update_options(db_log_dump_settings) + # initialise the configuration optimizer + config_optimizer = ConfigOptimizer( + db_bench_runner, + db_options, + rule_spec_parser, + args.base_db_path + ) + # run the optimiser to improve the database configuration for given + # benchmarks, with the help of expert-specified rules + final_db_options = config_optimizer.run() + # generate the final rocksdb options file + print( + 'Final configuration in: ' + + final_db_options.generate_options_config('final') + ) + print( + 'Final miscellaneous options: ' + + repr(final_db_options.get_misc_options()) + ) + + +if __name__ == '__main__': + ''' + An example run of this tool from the command-line would look like: + python3 -m advisor.config_optimizer_example + --base_db_path=/tmp/rocksdbtest-155919/dbbench + --rocksdb_options=temp/OPTIONS_boot.tmp --misc_options bloom_bits=2 + --rules_spec=advisor/rules.ini --stats_dump_period_sec=20 + --benchrunner_module=advisor.db_bench_runner + --benchrunner_class=DBBenchRunner --benchrunner_pos_args ./../../db_bench + readwhilewriting use_existing_db=true duration=90 + ''' + parser = argparse.ArgumentParser(description='This script is used for\ + searching for a better database configuration') + parser.add_argument( + '--rocksdb_options', required=True, type=str, + help='path of the starting Rocksdb OPTIONS file' + ) + # these are options that are column-family agnostic and are not yet + # supported by the Rocksdb Options file: eg. bloom_bits=2 + parser.add_argument( + '--misc_options', nargs='*', + help='whitespace-separated list of options that are not supported ' + + 'by the Rocksdb OPTIONS file, given in the ' + + '= format eg. 
"bloom_bits=2 ' + + 'rate_limiter_bytes_per_sec=128000000"') + parser.add_argument( + '--base_db_path', required=True, type=str, + help='path for the Rocksdb database' + ) + parser.add_argument( + '--rules_spec', required=True, type=str, + help='path of the file containing the expert-specified Rules' + ) + parser.add_argument( + '--stats_dump_period_sec', required=True, type=int, + help='the frequency (in seconds) at which STATISTICS are printed to ' + + 'the Rocksdb LOG file' + ) + # ODS arguments + parser.add_argument( + '--ods_client', type=str, help='the ODS client binary' + ) + parser.add_argument( + '--ods_entity', type=str, + help='the servers for which the ODS stats need to be fetched' + ) + parser.add_argument( + '--ods_key_prefix', type=str, + help='the prefix that needs to be attached to the keys of time ' + + 'series to be fetched from ODS' + ) + # benchrunner_module example: advisor.db_benchmark_client + parser.add_argument( + '--benchrunner_module', required=True, type=str, + help='the module containing the BenchmarkRunner class to be used by ' + + 'the Optimizer, example: advisor.db_bench_runner' + ) + # benchrunner_class example: DBBenchRunner + parser.add_argument( + '--benchrunner_class', required=True, type=str, + help='the name of the BenchmarkRunner class to be used by the ' + + 'Optimizer, should be present in the module provided in the ' + + 'benchrunner_module argument, example: DBBenchRunner' + ) + parser.add_argument( + '--benchrunner_pos_args', nargs='*', + help='whitespace-separated positional arguments that are passed on ' + + 'to the constructor of the BenchmarkRunner class provided in the ' + + 'benchrunner_class argument, example: "use_existing_db=true ' + + 'duration=900"' + ) + args = parser.parse_args() + main(args) diff -Nru rocksdb-5.15.10/tools/advisor/advisor/db_bench_runner.py rocksdb-5.17.2/tools/advisor/advisor/db_bench_runner.py --- rocksdb-5.15.10/tools/advisor/advisor/db_bench_runner.py 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/tools/advisor/advisor/db_bench_runner.py 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,245 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +from advisor.bench_runner import BenchmarkRunner +from advisor.db_log_parser import DataSource, DatabaseLogs, NO_COL_FAMILY +from advisor.db_stats_fetcher import ( + LogStatsParser, OdsStatsFetcher, DatabasePerfContext +) +import shutil +import subprocess +import time + + +''' +NOTE: This is not thread-safe, because the output file is simply overwritten. 
+''' + + +class DBBenchRunner(BenchmarkRunner): + OUTPUT_FILE = "temp/dbbench_out.tmp" + ERROR_FILE = "temp/dbbench_err.tmp" + DB_PATH = "DB path" + THROUGHPUT = "ops/sec" + PERF_CON = " PERF_CONTEXT:" + + @staticmethod + def is_metric_better(new_metric, old_metric): + # for db_bench 'throughput' is the metric returned by run_experiment + return new_metric >= old_metric + + @staticmethod + def get_opt_args_str(misc_options_dict): + # given a dictionary of options and their values, return a string + # that can be appended as command-line arguments + optional_args_str = "" + for option_name, option_value in misc_options_dict.items(): + if option_value: + optional_args_str += ( + " --" + option_name + "=" + str(option_value) + ) + return optional_args_str + + def __init__(self, positional_args, ods_args=None): + # parse positional_args list appropriately + self.db_bench_binary = positional_args[0] + self.benchmark = positional_args[1] + self.db_bench_args = None + if len(positional_args) > 2: + # options list with each option given as "