diff -Nru rocksdb-5.15.10/buckifier/buckify_rocksdb.py rocksdb-5.17.2/buckifier/buckify_rocksdb.py --- rocksdb-5.15.10/buckifier/buckify_rocksdb.py 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/buckifier/buckify_rocksdb.py 2018-11-12 19:57:32.000000000 +0000 @@ -109,12 +109,14 @@ "rocksdb_test_lib", src_mk.get("MOCK_LIB_SOURCES", []) + src_mk.get("TEST_LIB_SOURCES", []) + - src_mk.get("EXP_LIB_SOURCES", []), + src_mk.get("EXP_LIB_SOURCES", []) + + src_mk.get("ANALYZER_LIB_SOURCES", []), [":rocksdb_lib"]) # rocksdb_tools_lib TARGETS.add_library( "rocksdb_tools_lib", src_mk.get("BENCH_LIB_SOURCES", []) + + src_mk.get("ANALYZER_LIB_SOURCES", []) + ["util/testutil.cc"], [":rocksdb_lib"]) diff -Nru rocksdb-5.15.10/buckifier/targets_builder.py rocksdb-5.17.2/buckifier/targets_builder.py --- rocksdb-5.15.10/buckifier/targets_builder.py 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/buckifier/targets_builder.py 2018-11-12 19:57:32.000000000 +0000 @@ -10,7 +10,7 @@ if len(lst) == 1: return "\"%s\"" % lst[0] - + separator = "\",\n%s\"" % (" " * indent) res = separator.join(sorted(lst)) res = "\n" + (" " * indent) + "\"" + res + "\",\n" + (" " * (indent - 4)) @@ -31,13 +31,16 @@ self.targets_file.close() def add_library(self, name, srcs, deps=None, headers=None): + headers_attr_prefix = "" if headers is None: + headers_attr_prefix = "auto_" headers = "AutoHeaders.RECURSIVE_GLOB" - self.targets_file.write(targets_cfg.library_template % ( - name, - pretty_list(srcs), - headers, - pretty_list(deps))) + self.targets_file.write(targets_cfg.library_template.format( + name=name, + srcs=pretty_list(srcs), + headers_attr_prefix=headers_attr_prefix, + headers=headers, + deps=pretty_list(deps))) self.total_lib = self.total_lib + 1 def add_binary(self, name, srcs, deps=None): diff -Nru rocksdb-5.15.10/buckifier/targets_cfg.py rocksdb-5.17.2/buckifier/targets_cfg.py --- rocksdb-5.15.10/buckifier/targets_cfg.py 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/buckifier/targets_cfg.py 2018-11-12 19:57:32.000000000 +0000 @@ -2,7 +2,9 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals -rocksdb_target_header = """REPO_PATH = package_name() + "/" +rocksdb_target_header = """load("@fbcode_macros//build_defs:auto_headers.bzl", "AutoHeaders") + +REPO_PATH = package_name() + "/" BUCK_BINS = "buck-out/gen/" + REPO_PATH @@ -73,13 +75,13 @@ library_template = """ cpp_library( - name = "%s", - srcs = [%s], - headers = %s, + name = "{name}", + srcs = [{srcs}], + {headers_attr_prefix}headers = {headers}, arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, preprocessor_flags = rocksdb_preprocessor_flags, - deps = [%s], + deps = [{deps}], external_deps = rocksdb_external_deps, ) """ @@ -118,21 +120,20 @@ ttype = "gtest" if test_cfg[2] == "parallel" else "simple" test_bin = test_name + "_bin" - cpp_binary ( - name = test_bin, - srcs = [test_cc], - deps = [":rocksdb_test_lib"], - preprocessor_flags = rocksdb_preprocessor_flags, - arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, - compiler_flags = rocksdb_compiler_flags, - external_deps = rocksdb_external_deps, + cpp_binary( + name = test_bin, + srcs = [test_cc], + arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, + compiler_flags = rocksdb_compiler_flags, + preprocessor_flags = rocksdb_preprocessor_flags, + deps = [":rocksdb_test_lib"], + external_deps = rocksdb_external_deps, ) custom_unittest( - name = test_name, - type = ttype, - 
deps = [":" + test_bin], - command = [TEST_RUNNER, BUCK_BINS + test_bin] + name = test_name, + command = [TEST_RUNNER, BUCK_BINS + test_bin], + type = ttype, + deps = [":" + test_bin], ) - """ diff -Nru rocksdb-5.15.10/build_tools/dependencies.sh rocksdb-5.17.2/build_tools/dependencies.sh --- rocksdb-5.15.10/build_tools/dependencies.sh 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/build_tools/dependencies.sh 2018-11-12 19:57:32.000000000 +0000 @@ -1,19 +1,18 @@ -# shellcheck disable=SC2148 -GCC_BASE=/mnt/gvfs/third-party2/gcc/8219ec1bcedf8ad9da05e121e193364de2cc4f61/5.x/centos6-native/c447969 -CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/64d8d58e3d84f8bde7a029763d4f5baf39d0d5b9/stable/centos6-native/6aaf4de -LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/ba9be983c81de7299b59fe71950c664a84dcb5f8/5.x/gcc-5-glibc-2.23/339d858 -GLIBC_BASE=/mnt/gvfs/third-party2/glibc/f20197cf3d4bd50339c9777aaa0b2ccadad9e2cb/2.23/gcc-5-glibc-2.23/ca1d1c0 -SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/6427ce8c7496e4ab06c2da81543b94c0de8be3d0/1.1.3/gcc-5-glibc-2.23/9bc6787 -ZLIB_BASE=/mnt/gvfs/third-party2/zlib/8f1e8b867d26efef93eac2fabbdb2e1d512665d7/1.2.8/gcc-5-glibc-2.23/9bc6787 -BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/70471c0571559fe0af7db6d7e8860b93a7eadfe1/1.0.6/gcc-5-glibc-2.23/9bc6787 -LZ4_BASE=/mnt/gvfs/third-party2/lz4/453c89d6f0e68cdf1c151c769197fabedad9cac8/r131/gcc-5-glibc-2.23/9bc6787 -ZSTD_BASE=/mnt/gvfs/third-party2/zstd/00a40fa5f8bd2cd0622f2e868552793aef37ccf4/1.3.0/gcc-5-glibc-2.23/03859b5 -GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/47eef08f9acb77de982fbda6047c26d330739538/2.2.0/gcc-5-glibc-2.23/9bc6787 -JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/4414ddc78df8008b35cc4adac23590ad29148584/master/gcc-5-glibc-2.23/d506c82 -NUMA_BASE=/mnt/gvfs/third-party2/numa/9d7ae2693d05d62f9a579cb21e6b717cf257a75d/2.0.11/gcc-5-glibc-2.23/9bc6787 -LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/2b2dd58e3a52ccf2c1d827def59e5f740de0ad15/1.2/gcc-5-glibc-2.23/b443de1 -TBB_BASE=/mnt/gvfs/third-party2/tbb/379addf7ab2468a2b4293b47456cfcd1c9cb318d/4.3/gcc-5-glibc-2.23/9bc6787 -KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/3f68f5fe65a85b7c2d3e66852268fbd1efdb3151/4.0.9-36_fbk5_2933_gd092e3f/gcc-5-glibc-2.23/da39a3e -BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/b9fab0aec99d9c36408e810b2677e91c12807afd/2.28/centos6-native/da39a3e -VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/423431d61786b20bcc3bde8972901130cb29e6b3/3.11.0/gcc-5-glibc-2.23/9bc6787 -LUA_BASE=/mnt/gvfs/third-party2/lua/3b0bb3bd9a0f690a069c479fcc0f7424fc7456d2/5.2.3/gcc-5-glibc-2.23/65372bd +GCC_BASE=/mnt/gvfs/third-party2/gcc/112ec378fec7002ad3e09afde022e656049f7191/5.x/centos7-native/c447969 +CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/04999bdb3ce81a11073535dcb00b5e13dc1cbaf5/stable/centos7-native/c9f9104 +LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/92b0c8e5c8eecc71eb042594ce1ab3413799b385/5.x/gcc-5-glibc-2.23/339d858 +GLIBC_BASE=/mnt/gvfs/third-party2/glibc/3d8698d5973ba94f41620a80a67e4457fdf01e90/2.23/gcc-5-glibc-2.23/ca1d1c0 +SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/7f9bdaada18f59bc27ec2b0871eb8a6144343aef/1.1.3/gcc-5-glibc-2.23/9bc6787 +ZLIB_BASE=/mnt/gvfs/third-party2/zlib/22c2d65676fb7c23cfa797c4f6937f38b026f3cf/1.2.8/gcc-5-glibc-2.23/9bc6787 +BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/dc49a21c5fceec6456a7a28a94dcd16690af1337/1.0.6/gcc-5-glibc-2.23/9bc6787 +LZ4_BASE=/mnt/gvfs/third-party2/lz4/907b498203d297947f3bb70b9466f47e100f1873/r131/gcc-5-glibc-2.23/9bc6787 
+ZSTD_BASE=/mnt/gvfs/third-party2/zstd/af6628a46758f1a15484a1760cd7294164bc5ba1/1.3.5/gcc-5-glibc-2.23/03859b5 +GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/0b9929d2588991c65a57168bf88aff2db87c5d48/2.2.0/gcc-5-glibc-2.23/9bc6787 +JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/b1a0e56c1e3e6929813a4331ade3a58ff083afbb/master/gcc-5-glibc-2.23/aa64d6b +NUMA_BASE=/mnt/gvfs/third-party2/numa/9cbf2460284c669ed19c3ccb200a71f7dd7e53c7/2.0.11/gcc-5-glibc-2.23/9bc6787 +LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/bf3d7497fe4e6d007354f0adffa16ce3003f8338/1.3/gcc-5-glibc-2.23/b443de1 +TBB_BASE=/mnt/gvfs/third-party2/tbb/ff4e0b093534704d8abab678a4fd7f5ea7b094c7/2018_U5/gcc-5-glibc-2.23/9bc6787 +KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/b5c4a61a5c483ba24722005ae07895971a2ac707/4.0.9-36_fbk5_2933_gd092e3f/gcc-5-glibc-2.23/da39a3e +BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/55031de95a2b46c82948743419a603b3d6aefe28/2.29.1/centos7-native/da39a3e +VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/f3f697a28122e6bcd513273dd9c1ff23852fc59f/3.13.0/gcc-5-glibc-2.23/9bc6787 +LUA_BASE=/mnt/gvfs/third-party2/lua/f0cd714433206d5139df61659eb7b28b1dea6683/5.2.3/gcc-5-glibc-2.23/65372bd diff -Nru rocksdb-5.15.10/build_tools/error_filter.py rocksdb-5.17.2/build_tools/error_filter.py --- rocksdb-5.15.10/build_tools/error_filter.py 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/build_tools/error_filter.py 2018-11-12 19:57:32.000000000 +0000 @@ -64,8 +64,12 @@ class CompilerErrorParser(MatchErrorParser): def __init__(self): - # format: '::: error: ' - super(CompilerErrorParser, self).__init__(r'\S+:\d+:\d+: error:') + # format (compile error): + # '::: error: ' + # format (link error): + # ':: error: ' + # The below regex catches both + super(CompilerErrorParser, self).__init__(r'\S+:\d+: error:') class ScanBuildErrorParser(MatchErrorParser): diff -Nru rocksdb-5.15.10/build_tools/fbcode_config.sh rocksdb-5.17.2/build_tools/fbcode_config.sh --- rocksdb-5.15.10/build_tools/fbcode_config.sh 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/build_tools/fbcode_config.sh 2018-11-12 19:57:32.000000000 +0000 @@ -43,11 +43,15 @@ LZ4_INCLUDE=" -I $LZ4_BASE/include/" LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a" CFLAGS+=" -DLZ4" +fi - ZSTD_INCLUDE=" -I $ZSTD_BASE/include/" +ZSTD_INCLUDE=" -I $ZSTD_BASE/include/" +if test -z $PIC_BUILD; then ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a" - CFLAGS+=" -DZSTD" +else + ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd_pic.a" fi +CFLAGS+=" -DZSTD" # location of gflags headers and libraries GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/" diff -Nru rocksdb-5.15.10/build_tools/rocksdb-lego-determinator rocksdb-5.17.2/build_tools/rocksdb-lego-determinator --- rocksdb-5.15.10/build_tools/rocksdb-lego-determinator 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/build_tools/rocksdb-lego-determinator 2018-11-12 19:57:32.000000000 +0000 @@ -88,6 +88,8 @@ LITE="OPT=\"-DROCKSDB_LITE -g\"" TSAN="COMPILE_WITH_TSAN=1" UBSAN="COMPILE_WITH_UBSAN=1" +TSAN_CRASH='CRASH_TEST_EXT_ARGS="--compression_type=zstd --log2_keys_per_lock=22"' +NON_TSAN_CRASH="CRASH_TEST_EXT_ARGS=--compression_type=zstd" DISABLE_JEMALLOC="DISABLE_JEMALLOC=1" HTTP_PROXY="https_proxy=http://fwdproxy.29.prn1:8080 http_proxy=http://fwdproxy.29.prn1:8080 ftp_proxy=http://fwdproxy.29.prn1:8080" SETUP_JAVA_ENV="export $HTTP_PROXY; export JAVA_HOME=/usr/local/jdk-8u60-64/; export PATH=\$JAVA_HOME/bin:\$PATH" @@ -380,14 +382,14 @@ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG make J=1 db_stress || 
$CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER }, { 'name':'Build and run RocksDB debug crash tests', 'timeout': 86400, - 'shell':'$SHM $DEBUG make J=1 crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER } @@ -452,7 +454,7 @@ { 'name':'Build and run RocksDB debug asan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG make J=1 asan_crash_test || $CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test || $CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -494,7 +496,7 @@ { 'name':'Build and run RocksDB debug ubsan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -560,7 +562,7 @@ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN CRASH_TEST_KILL_ODD=1887 CRASH_TEST_EXT_ARGS=--log2_keys_per_lock=22 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', + 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, diff -Nru rocksdb-5.15.10/build_tools/update_dependencies.sh rocksdb-5.17.2/build_tools/update_dependencies.sh --- rocksdb-5.15.10/build_tools/update_dependencies.sh 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/build_tools/update_dependencies.sh 2018-11-12 19:57:32.000000000 +0000 @@ -65,8 +65,8 @@ echo "Writing dependencies to $OUTPUT" # Compilers locations -GCC_BASE=`readlink -f $TP2_LATEST/gcc/5.x/centos6-native/*/` -CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos6-native/*/` +GCC_BASE=`readlink -f $TP2_LATEST/gcc/5.x/centos7-native/*/` +CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos7-native/*/` log_variable GCC_BASE log_variable CLANG_BASE @@ -86,7 +86,7 @@ get_lib_base tbb LATEST gcc-5-glibc-2.23 get_lib_base kernel-headers 4.0.9-36_fbk5_2933_gd092e3f gcc-5-glibc-2.23 -get_lib_base binutils LATEST centos6-native +get_lib_base binutils LATEST centos7-native get_lib_base valgrind LATEST gcc-5-glibc-2.23 get_lib_base lua 5.2.3 gcc-5-glibc-2.23 diff -Nru rocksdb-5.15.10/CMakeLists.txt rocksdb-5.17.2/CMakeLists.txt --- rocksdb-5.15.10/CMakeLists.txt 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/CMakeLists.txt 2018-11-12 19:57:32.000000000 +0000 @@ -336,12 +336,18 @@ # Used to run CI build and tests so we can run faster option(OPTDBG "Build optimized debug build with MSVC" OFF) +option(WITH_RUNTIME_DEBUG "build with debug version of runtime library" ON) if(MSVC) if(OPTDBG) message(STATUS "Debug optimization is enabled") - set(CMAKE_CXX_FLAGS_DEBUG "/Oxt /${RUNTIME_LIBRARY}d") + set(CMAKE_CXX_FLAGS_DEBUG "/Oxt") else() - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1 /Gm /${RUNTIME_LIBRARY}d") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1 /Gm") + endif() + if(WITH_RUNTIME_DEBUG) + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /${RUNTIME_LIBRARY}d") + else() + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /${RUNTIME_LIBRARY}") 
endif() set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oxt /Zp8 /Gm- /Gy /${RUNTIME_LIBRARY}") @@ -549,6 +555,8 @@ table/cuckoo_table_builder.cc table/cuckoo_table_factory.cc table/cuckoo_table_reader.cc + table/data_block_hash_index.cc + table/data_block_footer.cc table/flush_block_policy.cc table/format.cc table/full_filter_block.cc @@ -572,6 +580,7 @@ tools/ldb_cmd.cc tools/ldb_tool.cc tools/sst_dump_tool.cc + tools/trace_analyzer_tool.cc util/arena.cc util/auto_roll_logger.cc util/bloom.cc @@ -596,19 +605,20 @@ util/slice.cc util/sst_file_manager_impl.cc util/status.cc - util/status_message.cc util/string_util.cc util/sync_point.cc util/sync_point_impl.cc util/testutil.cc util/thread_local.cc util/threadpool_imp.cc + util/trace_replay.cc util/transaction_test_util.cc util/xxhash.cc utilities/backupable/backupable_db.cc utilities/blob_db/blob_compaction_filter.cc utilities/blob_db/blob_db.cc utilities/blob_db/blob_db_impl.cc + utilities/blob_db/blob_db_impl_filesnapshot.cc utilities/blob_db/blob_dump_tool.cc utilities/blob_db/blob_file.cc utilities/blob_db/blob_log_reader.cc @@ -650,6 +660,7 @@ utilities/simulator_cache/sim_cache.cc utilities/spatialdb/spatial_db.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc + utilities/trace/file_trace_reader_writer.cc utilities/transactions/optimistic_transaction_db_impl.cc utilities/transactions/optimistic_transaction.cc utilities/transactions/pessimistic_transaction.cc @@ -913,12 +924,14 @@ table/cleanable_test.cc table/cuckoo_table_builder_test.cc table/cuckoo_table_reader_test.cc + table/data_block_hash_index_test.cc table/full_filter_block_test.cc table/merger_test.cc table/table_test.cc tools/ldb_cmd_test.cc tools/reduce_levels_test.cc tools/sst_dump_test.cc + tools/trace_analyzer_test.cc util/arena_test.cc util/auto_roll_logger_test.cc util/autovector_test.cc @@ -933,6 +946,7 @@ util/hash_test.cc util/heap_test.cc util/rate_limiter_test.cc + util/repeatable_thread_test.cc util/slice_transform_test.cc util/timer_queue_test.cc util/thread_list_test.cc @@ -975,6 +989,7 @@ set(BENCHMARKS cache/cache_bench.cc memtable/memtablerep_bench.cc + db/range_del_aggregator_bench.cc tools/db_bench.cc table/table_reader_bench.cc utilities/column_aware_encoding_exp.cc diff -Nru rocksdb-5.15.10/db/builder.cc rocksdb-5.17.2/db/builder.cc --- rocksdb-5.15.10/db/builder.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/builder.cc 2018-11-12 19:57:32.000000000 +0000 @@ -121,8 +121,8 @@ file->SetIOPriority(io_priority); file->SetWriteLifeTimeHint(write_hint); - file_writer.reset(new WritableFileWriter(std::move(file), env_options, - ioptions.statistics)); + file_writer.reset(new WritableFileWriter( + std::move(file), fname, env_options, ioptions.statistics)); builder = NewTableBuilder( ioptions, mutable_cf_options, internal_comparator, int_tbl_prop_collector_factories, column_family_id, diff -Nru rocksdb-5.15.10/db/builder.h rocksdb-5.17.2/db/builder.h --- rocksdb-5.15.10/db/builder.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/builder.h 2018-11-12 19:57:32.000000000 +0000 @@ -35,7 +35,6 @@ class TableBuilder; class WritableFileWriter; class InternalStats; -class InternalIterator; // @param column_family_name Name of the column family that is also identified // by column_family_id, or empty string if unknown. 
It must outlive the diff -Nru rocksdb-5.15.10/db/c.cc rocksdb-5.17.2/db/c.cc --- rocksdb-5.15.10/db/c.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/c.cc 2018-11-12 19:57:32.000000000 +0000 @@ -33,6 +33,7 @@ #include "rocksdb/utilities/backupable_db.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/memory_util.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" @@ -41,6 +42,10 @@ #include "rocksdb/perf_context.h" #include "utilities/merge_operators.h" +#include +#include +#include + using rocksdb::BytewiseComparator; using rocksdb::Cache; using rocksdb::ColumnFamilyDescriptor; @@ -108,8 +113,12 @@ using rocksdb::BatchResult; using rocksdb::PerfLevel; using rocksdb::PerfContext; +using rocksdb::MemoryUtil; using std::shared_ptr; +using std::vector; +using std::unordered_set; +using std::map; extern "C" { @@ -2402,7 +2411,7 @@ void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt, uint64_t v) { - opt->rep.writable_file_max_buffer_size = v; + opt->rep.writable_file_max_buffer_size = static_cast(v); } void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt, @@ -2433,6 +2442,20 @@ opt->rep.max_write_buffer_number_to_maintain = n; } +void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.enable_pipelined_write = v; +} + +void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt, + uint32_t n) { + opt->rep.max_subcompactions = n; +} + +void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) { + opt->rep.max_background_jobs = n; +} + void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) { opt->rep.max_background_compactions = n; } @@ -4087,6 +4110,98 @@ *vlen = v->rep.size(); return v->rep.data(); } + +// container to keep databases and caches in order to use rocksdb::MemoryUtil +struct rocksdb_memory_consumers_t { + std::vector dbs; + std::unordered_set caches; +}; + +// initializes new container of memory consumers +rocksdb_memory_consumers_t* rocksdb_memory_consumers_create() { + return new rocksdb_memory_consumers_t; +} + +// adds datatabase to the container of memory consumers +void rocksdb_memory_consumers_add_db(rocksdb_memory_consumers_t* consumers, + rocksdb_t* db) { + consumers->dbs.push_back(db); +} + +// adds cache to the container of memory consumers +void rocksdb_memory_consumers_add_cache(rocksdb_memory_consumers_t* consumers, + rocksdb_cache_t* cache) { + consumers->caches.insert(cache); +} + +// deletes container with memory consumers +void rocksdb_memory_consumers_destroy(rocksdb_memory_consumers_t* consumers) { + delete consumers; +} + +// contains memory usage statistics provided by rocksdb::MemoryUtil +struct rocksdb_memory_usage_t { + uint64_t mem_table_total; + uint64_t mem_table_unflushed; + uint64_t mem_table_readers_total; + uint64_t cache_total; +}; + +// estimates amount of memory occupied by consumers (dbs and caches) +rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create( + rocksdb_memory_consumers_t* consumers, char** errptr) { + + vector dbs; + for (auto db : consumers->dbs) { + dbs.push_back(db->rep); + } + + unordered_set cache_set; + for (auto cache : consumers->caches) { + cache_set.insert(const_cast(cache->rep.get())); + } + + std::map usage_by_type; + + auto status = MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set, + 
&usage_by_type); + if (SaveError(errptr, status)) { + return nullptr; + } + + auto result = new rocksdb_memory_usage_t; + result->mem_table_total = usage_by_type[MemoryUtil::kMemTableTotal]; + result->mem_table_unflushed = usage_by_type[MemoryUtil::kMemTableUnFlushed]; + result->mem_table_readers_total = usage_by_type[MemoryUtil::kTableReadersTotal]; + result->cache_total = usage_by_type[MemoryUtil::kCacheTotal]; + return result; +} + +uint64_t rocksdb_approximate_memory_usage_get_mem_table_total( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->mem_table_total; +} + +uint64_t rocksdb_approximate_memory_usage_get_mem_table_unflushed( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->mem_table_unflushed; +} + +uint64_t rocksdb_approximate_memory_usage_get_mem_table_readers_total( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->mem_table_readers_total; +} + +uint64_t rocksdb_approximate_memory_usage_get_cache_total( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->cache_total; +} + +// deletes container with memory usage estimates +void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) { + delete usage; +} + } // end extern "C" #endif // !ROCKSDB_LITE diff -Nru rocksdb-5.15.10/db/compacted_db_impl.cc rocksdb-5.17.2/db/compacted_db_impl.cc --- rocksdb-5.15.10/db/compacted_db_impl.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compacted_db_impl.cc 2018-11-12 19:57:32.000000000 +0000 @@ -25,22 +25,12 @@ } size_t CompactedDBImpl::FindFile(const Slice& key) { - size_t left = 0; size_t right = files_.num_files - 1; - while (left < right) { - size_t mid = (left + right) >> 1; - const FdWithKeyRange& f = files_.files[mid]; - if (user_comparator_->Compare(ExtractUserKey(f.largest_key), key) < 0) { - // Key at "mid.largest" is < "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. - right = mid; - } - } - return right; + auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { + return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0; + }; + return static_cast(std::lower_bound(files_.files, + files_.files + right, key, cmp) - files_.files); } Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, diff -Nru rocksdb-5.15.10/db/compact_files_test.cc rocksdb-5.17.2/db/compact_files_test.cc --- rocksdb-5.15.10/db/compact_files_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compact_files_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -308,6 +308,55 @@ delete db; } +TEST_F(CompactFilesTest, SentinelCompressionType) { + if (!Zlib_Supported()) { + fprintf(stderr, "zlib compression not supported, skip this test\n"); + return; + } + if (!Snappy_Supported()) { + fprintf(stderr, "snappy compression not supported, skip this test\n"); + return; + } + // Check that passing `CompressionType::kDisableCompressionOption` to + // `CompactFiles` causes it to use the column family compression options. 
+ for (auto compaction_style : + {CompactionStyle::kCompactionStyleLevel, + CompactionStyle::kCompactionStyleUniversal, + CompactionStyle::kCompactionStyleNone}) { + DestroyDB(db_name_, Options()); + Options options; + options.compaction_style = compaction_style; + // L0: Snappy, L1: ZSTD, L2: Snappy + options.compression_per_level = {CompressionType::kSnappyCompression, + CompressionType::kZlibCompression, + CompressionType::kSnappyCompression}; + options.create_if_missing = true; + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + DB* db = nullptr; + ASSERT_OK(DB::Open(options, db_name_, &db)); + + db->Put(WriteOptions(), "key", "val"); + db->Flush(FlushOptions()); + + auto l0_files = collector->GetFlushedFiles(); + ASSERT_EQ(1, l0_files.size()); + + // L0->L1 compaction, so output should be ZSTD-compressed + CompactionOptions compaction_opts; + compaction_opts.compression = CompressionType::kDisableCompressionOption; + ASSERT_OK(db->CompactFiles(compaction_opts, l0_files, 1)); + + rocksdb::TablePropertiesCollection all_tables_props; + ASSERT_OK(db->GetPropertiesOfAllTables(&all_tables_props)); + for (const auto& name_and_table_props : all_tables_props) { + ASSERT_EQ(CompressionTypeToString(CompressionType::kZlibCompression), + name_and_table_props.second->compression_name); + } + delete db; + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff -Nru rocksdb-5.15.10/db/compaction_iterator.cc rocksdb-5.17.2/db/compaction_iterator.cc --- rocksdb-5.15.10/db/compaction_iterator.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_iterator.cc 2018-11-12 19:57:32.000000000 +0000 @@ -18,8 +18,8 @@ SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, - RangeDelAggregator* range_del_agg, - const Compaction* compaction, const CompactionFilter* compaction_filter, + RangeDelAggregator* range_del_agg, const Compaction* compaction, + const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum) : CompactionIterator( @@ -77,6 +77,12 @@ earliest_snapshot_ = snapshots_->at(0); latest_snapshot_ = snapshots_->back(); } +#ifndef NDEBUG + // findEarliestVisibleSnapshot assumes this ordering. 
+ for (size_t i = 1; i < snapshots_->size(); ++i) { + assert(snapshots_->at(i - 1) <= snapshots_->at(i)); + } +#endif if (compaction_filter_ != nullptr) { if (compaction_filter_->IgnoreSnapshots()) { ignore_snapshots_ = true; @@ -505,6 +511,31 @@ ++iter_stats_.num_optimized_del_drop_obsolete; } input_->Next(); + } else if ((ikey_.type == kTypeDeletion) && bottommost_level_ && + ikeyNotNeededForIncrementalSnapshot()) { + // Handle the case where we have a delete key at the bottom most level + // We can skip outputting the key iff there are no subsequent puts for this + // key + ParsedInternalKey next_ikey; + input_->Next(); + // Skip over all versions of this key that happen to occur in the same snapshot + // range as the delete + while (input_->Valid() && + ParseInternalKey(input_->key(), &next_ikey) && + cmp_->Equal(ikey_.user_key, next_ikey.user_key) && + (prev_snapshot == 0 || next_ikey.sequence > prev_snapshot || + (snapshot_checker_ != nullptr && + UNLIKELY(!snapshot_checker_->IsInSnapshot(next_ikey.sequence, + prev_snapshot))))) { + input_->Next(); + } + // If you find you still need to output a row with this key, we need to output the + // delete too + if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && + cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { + valid_ = true; + at_next_ = true; + } } else if (ikey_.type == kTypeMerge) { if (!merge_helper_->HasOperator()) { status_ = Status::InvalidArgument( @@ -603,18 +634,23 @@ inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( SequenceNumber in, SequenceNumber* prev_snapshot) { assert(snapshots_->size()); - SequenceNumber prev = kMaxSequenceNumber; - for (const auto cur : *snapshots_) { - assert(prev == kMaxSequenceNumber || prev <= cur); - if (cur >= in && (snapshot_checker_ == nullptr || - snapshot_checker_->IsInSnapshot(in, cur))) { - *prev_snapshot = prev == kMaxSequenceNumber ? 
0 : prev; + auto snapshots_iter = std::lower_bound( + snapshots_->begin(), snapshots_->end(), in); + if (snapshots_iter == snapshots_->begin()) { + *prev_snapshot = 0; + } else { + *prev_snapshot = *std::prev(snapshots_iter); + assert(*prev_snapshot < in); + } + for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) { + auto cur = *snapshots_iter; + assert(in <= cur); + if (snapshot_checker_ == nullptr || + snapshot_checker_->IsInSnapshot(in, cur)) { return cur; } - prev = cur; - assert(prev < kMaxSequenceNumber); + *prev_snapshot = cur; } - *prev_snapshot = prev; return kMaxSequenceNumber; } diff -Nru rocksdb-5.15.10/db/compaction_iterator_test.cc rocksdb-5.17.2/db/compaction_iterator_test.cc --- rocksdb-5.15.10/db/compaction_iterator_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_iterator_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -247,9 +247,8 @@ c_iter_.reset(new CompactionIterator( iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_, earliest_write_conflict_snapshot, snapshot_checker_.get(), - Env::Default(), false /* report_detailed_time */, - false, range_del_agg_.get(), std::move(compaction), filter, - &shutting_down_)); + Env::Default(), false /* report_detailed_time */, false, + range_del_agg_.get(), std::move(compaction), filter, &shutting_down_)); } void AddSnapshot(SequenceNumber snapshot, @@ -672,8 +671,12 @@ TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) { AddSnapshot(1); RunTest({test::KeyStr("a", 1, kTypeDeletion), - test::KeyStr("b", 2, kTypeDeletion)}, - {"", ""}, {test::KeyStr("b", 2, kTypeDeletion)}, {""}, + test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 1, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 0, kTypeValue)}, + {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -842,12 +845,25 @@ {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion), test::KeyStr("c", 3, kTypeDeletion)}, {"", "", ""}, - {test::KeyStr("b", 2, kTypeDeletion), - test::KeyStr("c", 3, kTypeDeletion)}, + {}, {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveDeletionIfValuePresentToEarlierSnapshot) { + AddSnapshot(2,1); + RunTest( + {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue), + test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 0, kTypeValue), + test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, kMaxSequenceNumber /*last_commited_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} TEST_F(CompactionIteratorWithSnapshotCheckerTest, NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) { diff -Nru rocksdb-5.15.10/db/compaction_job.cc rocksdb-5.17.2/db/compaction_job.cc --- rocksdb-5.15.10/db/compaction_job.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_job.cc 2018-11-12 19:57:32.000000000 +0000 @@ -593,13 +593,11 @@ thread.join(); } - if (output_directory_) { - output_directory_->Fsync(); - } - compaction_stats_.micros = env_->NowMicros() - start_micros; MeasureTime(stats_, COMPACTION_TIME, compaction_stats_.micros); + TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify"); + // Check if any thread encountered an error during execution Status status; for (const 
auto& state : compact_->sub_compact_states) { @@ -609,6 +607,10 @@ } } + if (status.ok() && output_directory_) { + status = output_directory_->Fsync(); + } + if (status.ok()) { thread_pool.clear(); std::vector files_meta; @@ -1307,9 +1309,7 @@ // VersionEdit. assert(!sub_compact->outputs.empty()); sub_compact->outputs.pop_back(); - sub_compact->builder.reset(); - sub_compact->current_output_file_size = 0; - return s; + meta = nullptr; } if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) { @@ -1463,8 +1463,9 @@ writable_file->SetWriteLifeTimeHint(write_hint_); writable_file->SetPreallocationBlockSize(static_cast( sub_compact->compaction->OutputFilePreallocationSize())); - sub_compact->outfile.reset(new WritableFileWriter( - std::move(writable_file), env_options_, db_options_.statistics.get())); + sub_compact->outfile.reset( + new WritableFileWriter(std::move(writable_file), fname, env_options_, + db_options_.statistics.get())); // If the Column family flag is to only optimize filters for hits, // we can skip creating filters if this is the bottommost_level where diff -Nru rocksdb-5.15.10/db/compaction_job_test.cc rocksdb-5.17.2/db/compaction_job_test.cc --- rocksdb-5.15.10/db/compaction_job_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_job_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -79,7 +79,7 @@ shutting_down_(false), preserve_deletes_seqnum_(0), mock_table_factory_(new mock::MockTableFactory()), - error_handler_(db_options_, &mutex_) { + error_handler_(nullptr, db_options_, &mutex_) { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); db_options_.db_paths.emplace_back(dbname_, std::numeric_limits::max()); @@ -205,7 +205,7 @@ manifest, &file, env_->OptimizeForManifestWrite(env_options_)); ASSERT_OK(s); unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options_)); + new WritableFileWriter(std::move(file), manifest, env_options_)); { log::Writer log(std::move(file_writer), 0, false); std::string record; diff -Nru rocksdb-5.15.10/db/compaction_picker.cc rocksdb-5.17.2/db/compaction_picker.cc --- rocksdb-5.15.10/db/compaction_picker.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_picker.cc 2018-11-12 19:57:32.000000000 +0000 @@ -49,7 +49,7 @@ // increasing. size_t new_compact_bytes_per_del_file = 0; for (span_len = 1; span_len < level_files.size(); ++span_len) { - compact_bytes += level_files[span_len]->fd.file_size; + compact_bytes += static_cast(level_files[span_len]->fd.file_size); new_compact_bytes_per_del_file = compact_bytes / span_len; if (level_files[span_len]->being_compacted || new_compact_bytes_per_del_file > compact_bytes_per_del_file) { @@ -219,7 +219,8 @@ bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/, VersionStorageInfo* vstorage, - CompactionInputFiles* inputs) { + CompactionInputFiles* inputs, + InternalKey** next_smallest) { // This isn't good compaction assert(!inputs->empty()); @@ -242,7 +243,8 @@ GetRange(*inputs, &smallest, &largest); inputs->clear(); vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files, - hint_index, &hint_index); + hint_index, &hint_index, true, + next_smallest); } while (inputs->size() > old_size); // we started off with inputs non-empty and the previous loop only grew @@ -315,13 +317,29 @@ // shouldn't have been released since. 
assert(!FilesRangeOverlapWithCompaction(input_files, output_level)); - auto c = - new Compaction(vstorage, ioptions_, mutable_cf_options, input_files, - output_level, compact_options.output_file_size_limit, - mutable_cf_options.max_compaction_bytes, output_path_id, - compact_options.compression, ioptions_.compression_opts, - compact_options.max_subcompactions, - /* grandparents */ {}, true); + CompressionType compression_type; + if (compact_options.compression == kDisableCompressionOption) { + int base_level; + if (ioptions_.compaction_style == kCompactionStyleLevel) { + base_level = vstorage->base_level(); + } else { + base_level = 1; + } + compression_type = + GetCompressionType(ioptions_, vstorage, mutable_cf_options, + output_level, base_level); + } else { + // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType` + // without configurable `CompressionOptions`, which is inconsistent. + compression_type = compact_options.compression; + } + auto c = new Compaction( + vstorage, ioptions_, mutable_cf_options, input_files, output_level, + compact_options.output_file_size_limit, + mutable_cf_options.max_compaction_bytes, output_path_id, compression_type, + GetCompressionOptions(ioptions_, vstorage, output_level), + compact_options.max_subcompactions, + /* grandparents */ {}, true); RegisterCompaction(c); return c; } @@ -633,7 +651,6 @@ uint64_t s = inputs[i]->compensated_file_size; total += s; if (total >= limit) { - **compaction_end = inputs[i + 1]->smallest; covering_the_whole_range = false; inputs.files.resize(i + 1); break; @@ -642,7 +659,10 @@ } assert(output_path_id < static_cast(ioptions_.cf_paths.size())); - if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs) == false) { + InternalKey key_storage; + InternalKey* next_smallest = &key_storage; + if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) == + false) { // manual compaction is now multi-threaded, so it can // happen that ExpandWhileOverlapping fails // we handle it higher in RunManualCompaction @@ -650,8 +670,10 @@ return nullptr; } - if (covering_the_whole_range) { + if (covering_the_whole_range || !next_smallest) { *compaction_end = nullptr; + } else { + **compaction_end = *next_smallest; } CompactionInputFiles output_level_inputs; diff -Nru rocksdb-5.15.10/db/compaction_picker.h rocksdb-5.17.2/db/compaction_picker.h --- rocksdb-5.15.10/db/compaction_picker.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_picker.h 2018-11-12 19:57:32.000000000 +0000 @@ -151,7 +151,8 @@ // Will return false if it is impossible to apply this compaction. 
bool ExpandInputsToCleanCut(const std::string& cf_name, VersionStorageInfo* vstorage, - CompactionInputFiles* inputs); + CompactionInputFiles* inputs, + InternalKey** next_smallest = nullptr); // Returns true if any one of the parent files are being compacted bool IsRangeInCompaction(VersionStorageInfo* vstorage, diff -Nru rocksdb-5.15.10/db/compaction_picker_test.cc rocksdb-5.17.2/db/compaction_picker_test.cc --- rocksdb-5.15.10/db/compaction_picker_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_picker_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -90,8 +90,8 @@ f->fd = FileDescriptor(file_number, path_id, file_size); f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); f->largest = InternalKey(largest, largest_seq, kTypeValue); - f->smallest_seqno = smallest_seq; - f->largest_seqno = largest_seq; + f->fd.smallest_seqno = smallest_seq; + f->fd.largest_seqno = largest_seq; f->compensated_file_size = file_size; f->refs = 0; vstorage_->AddFile(level, f); diff -Nru rocksdb-5.15.10/db/compaction_picker_universal.cc rocksdb-5.17.2/db/compaction_picker_universal.cc --- rocksdb-5.15.10/db/compaction_picker_universal.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/compaction_picker_universal.cc 2018-11-12 19:57:32.000000000 +0000 @@ -97,17 +97,17 @@ SequenceNumber* largest_seqno) { bool is_first = true; for (FileMetaData* f : files) { - assert(f->smallest_seqno <= f->largest_seqno); + assert(f->fd.smallest_seqno <= f->fd.largest_seqno); if (is_first) { is_first = false; - *smallest_seqno = f->smallest_seqno; - *largest_seqno = f->largest_seqno; + *smallest_seqno = f->fd.smallest_seqno; + *largest_seqno = f->fd.largest_seqno; } else { - if (f->smallest_seqno < *smallest_seqno) { - *smallest_seqno = f->smallest_seqno; + if (f->fd.smallest_seqno < *smallest_seqno) { + *smallest_seqno = f->fd.smallest_seqno; } - if (f->largest_seqno > *largest_seqno) { - *largest_seqno = f->largest_seqno; + if (f->fd.largest_seqno > *largest_seqno) { + *largest_seqno = f->fd.largest_seqno; } } } @@ -365,11 +365,11 @@ size_t level_index = 0U; if (c->start_level() == 0) { for (auto f : *c->inputs(0)) { - assert(f->smallest_seqno <= f->largest_seqno); + assert(f->fd.smallest_seqno <= f->fd.largest_seqno); if (is_first) { is_first = false; } - prev_smallest_seqno = f->smallest_seqno; + prev_smallest_seqno = f->fd.smallest_seqno; } level_index = 1U; } diff -Nru rocksdb-5.15.10/db/c_test.c rocksdb-5.17.2/db/c_test.c --- rocksdb-5.15.10/db/c_test.c 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/c_test.c 2018-11-12 19:57:32.000000000 +0000 @@ -19,11 +19,8 @@ // Can not use port/port.h macros as this is a c file #ifdef OS_WIN - #include -#define snprintf _snprintf - // Ok for uniqueness int geteuid() { int result = 0; @@ -34,6 +31,11 @@ return result; } +// VS < 2015 +#if defined(_MSC_VER) && (_MSC_VER < 1900) +#define snprintf _snprintf +#endif + #endif const char* phase = ""; @@ -47,12 +49,19 @@ fprintf(stderr, "=== Test %s\n", name); phase = name; } +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning (disable: 4996) // getenv security warning +#endif static const char* GetTempDir(void) { const char* ret = getenv("TEST_TMPDIR"); if (ret == NULL || ret[0] == '\0') ret = "/tmp"; return ret; } +#ifdef _MSC_VER +#pragma warning(pop) +#endif #define CheckNoError(err) \ if ((err) != NULL) { \ @@ -643,7 +652,7 @@ rocksdb_sstfilewriter_t* writer = rocksdb_sstfilewriter_create(env_opt, io_options); - unlink(sstfilename); + remove(sstfilename); 
rocksdb_sstfilewriter_open(writer, sstfilename, &err); CheckNoError(err); rocksdb_sstfilewriter_put(writer, "sstk1", 5, "v1", 2, &err); @@ -664,7 +673,7 @@ CheckGet(db, roptions, "sstk2", "v2"); CheckGet(db, roptions, "sstk3", "v3"); - unlink(sstfilename); + remove(sstfilename); rocksdb_sstfilewriter_open(writer, sstfilename, &err); CheckNoError(err); rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v4", 2, &err); @@ -1334,6 +1343,47 @@ rocksdb_destroy_db(options, dbname, &err); } + // Check memory usage stats + StartPhase("approximate_memory_usage"); + { + // Create database + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_memory_consumers_t* consumers; + consumers = rocksdb_memory_consumers_create(); + rocksdb_memory_consumers_add_db(consumers, db); + rocksdb_memory_consumers_add_cache(consumers, cache); + + // take memory usage report before write-read operation + rocksdb_memory_usage_t* mu1; + mu1 = rocksdb_approximate_memory_usage_create(consumers, &err); + CheckNoError(err); + + // Put data (this should affect memtables) + rocksdb_put(db, woptions, "memory", 6, "test", 4, &err); + CheckNoError(err); + CheckGet(db, roptions, "memory", "test"); + + // take memory usage report after write-read operation + rocksdb_memory_usage_t* mu2; + mu2 = rocksdb_approximate_memory_usage_create(consumers, &err); + CheckNoError(err); + + // amount of memory used within memtables should grow + CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_total(mu2) >= + rocksdb_approximate_memory_usage_get_mem_table_total(mu1)); + CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu2) >= + rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu1)); + + rocksdb_memory_consumers_destroy(consumers); + rocksdb_approximate_memory_usage_destroy(mu1); + rocksdb_approximate_memory_usage_destroy(mu2); + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + } + StartPhase("cuckoo_options"); { rocksdb_cuckoo_table_options_t* cuckoo_options; @@ -1675,7 +1725,7 @@ db = rocksdb_open(options, dbname, &err); CheckNoError(err); } - + StartPhase("cleanup"); rocksdb_close(db); rocksdb_options_destroy(options); diff -Nru rocksdb-5.15.10/db/db_bloom_filter_test.cc rocksdb-5.17.2/db/db_bloom_filter_test.cc --- rocksdb-5.15.10/db/db_bloom_filter_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_bloom_filter_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -22,11 +22,12 @@ class DBBloomFilterTestWithParam : public DBTestBase, - public testing::WithParamInterface> { + public testing::WithParamInterface> { // public testing::WithParamInterface { protected: bool use_block_based_filter_; bool partition_filters_; + uint32_t format_version_; public: DBBloomFilterTestWithParam() : DBTestBase("/db_bloom_filter_tests") {} @@ -36,9 +37,12 @@ void SetUp() override { use_block_based_filter_ = std::get<0>(GetParam()); partition_filters_ = std::get<1>(GetParam()); + format_version_ = std::get<2>(GetParam()); } }; +class DBBloomFilterTestDefFormatVersion : public DBBloomFilterTestWithParam {}; + class SliceTransformLimitedDomainGeneric : public SliceTransform { const char* Name() const override { return "SliceTransformLimitedDomainGeneric"; @@ -62,7 +66,7 @@ // KeyMayExist can lead to a few false positives, but not false negatives. 
// To make test deterministic, use a much larger number of bits per key-20 than // bits in the key, so that false positives are eliminated -TEST_P(DBBloomFilterTestWithParam, KeyMayExist) { +TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { do { ReadOptions ropts; std::string value; @@ -401,6 +405,11 @@ table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } + table_options.format_version = format_version_; + if (format_version_ >= 4) { + // value delta encoding challenged more with index interval > 1 + table_options.index_block_restart_interval = 8; + } table_options.metadata_block_size = 32; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -456,10 +465,26 @@ } while (ChangeCompactOptions()); } -INSTANTIATE_TEST_CASE_P(DBBloomFilterTestWithParam, DBBloomFilterTestWithParam, - ::testing::Values(std::make_tuple(true, false), - std::make_tuple(false, true), - std::make_tuple(false, false))); +INSTANTIATE_TEST_CASE_P( + FormatDef, DBBloomFilterTestDefFormatVersion, + ::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion), + std::make_tuple(false, true, test::kDefaultFormatVersion), + std::make_tuple(false, false, + test::kDefaultFormatVersion))); + +INSTANTIATE_TEST_CASE_P( + FormatDef, DBBloomFilterTestWithParam, + ::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion), + std::make_tuple(false, true, test::kDefaultFormatVersion), + std::make_tuple(false, false, + test::kDefaultFormatVersion))); + +INSTANTIATE_TEST_CASE_P( + FormatLatest, DBBloomFilterTestWithParam, + ::testing::Values(std::make_tuple(true, false, test::kLatestFormatVersion), + std::make_tuple(false, true, test::kLatestFormatVersion), + std::make_tuple(false, false, + test::kLatestFormatVersion))); TEST_F(DBBloomFilterTest, BloomFilterRate) { while (ChangeFilterOptions()) { diff -Nru rocksdb-5.15.10/db/db_compaction_test.cc rocksdb-5.17.2/db/db_compaction_test.cc --- rocksdb-5.15.10/db/db_compaction_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_compaction_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -120,13 +120,12 @@ public: SstStatsCollector() : num_ssts_creation_started_(0) {} - void OnTableFileCreationStarted(const TableFileCreationBriefInfo& /* info */) override { + void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& /* info */) override { ++num_ssts_creation_started_; } - int num_ssts_creation_started() { - return num_ssts_creation_started_; - } + int num_ssts_creation_started() { return num_ssts_creation_started_; } private: std::atomic num_ssts_creation_started_; @@ -2478,6 +2477,7 @@ // Compaction range overlaps files Compact(1, "p1", "p9", 1); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(1)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); @@ -2493,6 +2493,7 @@ // Compact just the new range Compact(1, "b", "f", 1); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,2", FilesPerLevel(1)); ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); @@ -2509,6 +2510,7 @@ compact_options.target_path_id = 1; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(1)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); @@ -3501,12 +3503,13 @@ // ensure 
the auto compaction doesn't finish until manual compaction has // had a chance to be delayed. rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CompactRange:StallWait", "CompactionJob::Run():End"}}); + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "CompactionJob::Run():End"}}); } else { // ensure the auto-compaction doesn't finish until manual compaction has // continued without delay. rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CompactRange:StallWaitDone", "CompactionJob::Run():End"}}); + {{"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}}); } rocksdb::SyncPoint::GetInstance()->EnableProcessing(); @@ -3554,12 +3557,13 @@ // ensure the flush doesn't finish until manual compaction has had a // chance to be delayed. rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CompactRange:StallWait", "FlushJob::WriteLevel0Table"}}); + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "FlushJob::WriteLevel0Table"}}); } else { // ensure the flush doesn't finish until manual compaction has continued // without delay. rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CompactRange:StallWaitDone", + {{"DBImpl::FlushMemTable:StallWaitDone", "FlushJob::WriteLevel0Table"}}); } rocksdb::SyncPoint::GetInstance()->EnableProcessing(); @@ -3569,6 +3573,7 @@ ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024))); FlushOptions flush_opts; flush_opts.wait = false; + flush_opts.allow_write_stall = true; dbfull()->Flush(flush_opts); } @@ -3604,7 +3609,7 @@ // The auto-compaction waits until the manual compaction finishes to ensure // the signal comes from closing CF/DB, not from compaction making progress. rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CompactRange:StallWait", + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"}, {"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual", "CompactionJob::Run():End"}}); @@ -3655,18 +3660,21 @@ // began. So it unblocks CompactRange and precludes its flush. Throughout the // test, stall conditions are upheld via high L0 file count. 
rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CompactRange:StallWait", + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"}, {"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush", - "DBImpl::CompactRange:StallWaitDone"}, - {"DBImpl::CompactRange:StallWaitDone", "CompactionJob::Run():End"}}); + "DBImpl::FlushMemTable:StallWaitDone"}, + {"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}}); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + //used for the delayable flushes + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; for (int i = 0; i < kNumL0FilesLimit - 1; ++i) { for (int j = 0; j < 2; ++j) { ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); } - Flush(); + dbfull()->Flush(flush_opts); } auto manual_compaction_thread = port::Thread([this]() { CompactRangeOptions cro; @@ -3676,7 +3684,7 @@ TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"); Put(ToString(0), RandomString(&rnd, 1024)); - Flush(); + dbfull()->Flush(flush_opts); Put(ToString(0), RandomString(&rnd, 1024)); TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush"); manual_compaction_thread.join(); @@ -3953,6 +3961,50 @@ CompactionPri::kOldestSmallestSeqFirst, CompactionPri::kMinOverlappingRatio)); +class NoopMergeOperator : public MergeOperator { + public: + NoopMergeOperator() {} + + virtual bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* merge_out) const override { + std::string val("bar"); + merge_out->new_value = val; + return true; + } + + virtual const char* Name() const override { return "Noop"; } +}; + +TEST_F(DBCompactionTest, PartialManualCompaction) { + Options opts = CurrentOptions(); + opts.num_levels = 3; + opts.level0_file_num_compaction_trigger = 10; + opts.compression = kNoCompression; + opts.merge_operator.reset(new NoopMergeOperator()); + opts.target_file_size_base = 10240; + DestroyAndReopen(opts); + + Random rnd(301); + for (auto i = 0; i < 8; ++i) { + for (auto j = 0; j < 10; ++j) { + Merge("foo", RandomString(&rnd, 1024)); + } + Flush(); + } + + MoveFilesToLevel(2); + + std::string prop; + EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop)); + uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2; + ASSERT_OK(dbfull()->SetOptions( + {{"max_compaction_bytes", std::to_string(max_compaction_bytes)}})); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + dbfull()->CompactRange(cro, nullptr, nullptr); +} + #endif // !defined(ROCKSDB_LITE) } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/db_flush_test.cc rocksdb-5.17.2/db/db_flush_test.cc --- rocksdb-5.15.10/db/db_flush_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_flush_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -35,6 +35,7 @@ Reopen(options); FlushOptions no_wait; no_wait.wait = false; + no_wait.allow_write_stall=true; SyncPoint::GetInstance()->LoadDependency( {{"VersionSet::LogAndApply:WriteManifest", @@ -55,6 +56,9 @@ #endif // ROCKSDB_LITE } +#ifndef TRAVIS +// Disable this test temporarily on Travis as it fails intermittently. 
+// Github issue: #4151 TEST_F(DBFlushTest, SyncFail) { std::unique_ptr fault_injection_env( new FaultInjectionTestEnv(env_)); @@ -92,6 +96,7 @@ ASSERT_EQ(refs_before, cfd->current()->TEST_refs()); Destroy(options); } +#endif // TRAVIS TEST_F(DBFlushTest, FlushInLowPriThreadPool) { // Verify setting an empty high-pri (flush) thread pool causes flushes to be diff -Nru rocksdb-5.15.10/db/dbformat.cc rocksdb-5.17.2/db/dbformat.cc --- rocksdb-5.15.10/db/dbformat.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/dbformat.cc 2018-11-12 19:57:32.000000000 +0000 @@ -48,6 +48,8 @@ return kEntryMerge; case kTypeRangeDeletion: return kEntryRangeDeletion; + case kTypeBlobIndex: + return kEntryBlobIndex; default: return kEntryOther; } diff -Nru rocksdb-5.15.10/db/db_impl.cc rocksdb-5.17.2/db/db_impl.cc --- rocksdb-5.15.10/db/db_impl.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl.cc 2018-11-12 19:57:32.000000000 +0000 @@ -215,9 +215,11 @@ // requires a custom gc for compaction, we use that to set use_custom_gc_ // as well. use_custom_gc_(seq_per_batch), + shutdown_initiated_(false), + own_sfm_(options.sst_file_manager == nullptr), preserve_deletes_(options.preserve_deletes), closed_(false), - error_handler_(immutable_db_options_, &mutex_) { + error_handler_(this, immutable_db_options_, &mutex_) { // !batch_per_trx_ implies seq_per_batch_ because it is only unset for // WriteUnprepared, which should use seq_per_batch_. assert(batch_per_txn_ || seq_per_batch_); @@ -259,16 +261,62 @@ return Status::OK(); } - Status s = error_handler_.GetBGError(); - if (s.severity() > Status::Severity::kHardError) { + if (error_handler_.IsRecoveryInProgress()) { + // Don't allow a mix of manual and automatic recovery + return Status::Busy(); + } + + mutex_.Unlock(); + Status s = error_handler_.RecoverFromBGError(true); + mutex_.Lock(); + return s; +} + +// This function implements the guts of recovery from a background error. It +// is eventually called for both manual as well as automatic recovery. It does +// the following - +// 1. Wait for currently scheduled background flush/compaction to exit, in +// order to inadvertently causing an error and thinking recovery failed +// 2. Flush memtables if there's any data for all the CFs. This may result +// another error, which will be saved by error_handler_ and reported later +// as the recovery status +// 3. Find and delete any obsolete files +// 4. Schedule compactions if needed for all the CFs. This is needed as the +// flush in the prior step might have been a no-op for some CFs, which +// means a new super version wouldn't have been installed +Status DBImpl::ResumeImpl() { + mutex_.AssertHeld(); + WaitForBackgroundWork(); + + Status bg_error = error_handler_.GetBGError(); + Status s; + if (shutdown_initiated_) { + // Returning shutdown status to SFM during auto recovery will cause it + // to abort the recovery and allow the shutdown to progress + s = Status::ShutdownInProgress(); + } + if (s.ok() && bg_error.severity() > Status::Severity::kHardError) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB resume requested but failed due to Fatal/Unrecoverable error"); - return s; + s = bg_error; + } + + // We cannot guarantee consistency of the WAL. 
So force flush Memtables of + // all the column families + if (s.ok()) { + s = FlushAllCFs(FlushReason::kErrorRecovery); + if (!s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "DB resume requested but failed due to Flush failure [%s]", + s.ToString().c_str()); + } } JobContext job_context(0); FindObsoleteFiles(&job_context, true); - error_handler_.ClearBGError(); + if (s.ok()) { + s = error_handler_.ClearBGError(); + } mutex_.Unlock(); job_context.manifest_file_number = 1; @@ -277,13 +325,36 @@ } job_context.Clean(); - ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); + if (s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); + } mutex_.Lock(); - MaybeScheduleFlushOrCompaction(); + // Check for shutdown again before scheduling further compactions, + // since we released and re-acquired the lock above + if (shutdown_initiated_) { + s = Status::ShutdownInProgress(); + } + if (s.ok()) { + for (auto cfd : *versions_->GetColumnFamilySet()) { + SchedulePendingCompaction(cfd); + } + MaybeScheduleFlushOrCompaction(); + } + + // Wake up any waiters - in this case, it could be the shutdown thread + bg_cv_.SignalAll(); // No need to check BGError again. If something happened, event listener would be // notified and the operation causing it would have failed - return Status::OK(); + return s; +} + +void DBImpl::WaitForBackgroundWork() { + // Wait for background work to finish + while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || + bg_flush_scheduled_) { + bg_cv_.Wait(); + } } // Will lock the mutex_, will wait for completion if wait is true @@ -313,14 +384,20 @@ if (!wait) { return; } - // Wait for background work to finish - while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || - bg_flush_scheduled_) { - bg_cv_.Wait(); - } + WaitForBackgroundWork(); } Status DBImpl::CloseHelper() { + // Guarantee that there is no background error recovery in progress before + // continuing with the shutdown + mutex_.Lock(); + shutdown_initiated_ = true; + error_handler_.CancelErrorRecovery(); + while (error_handler_.IsRecoveryInProgress()) { + bg_cv_.Wait(); + } + mutex_.Unlock(); + // CancelAllBackgroundWork called with false means we just set the shutdown // marker. After this we do a variant of the waiting and unschedule work // (to consider: moving all the waiting into CancelAllBackgroundWork(true)) @@ -338,7 +415,8 @@ // Wait for background work to finish while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || bg_flush_scheduled_ || bg_purge_scheduled_ || - pending_purge_obsolete_files_) { + pending_purge_obsolete_files_ || + error_handler_.IsRecoveryInProgress()) { TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob"); bg_cv_.Wait(); } @@ -348,9 +426,12 @@ flush_scheduler_.Clear(); while (!flush_queue_.empty()) { - auto cfd = PopFirstFromFlushQueue(); - if (cfd->Unref()) { - delete cfd; + const FlushRequest& flush_req = PopFirstFromFlushQueue(); + for (const auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + if (cfd->Unref()) { + delete cfd; + } } } while (!compaction_queue_.empty()) { @@ -440,6 +521,17 @@ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete"); LogFlush(immutable_db_options_.info_log); +#ifndef ROCKSDB_LITE + // If the sst_file_manager was allocated by us during DB::Open(), ccall + // Close() on it before closing the info_log. 
Otherwise, background thread + // in SstFileManagerImpl might try to log something + if (immutable_db_options_.sst_file_manager && own_sfm_) { + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + sfm->Close(); + } +#endif // ROCKSDB_LITE + if (immutable_db_options_.info_log && own_info_log_) { Status s = immutable_db_options_.info_log->Close(); if (ret.ok()) { @@ -1047,7 +1139,7 @@ } else { CleanupSuperVersion(super_version); } - return NewErrorInternalIterator(s, arena); + return NewErrorInternalIterator(s, arena); } ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { @@ -1071,6 +1163,15 @@ auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + } + } + // Acquire SuperVersion SuperVersion* sv = GetAndRefSuperVersion(cfd); @@ -1609,8 +1710,8 @@ result = NewDBIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, cfd->user_comparator(), iter, kMaxSequenceNumber, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - read_callback); + sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, + this, cfd); #endif } else { // Note: no need to consider the special case of @@ -1677,9 +1778,8 @@ ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, read_callback, - ((read_options.snapshot != nullptr) ? nullptr : this), cfd, allow_blob, - allow_refresh); + sv->version_number, read_callback, this, cfd, allow_blob, + ((read_options.snapshot != nullptr) ? 
false : allow_refresh)); InternalIterator* internal_iter = NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), @@ -1716,7 +1816,7 @@ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, cfd->user_comparator(), iter, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, - read_callback)); + read_callback, this, cfd)); } #endif } else { @@ -2227,9 +2327,9 @@ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_context, - *cfd->GetLatestMutableCFOptions(), - FlushReason::kDeleteFiles); + InstallSuperVersionAndScheduleWork( + cfd, &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions(), FlushReason::kDeleteFiles); } FindObsoleteFiles(&job_context, false); } // lock released here @@ -2312,9 +2412,9 @@ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_context, - *cfd->GetLatestMutableCFOptions(), - FlushReason::kDeleteFiles); + InstallSuperVersionAndScheduleWork( + cfd, &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions(), FlushReason::kDeleteFiles); } for (auto* deleted_file : deleted_files) { deleted_file->being_compacted = false; @@ -2402,7 +2502,7 @@ if (!s.ok()) { return s; } - char* buffer = reinterpret_cast(alloca(file_size)); + char* buffer = reinterpret_cast(alloca(static_cast(file_size))); Slice id; s = id_file_reader->Read(static_cast(file_size), &id, buffer); if (!s.ok()) { @@ -2879,6 +2979,10 @@ ColumnFamilyHandle* column_family, const std::vector& external_files, const IngestExternalFileOptions& ingestion_options) { + if (external_files.empty()) { + return Status::InvalidArgument("external_files is empty"); + } + Status status; auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); @@ -2896,6 +3000,9 @@ immutable_db_options_, env_options_, &snapshots_, ingestion_options); + SuperVersionContext dummy_sv_ctx(/* create_superversion */ true); + VersionEdit dummy_edit; + uint64_t next_file_number = 0; std::list::iterator pending_output_elem; { InstrumentedMutexLock l(&mutex_); @@ -2906,10 +3013,29 @@ // Make sure that bg cleanup won't delete the files that we are ingesting pending_output_elem = CaptureCurrentFileNumberInPendingOutputs(); + + // If a crash happens after a hard link is established, the Recover function may + // reuse the file number that has already been assigned to the internal file, + // and this will overwrite the external file. To protect the external + // file, we have to make sure the file number will never be reused.
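
For context, a minimal caller-side sketch of the ingestion API whose internals are reworked above; the helper name and file path are placeholders, not part of the patch.

#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status IngestOneFile(rocksdb::DB* db) {
  rocksdb::IngestExternalFileOptions ifo;
  ifo.move_files = false;  // copy (or hard-link) the file into the DB

  // Placeholder path; any SST produced by SstFileWriter would do here.
  std::vector<std::string> files = {"/path/to/file1.sst"};

  // With the check added above, an empty `files` vector now fails fast with
  // Status::InvalidArgument("external_files is empty") instead of starting
  // an ingestion job with nothing to ingest.
  return db->IngestExternalFile(files, ifo);
}
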
+ next_file_number = versions_->FetchAddFileNumber(external_files.size()); + auto cf_options = cfd->GetLatestMutableCFOptions(); + status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, + directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); + } + } + dummy_sv_ctx.Clean(); + if (!status.ok()) { + InstrumentedMutexLock l(&mutex_); + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + return status; } SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); - status = ingestion_job.Prepare(external_files, super_version); + status = + ingestion_job.Prepare(external_files, next_file_number, super_version); CleanupSuperVersion(super_version); if (!status.ok()) { InstrumentedMutexLock l(&mutex_); @@ -3060,7 +3186,6 @@ void DBImpl::NotifyOnExternalFileIngested( ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) { -#ifndef ROCKSDB_LITE if (immutable_db_options_.listeners.empty()) { return; } @@ -3076,8 +3201,6 @@ listener->OnExternalFileIngested(this, info); } } - -#endif } void DBImpl::WaitForIngestFile() { @@ -3087,5 +3210,43 @@ } } +Status DBImpl::StartTrace(const TraceOptions& /* options */, + std::unique_ptr&& trace_writer) { + InstrumentedMutexLock lock(&trace_mutex_); + tracer_.reset(new Tracer(env_, std::move(trace_writer))); + return Status::OK(); +} + +Status DBImpl::EndTrace() { + InstrumentedMutexLock lock(&trace_mutex_); + Status s = tracer_->Close(); + tracer_.reset(); + return s; +} + +Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) { + Status s; + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + s = tracer_->IteratorSeek(cf_id, key); + } + } + return s; +} + +Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, + const Slice& key) { + Status s; + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + s = tracer_->IteratorSeekForPrev(cf_id, key); + } + } + return s; +} + #endif // ROCKSDB_LITE + } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/db_impl_compaction_flush.cc rocksdb-5.17.2/db/db_impl_compaction_flush.cc --- rocksdb-5.15.10/db/db_impl_compaction_flush.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl_compaction_flush.cc 2018-11-12 19:57:32.000000000 +0000 @@ -26,7 +26,7 @@ namespace rocksdb { bool DBImpl::EnoughRoomForCompaction( - const std::vector& inputs, + ColumnFamilyData* cfd, const std::vector& inputs, bool* sfm_reserved_compact_space, LogBuffer* log_buffer) { // Check if we have enough room to do the compaction bool enough_room = true; @@ -34,12 +34,17 @@ auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); if (sfm) { - enough_room = sfm->EnoughRoomForCompaction(inputs); + // Pass the current bg_error_ to SFM so it can decide what checks to + // perform. 
If this DB instance hasn't seen any error yet, the SFM can be + // optimistic and not do disk space checks + enough_room = + sfm->EnoughRoomForCompaction(cfd, inputs, error_handler_.GetBGError()); if (enough_room) { *sfm_reserved_compact_space = true; } } #else + (void)cfd; (void)inputs; (void)sfm_reserved_compact_space; #endif // ROCKSDB_LITE @@ -104,7 +109,8 @@ Status DBImpl::FlushMemTableToOutputFile( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - bool* made_progress, JobContext* job_context, LogBuffer* log_buffer) { + bool* made_progress, JobContext* job_context, + SuperVersionContext* superversion_context, LogBuffer* log_buffer) { mutex_.AssertHeld(); assert(cfd->imm()->NumNotFlushed() != 0); assert(cfd->imm()->IsFlushPending()); @@ -160,7 +166,7 @@ } if (s.ok()) { - InstallSuperVersionAndScheduleWork(cfd, &job_context->superversion_context, + InstallSuperVersionAndScheduleWork(cfd, superversion_context, mutable_cf_options); if (made_progress) { *made_progress = 1; @@ -200,6 +206,25 @@ return s; } +Status DBImpl::FlushMemTablesToOutputFiles( + const autovector& bg_flush_args, bool* made_progress, + JobContext* job_context, LogBuffer* log_buffer) { + Status s; + for (auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + const MutableCFOptions& mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + SuperVersionContext* superversion_context = arg.superversion_context_; + s = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress, + job_context, superversion_context, + log_buffer); + if (!s.ok()) { + break; + } + } + return s; +} + void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, int job_id, TableProperties prop) { @@ -230,8 +255,8 @@ info.job_id = job_id; info.triggered_writes_slowdown = triggered_writes_slowdown; info.triggered_writes_stop = triggered_writes_stop; - info.smallest_seqno = file_meta->smallest_seqno; - info.largest_seqno = file_meta->largest_seqno; + info.smallest_seqno = file_meta->fd.smallest_seqno; + info.largest_seqno = file_meta->fd.largest_seqno; info.table_properties = prop; info.flush_reason = cfd->GetFlushReason(); for (auto listener : immutable_db_options_.listeners) { @@ -281,8 +306,8 @@ info.job_id = job_id; info.triggered_writes_slowdown = triggered_writes_slowdown; info.triggered_writes_stop = triggered_writes_stop; - info.smallest_seqno = file_meta->smallest_seqno; - info.largest_seqno = file_meta->largest_seqno; + info.smallest_seqno = file_meta->fd.smallest_seqno; + info.largest_seqno = file_meta->fd.largest_seqno; info.table_properties = prop; info.flush_reason = cfd->GetFlushReason(); for (auto listener : immutable_db_options_.listeners) { @@ -324,60 +349,12 @@ CleanupSuperVersion(super_version); } - if (!options.allow_write_stall && flush_needed) { - InstrumentedMutexLock l(&mutex_); - uint64_t orig_active_memtable_id = cfd->mem()->GetID(); - WriteStallCondition write_stall_condition = WriteStallCondition::kNormal; - do { - if (write_stall_condition != WriteStallCondition::kNormal) { - TEST_SYNC_POINT("DBImpl::CompactRange:StallWait"); - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "[%s] CompactRange waiting on stall conditions to clear", - cfd->GetName().c_str()); - bg_cv_.Wait(); - } - if (cfd->IsDropped() || shutting_down_.load(std::memory_order_acquire)) { - return Status::ShutdownInProgress(); - } - - uint64_t earliest_memtable_id = - std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID()); - if 
(earliest_memtable_id > orig_active_memtable_id) { - // We waited so long that the memtable we were originally waiting on was - // flushed. - flush_needed = false; - break; - } - - const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions(); - const auto* vstorage = cfd->current()->storage_info(); - - // Skip stalling check if we're below auto-flush and auto-compaction - // triggers. If it stalled in these conditions, that'd mean the stall - // triggers are so low that stalling is needed for any background work. In - // that case we shouldn't wait since background work won't be scheduled. - if (cfd->imm()->NumNotFlushed() < - cfd->ioptions()->min_write_buffer_number_to_merge && - vstorage->l0_delay_trigger_count() < - mutable_cf_options.level0_file_num_compaction_trigger) { - break; - } - - // check whether one extra immutable memtable or an extra L0 file would - // cause write stalling mode to be entered. It could still enter stall - // mode due to pending compaction bytes, but that's less common - write_stall_condition = - ColumnFamilyData::GetWriteStallConditionAndCause( - cfd->imm()->NumNotFlushed() + 1, - vstorage->l0_delay_trigger_count() + 1, - vstorage->estimated_compaction_needed_bytes(), mutable_cf_options) - .first; - } while (write_stall_condition != WriteStallCondition::kNormal); - } - TEST_SYNC_POINT("DBImpl::CompactRange:StallWaitDone"); Status s; if (flush_needed) { - s = FlushMemTable(cfd, FlushOptions(), FlushReason::kManualCompaction); + FlushOptions fo; + fo.allow_write_stall = options.allow_write_stall; + s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction, + false /* writes_stopped*/); if (!s.ok()) { LogFlush(immutable_db_options_.info_log); return s; @@ -612,7 +589,7 @@ bool sfm_reserved_compact_space = false; // First check if we have enough room to do the compaction bool enough_room = EnoughRoomForCompaction( - input_files, &sfm_reserved_compact_space, log_buffer); + cfd, input_files, &sfm_reserved_compact_space, log_buffer); if (!enough_room) { // m's vars will get set properly at the end of this function, @@ -691,7 +668,7 @@ Status status = compaction_job.Install(*c->mutable_cf_options()); if (status.ok()) { InstallSuperVersionAndScheduleWork( - c->column_family_data(), &job_context->superversion_context, + c->column_family_data(), &job_context->superversion_contexts[0], *c->mutable_cf_options(), FlushReason::kManualCompaction); } c->ReleaseCompactionFiles(s); @@ -885,7 +862,7 @@ edit.DeleteFile(level, f->fd.GetNumber()); edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno, + f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, @@ -942,6 +919,68 @@ return s; } + +Status DBImpl::FlushAllCFs(FlushReason flush_reason) { + Status s; + WriteContext context; + WriteThread::Writer w; + + mutex_.AssertHeld(); + write_thread_.EnterUnbatched(&w, &mutex_); + + FlushRequest flush_req; + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->imm()->NumNotFlushed() == 0 && cfd->mem()->IsEmpty() && + cached_recoverable_state_empty_.load()) { + // Nothing to flush + continue; + } + + // SwitchMemtable() will release and reacquire mutex during execution + s = SwitchMemtable(cfd, &context); + if (!s.ok()) { + break; + } + + cfd->imm()->FlushRequested(); + + flush_req.emplace_back(cfd, cfd->imm()->GetLatestMemTableID()); + } + + // schedule flush + if (s.ok() && !flush_req.empty()) { + 
SchedulePendingFlush(flush_req, flush_reason); + MaybeScheduleFlushOrCompaction(); + } + + write_thread_.ExitUnbatched(&w); + + if (s.ok()) { + for (auto& flush : flush_req) { + auto cfd = flush.first; + auto flush_memtable_id = flush.second; + while (cfd->imm()->NumNotFlushed() > 0 && + cfd->imm()->GetEarliestMemTableID() <= flush_memtable_id) { + if (!error_handler_.GetRecoveryError().ok()) { + break; + } + if (cfd->IsDropped()) { + // FlushJob cannot flush a dropped CF, if we did not break here + // we will loop forever since cfd->imm()->NumNotFlushed() will never + // drop to zero + continue; + } + cfd->Ref(); + bg_cv_.Wait(); + cfd->Unref(); + } + } + } + + flush_req.clear(); + return s; +} + Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, int output_level, uint32_t output_path_id, uint32_t max_subcompactions, @@ -1077,63 +1116,164 @@ FlushReason flush_reason, bool writes_stopped) { Status s; uint64_t flush_memtable_id = 0; + if (!flush_options.allow_write_stall) { + bool flush_needed = true; + s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed); + TEST_SYNC_POINT("DBImpl::FlushMemTable:StallWaitDone"); + if (!s.ok() || !flush_needed) { + return s; + } + } + FlushRequest flush_req; { WriteContext context; InstrumentedMutexLock guard_lock(&mutex_); - if (cfd->imm()->NumNotFlushed() == 0 && cfd->mem()->IsEmpty() && - cached_recoverable_state_empty_.load()) { - // Nothing to flush - return Status::OK(); - } - WriteThread::Writer w; if (!writes_stopped) { write_thread_.EnterUnbatched(&w, &mutex_); } - // SwitchMemtable() will release and reacquire mutex during execution - s = SwitchMemtable(cfd, &context); - flush_memtable_id = cfd->imm()->GetLatestMemTableID(); + if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || + !cached_recoverable_state_empty_.load()) { + s = SwitchMemtable(cfd, &context); + flush_memtable_id = cfd->imm()->GetLatestMemTableID(); + flush_req.emplace_back(cfd, flush_memtable_id); + } + + if (s.ok() && !flush_req.empty()) { + for (auto& elem : flush_req) { + ColumnFamilyData* loop_cfd = elem.first; + loop_cfd->imm()->FlushRequested(); + } + SchedulePendingFlush(flush_req, flush_reason); + MaybeScheduleFlushOrCompaction(); + } if (!writes_stopped) { write_thread_.ExitUnbatched(&w); } - - cfd->imm()->FlushRequested(); - - // schedule flush - SchedulePendingFlush(cfd, flush_reason); - MaybeScheduleFlushOrCompaction(); } if (s.ok() && flush_options.wait) { - // Wait until the compaction completes - s = WaitForFlushMemTable(cfd, &flush_memtable_id); + autovector cfds; + autovector flush_memtable_ids; + for (auto& iter : flush_req) { + cfds.push_back(iter.first); + flush_memtable_ids.push_back(&(iter.second)); + } + s = WaitForFlushMemTables(cfds, flush_memtable_ids); } TEST_SYNC_POINT("FlushMemTableFinished"); return s; } -Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd, - const uint64_t* flush_memtable_id) { - Status s; +// Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can +// cause write stall, for example if one memtable is being flushed already. +// This method tries to avoid write stall (similar to CompactRange() behavior) +// it emulates how the SuperVersion / LSM would change if flush happens, checks +// it against various constrains and delays flush if it'd cause write stall. +// Called should check status and flush_needed to see if flush already happened. 
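
A small usage sketch of the FlushOptions::allow_write_stall flag that selects this path (the helper names are illustrative). Leaving the flag false routes DB::Flush() through the stall check documented above; setting it true skips the wait entirely, as the updated tests earlier in this patch do.

#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status FlushWithoutStalling(rocksdb::DB* db) {
  rocksdb::FlushOptions fo;
  fo.wait = true;
  // Default: false. FlushMemTable() then waits via
  // WaitUntilFlushWouldNotStallWrites() before switching memtables.
  fo.allow_write_stall = false;
  return db->Flush(fo);
}

rocksdb::Status FlushAllowingStall(rocksdb::DB* db) {
  rocksdb::FlushOptions fo;
  fo.wait = true;
  // Skips the stall check; the flush may contribute to a write stall.
  fo.allow_write_stall = true;
  return db->Flush(fo);
}
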
+Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, + bool* flush_needed) { + { + *flush_needed = true; + InstrumentedMutexLock l(&mutex_); + uint64_t orig_active_memtable_id = cfd->mem()->GetID(); + WriteStallCondition write_stall_condition = WriteStallCondition::kNormal; + do { + if (write_stall_condition != WriteStallCondition::kNormal) { + TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] WaitUntilFlushWouldNotStallWrites" + " waiting on stall conditions to clear", + cfd->GetName().c_str()); + bg_cv_.Wait(); + } + if (cfd->IsDropped() || shutting_down_.load(std::memory_order_acquire)) { + return Status::ShutdownInProgress(); + } + + uint64_t earliest_memtable_id = + std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID()); + if (earliest_memtable_id > orig_active_memtable_id) { + // We waited so long that the memtable we were originally waiting on was + // flushed. + *flush_needed = false; + return Status::OK(); + } + + const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + const auto* vstorage = cfd->current()->storage_info(); + + // Skip stalling check if we're below auto-flush and auto-compaction + // triggers. If it stalled in these conditions, that'd mean the stall + // triggers are so low that stalling is needed for any background work. In + // that case we shouldn't wait since background work won't be scheduled. + if (cfd->imm()->NumNotFlushed() < + cfd->ioptions()->min_write_buffer_number_to_merge && + vstorage->l0_delay_trigger_count() < + mutable_cf_options.level0_file_num_compaction_trigger) { + break; + } + + // check whether one extra immutable memtable or an extra L0 file would + // cause write stalling mode to be entered. It could still enter stall + // mode due to pending compaction bytes, but that's less common + write_stall_condition = + ColumnFamilyData::GetWriteStallConditionAndCause( + cfd->imm()->NumNotFlushed() + 1, + vstorage->l0_delay_trigger_count() + 1, + vstorage->estimated_compaction_needed_bytes(), mutable_cf_options) + .first; + } while (write_stall_condition != WriteStallCondition::kNormal); + } + return Status::OK(); +} + +// Wait for memtables to be flushed for multiple column families. +// let N = cfds.size() +// for i in [0, N), +// 1) if flush_memtable_ids[i] is not null, then the memtables with lower IDs +// have to be flushed for THIS column family; +// 2) if flush_memtable_ids[i] is null, then all memtables in THIS column +// family have to be flushed. +// Finish waiting when ALL column families finish flushing memtables. +Status DBImpl::WaitForFlushMemTables( + const autovector& cfds, + const autovector& flush_memtable_ids) { + int num = static_cast(cfds.size()); // Wait until the compaction completes InstrumentedMutexLock l(&mutex_); - while (cfd->imm()->NumNotFlushed() > 0 && !error_handler_.IsDBStopped() && - (flush_memtable_id == nullptr || - cfd->imm()->GetEarliestMemTableID() <= *flush_memtable_id)) { + while (!error_handler_.IsDBStopped()) { if (shutting_down_.load(std::memory_order_acquire)) { return Status::ShutdownInProgress(); } - if (cfd->IsDropped()) { - // FlushJob cannot flush a dropped CF, if we did not break here - // we will loop forever since cfd->imm()->NumNotFlushed() will never - // drop to zero + // Number of column families that have been dropped. + int num_dropped = 0; + // Number of column families that have finished flush. 
+ int num_finished = 0; + for (int i = 0; i < num; ++i) { + if (cfds[i]->IsDropped()) { + ++num_dropped; + } else if (cfds[i]->imm()->NumNotFlushed() == 0 || + (flush_memtable_ids[i] != nullptr && + cfds[i]->imm()->GetEarliestMemTableID() > + *flush_memtable_ids[i])) { + ++num_finished; + } + } + if (1 == num_dropped && 1 == num) { return Status::InvalidArgument("Cannot flush a dropped CF"); } + // Column families involved in this flush request have either been dropped + // or finished flush. Then it's time to finish waiting. + if (num_dropped + num_finished == num) { + break; + } bg_cv_.Wait(); } + Status s; if (error_handler_.IsDBStopped()) { s = error_handler_.GetBGError(); } @@ -1163,6 +1303,12 @@ if (bg_work_paused_ > 0) { // we paused the background work return; + } else if (error_handler_.IsBGWorkStopped() && + !error_handler_.IsRecoveryInProgress()) { + // There has been a hard error and this call is not part of the recovery + // sequence. Bail out here so we don't get into an endless loop of + // scheduling BG work which will again call this function + return; } else if (shutting_down_.load(std::memory_order_acquire)) { // DB is being deleted; no more background compactions return; @@ -1172,7 +1318,6 @@ env_->GetBackgroundThreads(Env::Priority::HIGH) == 0; while (!is_flush_pool_empty && unscheduled_flushes_ > 0 && bg_flush_scheduled_ < bg_job_limits.max_flushes) { - unscheduled_flushes_--; bg_flush_scheduled_++; env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this); } @@ -1183,7 +1328,6 @@ while (unscheduled_flushes_ > 0 && bg_flush_scheduled_ + bg_compaction_scheduled_ < bg_job_limits.max_flushes) { - unscheduled_flushes_--; bg_flush_scheduled_++; env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::LOW, this); } @@ -1192,6 +1336,12 @@ if (bg_compaction_paused_ > 0) { // we paused the background compaction return; + } else if (error_handler_.IsBGWorkStopped()) { + // Compaction is not part of the recovery sequence from a hard error. We + // might get here because recovery might do a flush and install a new + // super version, which will try to schedule pending compactions. Bail + // out here and let the higher level recovery handle compactions + return; } if (HasExclusiveManualCompaction()) { @@ -1260,30 +1410,28 @@ return cfd; } -void DBImpl::AddToFlushQueue(ColumnFamilyData* cfd, FlushReason flush_reason) { - assert(!cfd->queued_for_flush()); - cfd->Ref(); - flush_queue_.push_back(cfd); - cfd->set_queued_for_flush(true); - cfd->SetFlushReason(flush_reason); -} - -ColumnFamilyData* DBImpl::PopFirstFromFlushQueue() { +DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() { assert(!flush_queue_.empty()); - auto cfd = *flush_queue_.begin(); + FlushRequest flush_req = flush_queue_.front(); + assert(unscheduled_flushes_ >= static_cast(flush_req.size())); + unscheduled_flushes_ -= static_cast(flush_req.size()); flush_queue_.pop_front(); - assert(cfd->queued_for_flush()); - cfd->set_queued_for_flush(false); // TODO: need to unset flush reason? 
- return cfd; + return flush_req; } -void DBImpl::SchedulePendingFlush(ColumnFamilyData* cfd, +void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req, FlushReason flush_reason) { - if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) { - AddToFlushQueue(cfd, flush_reason); - ++unscheduled_flushes_; + if (flush_req.empty()) { + return; } + for (auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + cfd->Ref(); + cfd->SetFlushReason(flush_reason); + } + unscheduled_flushes_ += static_cast(flush_req.size()); + flush_queue_.push_back(flush_req); } void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { @@ -1351,15 +1499,18 @@ } Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, - LogBuffer* log_buffer) { + LogBuffer* log_buffer, FlushReason* reason) { mutex_.AssertHeld(); Status status; + *reason = FlushReason::kOthers; + // If BG work is stopped due to an error, but a recovery is in progress, + // that means this flush is part of the recovery. So allow it to go through if (!error_handler_.IsBGWorkStopped()) { if (shutting_down_.load(std::memory_order_acquire)) { status = Status::ShutdownInProgress(); } - } else { + } else if (!error_handler_.IsRecoveryInProgress()) { status = error_handler_.GetBGError(); } @@ -1367,40 +1518,58 @@ return status; } - ColumnFamilyData* cfd = nullptr; + autovector bg_flush_args; + std::vector& superversion_contexts = + job_context->superversion_contexts; while (!flush_queue_.empty()) { // This cfd is already referenced - auto first_cfd = PopFirstFromFlushQueue(); - - if (first_cfd->IsDropped() || !first_cfd->imm()->IsFlushPending()) { - // can't flush this CF, try next one - if (first_cfd->Unref()) { - delete first_cfd; + const FlushRequest& flush_req = PopFirstFromFlushQueue(); + superversion_contexts.clear(); + superversion_contexts.reserve(flush_req.size()); + + for (const auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) { + // can't flush this CF, try next one + if (cfd->Unref()) { + delete cfd; + } + continue; } - continue; + superversion_contexts.emplace_back(SuperVersionContext(true)); + bg_flush_args.emplace_back(cfd, iter.second, + &(superversion_contexts.back())); + } + if (!bg_flush_args.empty()) { + break; } - - // found a flush! 
- cfd = first_cfd; - break; } - if (cfd != nullptr) { - const MutableCFOptions mutable_cf_options = - *cfd->GetLatestMutableCFOptions(); + if (!bg_flush_args.empty()) { auto bg_job_limits = GetBGJobLimits(); - ROCKS_LOG_BUFFER( - log_buffer, - "Calling FlushMemTableToOutputFile with column " - "family [%s], flush slots available %d, compaction slots available %d, " - "flush slots scheduled %d, compaction slots scheduled %d", - cfd->GetName().c_str(), bg_job_limits.max_flushes, - bg_job_limits.max_compactions, bg_flush_scheduled_, - bg_compaction_scheduled_); - status = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress, - job_context, log_buffer); - if (cfd->Unref()) { - delete cfd; + for (const auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + ROCKS_LOG_BUFFER( + log_buffer, + "Calling FlushMemTableToOutputFile with column " + "family [%s], flush slots available %d, compaction slots available " + "%d, " + "flush slots scheduled %d, compaction slots scheduled %d", + cfd->GetName().c_str(), bg_job_limits.max_flushes, + bg_job_limits.max_compactions, bg_flush_scheduled_, + bg_compaction_scheduled_); + } + status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress, + job_context, log_buffer); + // All the CFDs in the FlushReq must have the same flush reason, so just + // grab the first one + *reason = bg_flush_args[0].cfd_->GetFlushReason(); + for (auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + if (cfd->Unref()) { + delete cfd; + arg.cfd_ = nullptr; + } } } return status; @@ -1421,9 +1590,12 @@ auto pending_outputs_inserted_elem = CaptureCurrentFileNumberInPendingOutputs(); + FlushReason reason; - Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer); - if (!s.ok() && !s.IsShutdownInProgress()) { + Status s = + BackgroundFlush(&made_progress, &job_context, &log_buffer, &reason); + if (!s.ok() && !s.IsShutdownInProgress() && + reason != FlushReason::kErrorRecovery) { // Wait a little bit before retrying background flush in // case this is an environmental problem and we do not want to // chew up resources for failed flushes for the duration of @@ -1613,6 +1785,11 @@ } } else { status = error_handler_.GetBGError(); + // If we get here, it means a hard error happened after this compaction + // was scheduled by MaybeScheduleFlushOrCompaction(), but before it got + // a chance to execute. 
Since we didn't pop a cfd from the compaction + // queue, increment unscheduled_compactions_ + unscheduled_compactions_++; } if (!status.ok()) { @@ -1648,7 +1825,7 @@ } else { // First check if we have enough room to do the compaction bool enough_room = EnoughRoomForCompaction( - *(c->inputs()), &sfm_reserved_compact_space, log_buffer); + m->cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer); if (!enough_room) { // Then don't do the compaction @@ -1711,7 +1888,7 @@ if (c != nullptr) { bool enough_room = EnoughRoomForCompaction( - *(c->inputs()), &sfm_reserved_compact_space, log_buffer); + cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer); if (!enough_room) { // Then don't do the compaction @@ -1775,7 +1952,7 @@ *c->mutable_cf_options(), c->edit(), &mutex_, directories_.GetDbDir()); InstallSuperVersionAndScheduleWork( - c->column_family_data(), &job_context->superversion_context, + c->column_family_data(), &job_context->superversion_contexts[0], *c->mutable_cf_options(), FlushReason::kAutoCompaction); ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n", c->column_family_data()->GetName().c_str(), @@ -1804,8 +1981,8 @@ c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, - f->largest, f->smallest_seqno, f->largest_seqno, - f->marked_for_compaction); + f->largest, f->fd.smallest_seqno, + f->fd.largest_seqno, f->marked_for_compaction); ROCKS_LOG_BUFFER( log_buffer, @@ -1822,7 +1999,7 @@ &mutex_, directories_.GetDbDir()); // Use latest MutableCFOptions InstallSuperVersionAndScheduleWork( - c->column_family_data(), &job_context->superversion_context, + c->column_family_data(), &job_context->superversion_contexts[0], *c->mutable_cf_options(), FlushReason::kAutoCompaction); VersionStorageInfo::LevelSummaryStorage tmp; @@ -1899,7 +2076,7 @@ status = compaction_job.Install(*c->mutable_cf_options()); if (status.ok()) { InstallSuperVersionAndScheduleWork( - c->column_family_data(), &job_context->superversion_context, + c->column_family_data(), &job_context->superversion_contexts[0], *c->mutable_cf_options(), FlushReason::kAutoCompaction); } *made_progress = true; @@ -1920,8 +2097,6 @@ NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); } - // this will unref its input_version and column_family_data - c.reset(); if (status.ok() || status.IsCompactionTooLarge()) { // Done @@ -1931,7 +2106,26 @@ ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", status.ToString().c_str()); error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) { + // Put this cfd back in the compaction queue so we can retry after some + // time + auto cfd = c->column_family_data(); + assert(cfd != nullptr); + // Since this compaction failed, we need to recompute the score so it + // takes the original input files into account + c->column_family_data() + ->current() + ->storage_info() + ->ComputeCompactionScore(*(c->immutable_cf_options()), + *(c->mutable_cf_options())); + if (!cfd->queued_for_compaction()) { + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + } + } } + // this will unref its input_version and column_family_data + c.reset(); if (is_manual) { ManualCompactionState* m = manual_compaction; @@ -2080,7 +2274,10 @@ void DBImpl::InstallSuperVersionAndScheduleWork( ColumnFamilyData* cfd, SuperVersionContext* sv_context, 
- const MutableCFOptions& mutable_cf_options, FlushReason flush_reason) { + const MutableCFOptions& mutable_cf_options, + FlushReason /* flush_reason */) { + // TODO(yanqin) investigate if 'flush_reason' can be removed since it's not + // used. mutex_.AssertHeld(); // Update max_total_in_memory_state_ @@ -2091,14 +2288,14 @@ old_sv->mutable_cf_options.max_write_buffer_number; } - if (sv_context->new_superversion == nullptr) { + // this branch is unlikely to step in + if (UNLIKELY(sv_context->new_superversion == nullptr)) { sv_context->NewSuperVersion(); } cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options); // Whenever we install new SuperVersion, we might need to issue new flushes or // compactions. - SchedulePendingFlush(cfd, flush_reason); SchedulePendingCompaction(cfd); MaybeScheduleFlushOrCompaction(); diff -Nru rocksdb-5.15.10/db/db_impl_debug.cc rocksdb-5.17.2/db/db_impl_debug.cc --- rocksdb-5.15.10/db/db_impl_debug.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl_debug.cc 2018-11-12 19:57:32.000000000 +0000 @@ -100,9 +100,11 @@ return SwitchMemtable(cfd, &write_context); } -Status DBImpl::TEST_FlushMemTable(bool wait, ColumnFamilyHandle* cfh) { +Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall, + ColumnFamilyHandle* cfh) { FlushOptions fo; fo.wait = wait; + fo.allow_write_stall = allow_write_stall; ColumnFamilyData* cfd; if (cfh == nullptr) { cfd = default_cf_handle_->cfd(); @@ -135,7 +137,7 @@ while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || bg_flush_scheduled_ || (wait_unscheduled && unscheduled_compactions_)) && - !error_handler_.IsDBStopped()) { + (error_handler_.GetBGError() == Status::OK())) { bg_cv_.Wait(); } return error_handler_.GetBGError(); @@ -235,5 +237,11 @@ } } +size_t DBImpl::TEST_GetWalPreallocateBlockSize( + uint64_t write_buffer_size) const { + InstrumentedMutexLock l(&mutex_); + return GetWalPreallocateBlockSize(write_buffer_size); +} + } // namespace rocksdb #endif // NDEBUG diff -Nru rocksdb-5.15.10/db/db_impl_experimental.cc rocksdb-5.17.2/db/db_impl_experimental.cc --- rocksdb-5.15.10/db/db_impl_experimental.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl_experimental.cc 2018-11-12 19:57:32.000000000 +0000 @@ -131,16 +131,16 @@ edit.DeleteFile(0, f->fd.GetNumber()); edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno, + f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWork( - cfd, &job_context.superversion_context, - *cfd->GetLatestMutableCFOptions()); + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); } } // lock released here LogFlush(immutable_db_options_.info_log); diff -Nru rocksdb-5.15.10/db/db_impl.h rocksdb-5.17.2/db/db_impl.h --- rocksdb-5.15.10/db/db_impl.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl.h 2018-11-12 19:57:32.000000000 +0000 @@ -22,9 +22,9 @@ #include "db/column_family.h" #include "db/compaction_job.h" #include "db/dbformat.h" -#include "db/external_sst_file_ingestion_job.h" #include "db/error_handler.h" #include "db/event_helpers.h" +#include "db/external_sst_file_ingestion_job.h" #include "db/flush_job.h" #include "db/flush_scheduler.h" #include "db/internal_stats.h" @@ 
-46,6 +46,7 @@ #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/status.h" +#include "rocksdb/trace_reader_writer.h" #include "rocksdb/transaction_log.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" @@ -54,6 +55,7 @@ #include "util/hash.h" #include "util/stop_watch.h" #include "util/thread_local.h" +#include "util/trace_replay.h" namespace rocksdb { @@ -333,6 +335,15 @@ virtual Status VerifyChecksum() override; + using DB::StartTrace; + virtual Status StartTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndTrace; + virtual Status EndTrace() override; + Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key); + Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); #endif // ROCKSDB_LITE // Similar to GetSnapshot(), but also lets the db know that this snapshot @@ -385,7 +396,7 @@ Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); // Force current memtable contents to be flushed. - Status TEST_FlushMemTable(bool wait = true, + Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, ColumnFamilyHandle* cfh = nullptr); // Wait for memtable compaction @@ -453,6 +464,7 @@ int TEST_BGCompactionsAllowed() const; int TEST_BGFlushesAllowed() const; + size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; #endif // NDEBUG @@ -697,6 +709,8 @@ Statistics* stats_; std::unordered_map recovered_transactions_; + std::unique_ptr tracer_; + InstrumentedMutex trace_mutex_; // Except in DB::Open(), WriteOptionsFile can only be called when: // Persist options to options file. @@ -782,6 +796,7 @@ private: friend class DB; + friend class ErrorHandler; friend class InternalStats; friend class PessimisticTransaction; friend class TransactionBaseImpl; @@ -790,18 +805,21 @@ friend class WritePreparedTxnDB; friend class WriteBatchWithIndex; friend class WriteUnpreparedTxnDB; + friend class WriteUnpreparedTxn; + #ifndef ROCKSDB_LITE friend class ForwardIterator; #endif friend struct SuperVersion; friend class CompactedDBImpl; friend class DBTest_ConcurrentFlushWAL_Test; + friend class DBTest_MixedSlowdownOptionsStop_Test; #ifndef NDEBUG friend class DBTest2_ReadCallbackTest_Test; friend class WriteCallbackTest_WriteWithCallbackTest_Test; friend class XFTransactionWriteHandler; friend class DBBlobIndexTest; - friend class WriteUnpreparedTransactionTest_RecoveryRollbackUnprepared_Test; + friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; #endif struct CompactionState; @@ -830,6 +848,8 @@ bool read_only = false, bool error_if_log_file_exist = false, bool error_if_data_exists_in_logs = false); + Status ResumeImpl(); + void MaybeIgnoreError(Status* s) const; const Status CreateArchivalDirectory(); @@ -869,12 +889,41 @@ Status SyncClosedLogs(JobContext* job_context); // Flush the in-memory write buffer to storage. Switches to a new - // log-file/memtable and writes a new descriptor iff successful. + // log-file/memtable and writes a new descriptor iff successful. Then + // installs a new super version for the column family. Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, bool* madeProgress, JobContext* job_context, + SuperVersionContext* superversion_context, LogBuffer* log_buffer); + // Argument required by background flush thread. 
+ struct BGFlushArg { + BGFlushArg() + : cfd_(nullptr), memtable_id_(0), superversion_context_(nullptr) {} + BGFlushArg(ColumnFamilyData* cfd, uint64_t memtable_id, + SuperVersionContext* superversion_context) + : cfd_(cfd), + memtable_id_(memtable_id), + superversion_context_(superversion_context) {} + + // Column family to flush. + ColumnFamilyData* cfd_; + // Maximum ID of memtable to flush. In this column family, memtables with + // IDs smaller than this value must be flushed before this flush completes. + uint64_t memtable_id_; + // Pointer to a SuperVersionContext object. After flush completes, RocksDB + // installs a new superversion for the column family. This operation + // requires a SuperVersionContext object (currently embedded in JobContext). + SuperVersionContext* superversion_context_; + }; + + // Flush the memtables of (multiple) column families to multiple files on + // persistent storage. + Status FlushMemTablesToOutputFiles( + const autovector& bg_flush_args, bool* made_progress, + JobContext* job_context, LogBuffer* log_buffer); + // REQUIRES: log_numbers are sorted in ascending order Status RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, bool read_only); @@ -887,6 +936,12 @@ Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); + // Restore alive_log_files_ and total_log_size_ after recovery. + // It needs to run only when there's no flush during recovery + // (e.g. avoid_flush_during_recovery=true). May also trigger flush + // in case total_log_size > max_total_wal_size. + Status RestoreAliveLogFiles(const std::vector& log_numbers); + // num_bytes: for slowdown case, delay time is calculated based on // `num_bytes` going through. Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options); @@ -896,19 +951,28 @@ Status ScheduleFlushes(WriteContext* context); - Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, - FlushReason flush_reason = FlushReason::kOthers); + Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context); // Force current memtable contents to be flushed. Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options, FlushReason flush_reason, bool writes_stopped = false); + // Wait until flushing this column family won't stall writes + Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, + bool* flush_needed); + // Wait for memtable flushed. // If flush_memtable_id is non-null, wait until the memtable with the ID // gets flush. Otherwise, wait until the column family don't have any // memtable pending flush. Status WaitForFlushMemTable(ColumnFamilyData* cfd, - const uint64_t* flush_memtable_id = nullptr); + const uint64_t* flush_memtable_id = nullptr) { + return WaitForFlushMemTables({cfd}, {flush_memtable_id}); + } + // Wait for memtables to be flushed for multiple column families. + Status WaitForFlushMemTables( + const autovector& cfds, + const autovector& flush_memtable_ids); // REQUIRES: mutex locked Status SwitchWAL(WriteContext* write_context); @@ -964,7 +1028,17 @@ ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name); void MaybeScheduleFlushOrCompaction(); - void SchedulePendingFlush(ColumnFamilyData* cfd, FlushReason flush_reason); + + // A flush request specifies the column families to flush as well as the + // largest memtable id to persist for each column family. 
Once all the + // memtables whose IDs are smaller than or equal to this per-column-family + // specified value, this flush request is considered to have completed its + // work of flushing this column family. After completing the work for all + // column families in this request, this flush is considered complete. + typedef std::vector> FlushRequest; + + void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason); + void SchedulePendingCompaction(ColumnFamilyData* cfd); void SchedulePendingPurge(std::string fname, std::string dir_to_sync, FileType type, uint64_t number, int job_id); @@ -983,9 +1057,10 @@ LogBuffer* log_buffer, PrepickedCompaction* prepicked_compaction); Status BackgroundFlush(bool* madeProgress, JobContext* job_context, - LogBuffer* log_buffer); + LogBuffer* log_buffer, FlushReason* reason); - bool EnoughRoomForCompaction(const std::vector& inputs, + bool EnoughRoomForCompaction(ColumnFamilyData* cfd, + const std::vector& inputs, bool* sfm_bookkeeping, LogBuffer* log_buffer); void PrintStatistics(); @@ -1006,8 +1081,7 @@ // helper functions for adding and removing from flush & compaction queues void AddToCompactionQueue(ColumnFamilyData* cfd); ColumnFamilyData* PopFirstFromCompactionQueue(); - void AddToFlushQueue(ColumnFamilyData* cfd, FlushReason flush_reason); - ColumnFamilyData* PopFirstFromFlushQueue(); + FlushRequest PopFirstFromFlushQueue(); // helper function to call after some of the logs_ were synced void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status); @@ -1020,6 +1094,10 @@ Status CloseHelper(); + Status FlushAllCFs(FlushReason flush_reason); + + void WaitForBackgroundWork(); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; @@ -1240,7 +1318,7 @@ // in MaybeScheduleFlushOrCompaction() // invariant(column family present in flush_queue_ <==> // ColumnFamilyData::pending_flush_ == true) - std::deque flush_queue_; + std::deque flush_queue_; // invariant(column family present in compaction_queue_ <==> // ColumnFamilyData::pending_compaction_ == true) std::deque compaction_queue_; @@ -1464,6 +1542,16 @@ // flush/compaction and if it is not provided vis SnapshotChecker, we should // disable gc to be safe. const bool use_custom_gc_; + // Flag to indicate that the DB instance shutdown has been initiated. This + // different from shutting_down_ atomic in that it is set at the beginning + // of shutdown sequence, specifically in order to prevent any background + // error recovery from going on in parallel. The latter, shutting_down_, + // is set a little later during the shutdown after scheduling memtable + // flushes + bool shutdown_initiated_; + // Flag to indicate whether sst_file_manager object was allocated in + // DB::Open() or passed to us + bool own_sfm_; // Clients must periodically call SetPreserveDeletesSequenceNumber() // to advance this seqnum. Default value is 0 which means ALL deletes are diff -Nru rocksdb-5.15.10/db/db_impl_open.cc rocksdb-5.17.2/db/db_impl_open.cc --- rocksdb-5.15.10/db/db_impl_open.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl_open.cc 2018-11-12 19:57:32.000000000 +0000 @@ -134,8 +134,15 @@ for (size_t i = 0; i < result.db_paths.size(); i++) { DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path); } -#endif + // Create a default SstFileManager for purposes of tracking compaction size + // and facilitating recovery from out of space errors. 
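
A short sketch of supplying an SstFileManager explicitly through DBOptions; when the option is left unset, the code that follows now creates a default one during DB::Open() (tracked by own_sfm_). The function name here is illustrative.

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

rocksdb::Options MakeOptionsWithExplicitSfm() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Explicitly provided SFM; it will be used for compaction-size tracking
  // and out-of-space recovery instead of the internally created default.
  options.sst_file_manager.reset(
      rocksdb::NewSstFileManager(rocksdb::Env::Default(), options.info_log));
  return options;
}
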
+ if (result.sst_file_manager.get() == nullptr) { + std::shared_ptr sst_file_manager( + NewSstFileManager(result.env, result.info_log)); + result.sst_file_manager = sst_file_manager; + } +#endif return result; } @@ -232,7 +239,7 @@ file->SetPreallocationBlockSize( immutable_db_options_.manifest_preallocation_size); unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options)); + new WritableFileWriter(std::move(file), manifest, env_options)); log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); @@ -361,7 +368,7 @@ s = env_->NewRandomAccessFile(IdentityFileName(dbname_), &idfile, customized_env); if (!s.ok()) { - const char* error_msg = s.ToString().c_str(); + std::string error_str = s.ToString(); // Check if unsupported Direct I/O is the root cause customized_env.use_direct_reads = false; s = env_->NewRandomAccessFile(IdentityFileName(dbname_), &idfile, @@ -371,7 +378,7 @@ "Direct I/O is not supported by the specified DB."); } else { return Status::InvalidArgument( - "Found options incompatible with filesystem", error_msg); + "Found options incompatible with filesystem", error_str.c_str()); } } } @@ -389,6 +396,16 @@ } } } + + // Initial max_total_in_memory_state_ before recovery logs. Log recovery + // may check this value to decide whether to flush. + max_total_in_memory_state_ = 0; + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + } + if (s.ok()) { SequenceNumber next_sequence(kMaxSequenceNumber); default_cf_handle_ = new ColumnFamilyHandleImpl( @@ -461,14 +478,6 @@ } } - // Initial value - max_total_in_memory_state_ = 0; - for (auto cfd : *versions_->GetColumnFamilySet()) { - auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); - max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * - mutable_cf_options->max_write_buffer_number; - } - return s; } @@ -598,8 +607,7 @@ // to be skipped instead of propagating bad information (like overly // large sequence numbers). 
log::Reader reader(immutable_db_options_.info_log, std::move(file_reader), - &reporter, true /*checksum*/, 0 /*initial_offset*/, - log_number); + &reporter, true /*checksum*/, log_number); // Determine if we should tolerate incomplete records at the tail end of the // Read all the records and add to a memtable @@ -879,18 +887,8 @@ } } - if (data_seen && !flushed) { - // Mark these as alive so they'll be considered for deletion later by - // FindObsoleteFiles() - if (two_write_queues_) { - log_write_mutex_.Lock(); - } - for (auto log_number : log_numbers) { - alive_log_files_.push_back(LogFileNumberSize(log_number)); - } - if (two_write_queues_) { - log_write_mutex_.Unlock(); - } + if (status.ok() && data_seen && !flushed) { + status = RestoreAliveLogFiles(log_numbers); } event_logger_.Log() << "job" << job_id << "event" @@ -899,6 +897,60 @@ return status; } +Status DBImpl::RestoreAliveLogFiles(const std::vector& log_numbers) { + if (log_numbers.empty()) { + return Status::OK(); + } + Status s; + mutex_.AssertHeld(); + assert(immutable_db_options_.avoid_flush_during_recovery); + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + // Mark these as alive so they'll be considered for deletion later by + // FindObsoleteFiles() + total_log_size_ = 0; + log_empty_ = false; + for (auto log_number : log_numbers) { + LogFileNumberSize log(log_number); + std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + // This gets the appear size of the logs, not including preallocated space. + s = env_->GetFileSize(fname, &log.size); + if (!s.ok()) { + break; + } + total_log_size_ += log.size; + alive_log_files_.push_back(log); + // We preallocate space for logs, but then after a crash and restart, those + // preallocated space are not needed anymore. It is likely only the last + // log has such preallocated space, so we only truncate for the last log. + if (log_number == log_numbers.back()) { + std::unique_ptr last_log; + Status truncate_status = env_->ReopenWritableFile( + fname, &last_log, + env_->OptimizeForLogWrite( + env_options_, + BuildDBOptions(immutable_db_options_, mutable_db_options_))); + if (truncate_status.ok()) { + truncate_status = last_log->Truncate(log.size); + } + if (truncate_status.ok()) { + truncate_status = last_log->Close(); + } + // Not a critical error if fail to truncate. 
+ if (!truncate_status.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to truncate log #%" PRIu64 ": %s", log_number, + truncate_status.ToString().c_str()); + } + } + } + if (two_write_queues_) { + log_write_mutex_.Unlock(); + } + return s; +} + Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit) { mutex_.AssertHeld(); @@ -969,7 +1021,7 @@ if (s.ok() && meta.fd.GetFileSize() > 0) { edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), meta.smallest, meta.largest, - meta.smallest_seqno, meta.largest_seqno, + meta.fd.smallest_seqno, meta.fd.largest_seqno, meta.marked_for_compaction); } @@ -1051,6 +1103,12 @@ break; } } + + // For recovery from NoSpace() error, we can only handle + // the case where the database is stored in a single path + if (paths.size() <= 1) { + impl->error_handler_.EnableAutoRecovery(); + } } if (!s.ok()) { @@ -1075,10 +1133,10 @@ impl->immutable_db_options_.env->OptimizeForLogWrite( soptions, BuildDBOptions(impl->immutable_db_options_, impl->mutable_db_options_)); - s = NewWritableFile( - impl->immutable_db_options_.env, - LogFileName(impl->immutable_db_options_.wal_dir, new_log_number), - &lfile, opt_env_options); + std::string log_fname = + LogFileName(impl->immutable_db_options_.wal_dir, new_log_number); + s = NewWritableFile(impl->immutable_db_options_.env, log_fname, &lfile, + opt_env_options); if (s.ok()) { lfile->SetWriteLifeTimeHint(write_hint); lfile->SetPreallocationBlockSize( @@ -1086,8 +1144,8 @@ { InstrumentedMutexLock wl(&impl->log_write_mutex_); impl->logfile_number_ = new_log_number; - unique_ptr file_writer( - new WritableFileWriter(std::move(lfile), opt_env_options)); + unique_ptr file_writer(new WritableFileWriter( + std::move(lfile), log_fname, opt_env_options)); impl->logs_.emplace_back( new_log_number, new log::Writer( @@ -1214,6 +1272,14 @@ } } } + + // Reserve some disk buffer space. This is a heuristic - when we run out + // of disk space, this ensures that there is atleast write_buffer_size + // amount of free space before we resume DB writes. In low disk space + // conditions, we want to avoid a lot of small L0 files due to frequent + // WAL write failures and resultant forced flushes + sfm->ReserveDiskBuffer(max_write_buffer_size, + impl->immutable_db_options_.db_paths[0].path); } #endif // !ROCKSDB_LITE diff -Nru rocksdb-5.15.10/db/db_impl_readonly.cc rocksdb-5.17.2/db/db_impl_readonly.cc --- rocksdb-5.15.10/db/db_impl_readonly.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl_readonly.cc 2018-11-12 19:57:32.000000000 +0000 @@ -31,22 +31,38 @@ ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { assert(pinnable_val != nullptr); + // TODO: stopwatch DB_GET needed?, perf timer needed? 
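
A small sketch of how the statistics tickers added to the read-only Get() path below surface to callers; the database path, key, and function name are placeholders.

#include <cstdint>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

void ReadOnlyGetWithStats(const std::string& db_path) {
  rocksdb::Options options;
  options.statistics = rocksdb::CreateDBStatistics();

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::OpenForReadOnly(options, db_path, &db);
  if (!s.ok()) {
    return;
  }

  std::string value;
  db->Get(rocksdb::ReadOptions(), "some_key", &value);  // placeholder key

  // NUMBER_KEYS_READ / BYTES_READ / MEMTABLE_HIT are now recorded for
  // read-only instances as well, per the hunk below.
  uint64_t keys_read =
      options.statistics->getTickerCount(rocksdb::NUMBER_KEYS_READ);
  (void)keys_read;

  delete db;
}
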
+ PERF_TIMER_GUARD(get_snapshot_time); Status s; SequenceNumber snapshot = versions_->LastSequence(); auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + } + } SuperVersion* super_version = cfd->GetSuperVersion(); MergeContext merge_context; RangeDelAggregator range_del_agg(cfd->internal_comparator(), snapshot); LookupKey lkey(key, snapshot); + PERF_TIMER_STOP(get_snapshot_time); if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, &range_del_agg, read_options)) { pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); } else { PERF_TIMER_GUARD(get_from_output_files_time); super_version->current->Get(read_options, lkey, pinnable_val, &s, &merge_context, &range_del_agg); + RecordTick(stats_, MEMTABLE_MISS); } + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + MeasureTime(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); return s; } diff -Nru rocksdb-5.15.10/db/db_impl_write.cc rocksdb-5.17.2/db/db_impl_write.cc --- rocksdb-5.15.10/db/db_impl_write.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_impl_write.cc 2018-11-12 19:57:32.000000000 +0000 @@ -76,6 +76,12 @@ if (my_batch == nullptr) { return Status::Corruption("Batch is nullptr!"); } + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Write(my_batch); + } + } if (write_options.sync && write_options.disableWAL) { return Status::InvalidArgument("Sync writes has to enable WAL."); } @@ -311,7 +317,8 @@ w.status = WriteBatchInternal::InsertInto( write_group, current_sequence, column_family_memtables_.get(), &flush_scheduler_, write_options.ignore_missing_column_families, - 0 /*recovery_log_number*/, this, parallel, seq_per_batch_); + 0 /*recovery_log_number*/, this, parallel, seq_per_batch_, + batch_per_txn_); } else { SequenceNumber next_sequence = current_sequence; // Note: the logic for advancing seq here must be consistent with the @@ -346,7 +353,7 @@ &w, w.sequence, &column_family_memtables, &flush_scheduler_, write_options.ignore_missing_column_families, 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, seq_per_batch_, - w.batch_cnt); + w.batch_cnt, batch_per_txn_); } } if (seq_used != nullptr) { @@ -508,7 +515,8 @@ memtable_write_group.status = WriteBatchInternal::InsertInto( memtable_write_group, w.sequence, column_family_memtables_.get(), &flush_scheduler_, write_options.ignore_missing_column_families, - 0 /*log_number*/, this, seq_per_batch_); + 0 /*log_number*/, this, false /*concurrent_memtable_writes*/, + seq_per_batch_, batch_per_txn_); versions_->SetLastSequence(memtable_write_group.last_sequence); write_thread_.ExitAsMemTableWriter(&w, memtable_write_group); } @@ -565,7 +573,6 @@ } // else we are the leader of the write batch group assert(w.state == WriteThread::STATE_GROUP_LEADER); - WriteContext write_context; WriteThread::WriteGroup write_group; uint64_t last_sequence; nonmem_write_thread_.EnterAsBatchGroupLeader(&w, &write_group); @@ -703,6 +710,10 @@ assert(write_context != nullptr && need_log_sync != nullptr); Status status; + if (error_handler_.IsDBStopped()) { + status = error_handler_.GetBGError(); + } + PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time); assert(!single_column_family_mode_ || @@ -721,10 +732,6 @@ status = HandleWriteBufferFull(write_context); } - if (UNLIKELY(status.ok())) { - 
status = error_handler_.GetBGError(); - } - if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { status = ScheduleFlushes(write_context); } @@ -1057,6 +1064,7 @@ oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize()); // no need to refcount because drop is happening in write thread, so can't // happen while we're in the write thread + FlushRequest flush_req; for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->IsDropped()) { continue; @@ -1066,11 +1074,14 @@ if (!status.ok()) { break; } + flush_req.emplace_back(cfd, cfd->imm()->GetLatestMemTableID()); cfd->imm()->FlushRequested(); - SchedulePendingFlush(cfd, FlushReason::kWriteBufferManager); } } - MaybeScheduleFlushOrCompaction(); + if (status.ok()) { + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); + MaybeScheduleFlushOrCompaction(); + } return status; } @@ -1109,14 +1120,26 @@ } } } + + autovector cfds; if (cfd_picked != nullptr) { - status = SwitchMemtable(cfd_picked, write_context, - FlushReason::kWriteBufferFull); - if (status.ok()) { - cfd_picked->imm()->FlushRequested(); - SchedulePendingFlush(cfd_picked, FlushReason::kWriteBufferFull); - MaybeScheduleFlushOrCompaction(); + cfds.push_back(cfd_picked); + } + FlushRequest flush_req; + for (const auto cfd : cfds) { + cfd->Ref(); + status = SwitchMemtable(cfd, write_context); + cfd->Unref(); + if (!status.ok()) { + break; } + uint64_t flush_memtable_id = cfd->imm()->GetLatestMemTableID(); + cfd->imm()->FlushRequested(); + flush_req.emplace_back(cfd, flush_memtable_id); + } + if (status.ok()) { + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + MaybeScheduleFlushOrCompaction(); } return status; } @@ -1139,10 +1162,14 @@ uint64_t delay = write_controller_.GetDelay(env_, num_bytes); if (delay > 0) { if (write_options.no_slowdown) { - return Status::Incomplete(); + return Status::Incomplete("Write stall"); } TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep"); + // Notify write_thread_ about the stall so it can setup a barrier and + // fail any pending writers with no_slowdown + write_thread_.BeginWriteStall(); + TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone"); mutex_.Unlock(); // We will delay the write until we have slept for delay ms or // we don't need a delay anymore @@ -1159,15 +1186,25 @@ env_->SleepForMicroseconds(kDelayInterval); } mutex_.Lock(); + write_thread_.EndWriteStall(); } - while (!error_handler_.IsDBStopped() && write_controller_.IsStopped()) { + // Don't wait if there's a background error, even if its a soft error. 
We + // might wait here indefinitely as the background compaction may never + // finish successfully, resulting in the stall condition lasting + // indefinitely + while (error_handler_.GetBGError().ok() && write_controller_.IsStopped()) { if (write_options.no_slowdown) { - return Status::Incomplete(); + return Status::Incomplete("Write stall"); } delayed = true; + + // Notify write_thread_ about the stall so it can setup a barrier and + // fail any pending writers with no_slowdown + write_thread_.BeginWriteStall(); TEST_SYNC_POINT("DBImpl::DelayWrite:Wait"); bg_cv_.Wait(); + write_thread_.EndWriteStall(); } } assert(!delayed || !write_options.no_slowdown); @@ -1177,7 +1214,19 @@ RecordTick(stats_, STALL_MICROS, time_delayed); } - return error_handler_.GetBGError(); + // If DB is not in read-only mode and write_controller is not stopping + // writes, we can ignore any background errors and allow the write to + // proceed + Status s; + if (write_controller_.IsStopped()) { + // If writes are still stopped, it means we bailed due to a background + // error + s = Status::Incomplete(error_handler_.GetBGError().ToString()); + } + if (error_handler_.IsDBStopped()) { + s = error_handler_.GetBGError(); + } + return s; } Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, @@ -1212,16 +1261,28 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { ColumnFamilyData* cfd; + FlushRequest flush_req; + Status status; while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { - auto status = SwitchMemtable(cfd, context, FlushReason::kWriteBufferFull); + status = SwitchMemtable(cfd, context); + bool should_schedule = true; if (cfd->Unref()) { delete cfd; + should_schedule = false; } if (!status.ok()) { - return status; + break; + } + if (should_schedule) { + uint64_t flush_memtable_id = cfd->imm()->GetLatestMemTableID(); + flush_req.emplace_back(cfd, flush_memtable_id); } } - return Status::OK(); + if (status.ok()) { + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + MaybeScheduleFlushOrCompaction(); + } + return status; } #ifndef ROCKSDB_LITE @@ -1242,8 +1303,7 @@ // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context, - FlushReason flush_reason) { +Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { mutex_.AssertHeld(); WriteThread::Writer nonmem_w; if (two_write_queues_) { @@ -1312,6 +1372,8 @@ auto write_hint = CalculateWALWriteHint(); mutex_.Unlock(); { + std::string log_fname = + LogFileName(immutable_db_options_.wal_dir, new_log_number); if (creating_new_log) { EnvOptions opt_env_opt = env_->OptimizeForLogWrite(env_options_, db_options); @@ -1319,14 +1381,12 @@ ROCKS_LOG_INFO(immutable_db_options_.info_log, "reusing log %" PRIu64 " from recycle list\n", recycle_log_number); - s = env_->ReuseWritableFile( - LogFileName(immutable_db_options_.wal_dir, new_log_number), - LogFileName(immutable_db_options_.wal_dir, recycle_log_number), - &lfile, opt_env_opt); + std::string old_log_fname = + LogFileName(immutable_db_options_.wal_dir, recycle_log_number); + s = env_->ReuseWritableFile(log_fname, old_log_fname, &lfile, + opt_env_opt); } else { - s = NewWritableFile( - env_, LogFileName(immutable_db_options_.wal_dir, new_log_number), - &lfile, opt_env_opt); + s = NewWritableFile(env_, log_fname, &lfile, opt_env_opt); } if (s.ok()) { // Our final size should be less than write_buffer_size @@ -1337,7 +1397,7 @@ 
lfile->SetPreallocationBlockSize(preallocate_block_size); lfile->SetWriteLifeTimeHint(write_hint); unique_ptr file_writer( - new WritableFileWriter(std::move(lfile), opt_env_opt)); + new WritableFileWriter(std::move(lfile), log_fname, opt_env_opt)); new_log = new log::Writer( std::move(file_writer), new_log_number, immutable_db_options_.recycle_log_file_num > 0, manual_wal_flush_); @@ -1415,7 +1475,7 @@ new_mem->Ref(); cfd->SetMemtable(new_mem); InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context, - mutable_cf_options, flush_reason); + mutable_cf_options); if (two_write_queues_) { nonmem_write_thread_.ExitUnbatched(&nonmem_w); } diff -Nru rocksdb-5.15.10/db/db_iterator_test.cc rocksdb-5.17.2/db/db_iterator_test.cc --- rocksdb-5.15.10/db/db_iterator_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_iterator_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -2093,6 +2093,34 @@ iter.reset(); } +TEST_P(DBIteratorTest, RefreshWithSnapshot) { + ASSERT_OK(Put("x", "y")); + const Snapshot* snapshot = db_->GetSnapshot(); + ReadOptions options; + options.snapshot = snapshot; + Iterator* iter = NewIterator(options); + + iter->Seek(Slice("a")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("x")), 0); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + ASSERT_OK(Put("c", "d")); + + iter->Seek(Slice("a")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("x")), 0); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + Status s; + s = iter->Refresh(); + ASSERT_TRUE(s.IsNotSupported()); + db_->ReleaseSnapshot(snapshot); + delete iter; +} + TEST_P(DBIteratorTest, CreationFailure) { SyncPoint::GetInstance()->SetCallBack( "DBImpl::NewInternalIterator:StatusCallback", [](void* arg) { diff -Nru rocksdb-5.15.10/db/db_iter.cc rocksdb-5.17.2/db/db_iter.cc --- rocksdb-5.15.10/db/db_iter.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_iter.cc 2018-11-12 19:57:32.000000000 +0000 @@ -27,6 +27,7 @@ #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" +#include "util/trace_replay.h" namespace rocksdb { @@ -114,7 +115,8 @@ const MutableCFOptions& mutable_cf_options, const Comparator* cmp, InternalIterator* iter, SequenceNumber s, bool arena_mode, uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, bool allow_blob) + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob) : arena_mode_(arena_mode), env_(_env), logger_(cf_options.info_log), @@ -135,6 +137,8 @@ range_del_agg_(cf_options.internal_comparator, s, true /* collapse_deletions */), read_callback_(read_callback), + db_impl_(db_impl), + cfd_(cfd), allow_blob_(allow_blob), is_blob_(false), start_seqnum_(read_options.iter_start_seqnum) { @@ -344,6 +348,8 @@ LocalStatistics local_stats_; PinnedIteratorsManager pinned_iters_mgr_; ReadCallback* read_callback_; + DBImpl* db_impl_; + ColumnFamilyData* cfd_; bool allow_blob_; bool is_blob_; // for diff snapshots we want the lower bound on the seqnum; @@ -1267,6 +1273,12 @@ saved_key_.Clear(); saved_key_.SetInternalKey(target, seq); +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + db_impl_->TraceIteratorSeek(cfd_->GetID(), target); + } +#endif // ROCKSDB_LITE + if (iterate_lower_bound_ != nullptr && user_comparator_->Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < 0) { @@ -1331,6 +1343,12 @@ range_del_agg_.InvalidateRangeDelMapPositions(); } +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + 
db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target); + } +#endif // ROCKSDB_LITE + RecordTick(statistics_, NUMBER_DB_SEEK); if (iter_->Valid()) { if (prefix_extractor_ && prefix_same_as_start_) { @@ -1453,11 +1471,12 @@ InternalIterator* internal_iter, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, bool allow_blob) { - DBIter* db_iter = - new DBIter(env, read_options, cf_options, mutable_cf_options, - user_key_comparator, internal_iter, sequence, false, - max_sequential_skip_in_iterations, read_callback, allow_blob); + ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob) { + DBIter* db_iter = new DBIter( + env, read_options, cf_options, mutable_cf_options, user_key_comparator, + internal_iter, sequence, false, max_sequential_skip_in_iterations, + read_callback, db_impl, cfd, allow_blob); return db_iter; } @@ -1504,13 +1523,14 @@ const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration, uint64_t version_number, - ReadCallback* read_callback, bool allow_blob, + ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob, bool allow_refresh) { auto mem = arena_.AllocateAligned(sizeof(DBIter)); - db_iter_ = new (mem) - DBIter(env, read_options, cf_options, mutable_cf_options, - cf_options.user_comparator, nullptr, sequence, true, - max_sequential_skip_in_iteration, read_callback, allow_blob); + db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options, + cf_options.user_comparator, nullptr, sequence, + true, max_sequential_skip_in_iteration, + read_callback, db_impl, cfd, allow_blob); sv_number_ = version_number; allow_refresh_ = allow_refresh; } @@ -1534,7 +1554,8 @@ SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_->mutex()); Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, - cur_sv_number, read_callback_, allow_blob_, allow_refresh_); + cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_, + allow_refresh_); InternalIterator* internal_iter = db_impl_->NewInternalIterator( read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator()); @@ -1556,7 +1577,7 @@ ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); iter->Init(env, read_options, cf_options, mutable_cf_options, sequence, max_sequential_skip_in_iterations, version_number, read_callback, - allow_blob, allow_refresh); + db_impl, cfd, allow_blob, allow_refresh); if (db_impl != nullptr && cfd != nullptr && allow_refresh) { iter->StoreRefreshInfo(read_options, db_impl, cfd, read_callback, allow_blob); diff -Nru rocksdb-5.15.10/db/db_iter.h rocksdb-5.17.2/db/db_iter.h --- rocksdb-5.15.10/db/db_iter.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_iter.h 2018-11-12 19:57:32.000000000 +0000 @@ -23,20 +23,18 @@ class Arena; class DBIter; -class InternalIterator; // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified "sequence" number // into appropriate user keys. 
-extern Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, - const Comparator* user_key_comparator, - InternalIterator* internal_iter, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, - bool allow_blob = false); +extern Iterator* NewDBIterator( + Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, + const Comparator* user_key_comparator, InternalIterator* internal_iter, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl = nullptr, + ColumnFamilyData* cfd = nullptr, bool allow_blob = false); // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed be allocated. This class is used as an entry point of @@ -75,7 +73,8 @@ const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, bool allow_blob, bool allow_refresh); + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob, bool allow_refresh); void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl, ColumnFamilyData* cfd, ReadCallback* read_callback, diff -Nru rocksdb-5.15.10/db/db_properties_test.cc rocksdb-5.17.2/db/db_properties_test.cc --- rocksdb-5.15.10/db/db_properties_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_properties_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -180,17 +180,16 @@ ResetTableProperties(tp); sscanf(tp_string.c_str(), "# data blocks %" SCNu64 " # entries %" SCNu64 - " # range deletions %" SCNu64 - " raw key size %" SCNu64 + " # range deletions %" SCNu64 " raw key size %" SCNu64 " raw average key size %lf " " raw value size %" SCNu64 " raw average value size %lf " " data block size %" SCNu64 " index block size (user-key? %" SCNu64 - ") %" SCNu64 " filter block size %" SCNu64, + ", delta-value? 
%" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, &tp->num_data_blocks, &tp->num_entries, &tp->num_range_deletions, &tp->raw_key_size, &dummy_double, &tp->raw_value_size, &dummy_double, - &tp->data_size, &tp->index_key_is_user_key, &tp->index_size, - &tp->filter_size); + &tp->data_size, &tp->index_key_is_user_key, + &tp->index_value_is_delta_encoded, &tp->index_size, &tp->filter_size); } void VerifySimilar(uint64_t a, uint64_t b, double bias) { @@ -224,14 +223,11 @@ ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions); } -void GetExpectedTableProperties(TableProperties* expected_tp, - const int kKeySize, const int kValueSize, - const int kKeysPerTable, - const int kRangeDeletionsPerTable, - const int kTableCount, - const int kBloomBitsPerKey, - const size_t kBlockSize, - const bool index_key_is_user_key) { +void GetExpectedTableProperties( + TableProperties* expected_tp, const int kKeySize, const int kValueSize, + const int kKeysPerTable, const int kRangeDeletionsPerTable, + const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize, + const bool index_key_is_user_key, const bool value_delta_encoding) { const int kKeyCount = kTableCount * kKeysPerTable; const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable; const int kAvgSuccessorSize = kKeySize / 5; @@ -248,7 +244,9 @@ kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); expected_tp->index_size = expected_tp->num_data_blocks * - (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8)); + (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) - + // discount 1 byte as value size is not encoded in value delta encoding + (value_delta_encoding ? 1 : 0)); expected_tp->filter_size = kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8); } @@ -342,12 +340,14 @@ TableProperties output_tp; ParseTablePropertiesString(property, &output_tp); bool index_key_is_user_key = output_tp.index_key_is_user_key > 0; + bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0; TableProperties expected_tp; GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, kKeysPerTable, kRangeDeletionsPerTable, kTableCount, kBloomBitsPerKey, - table_options.block_size, index_key_is_user_key); + table_options.block_size, index_key_is_user_key, + value_is_delta_encoded); VerifyTableProperties(expected_tp, output_tp); } @@ -533,6 +533,7 @@ db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); ParseTablePropertiesString(tp_string, &tp); bool index_key_is_user_key = tp.index_key_is_user_key > 0; + bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0; ASSERT_EQ(sum_tp.data_size, tp.data_size); ASSERT_EQ(sum_tp.index_size, tp.index_size); ASSERT_EQ(sum_tp.filter_size, tp.filter_size); @@ -542,10 +543,10 @@ ASSERT_EQ(sum_tp.num_entries, tp.num_entries); ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions); if (table > 3) { - GetExpectedTableProperties( - &expected_tp, kKeySize, kValueSize, kKeysPerTable, - kRangeDeletionsPerTable, table, kBloomBitsPerKey, - table_options.block_size, index_key_is_user_key); + GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, + kKeysPerTable, kRangeDeletionsPerTable, table, + kBloomBitsPerKey, table_options.block_size, + index_key_is_user_key, value_is_delta_encoded); // Gives larger bias here as index block size, filter block size, // and data block size become much harder to estimate in this test. 
VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25); diff -Nru rocksdb-5.15.10/db/db_range_del_test.cc rocksdb-5.17.2/db/db_range_del_test.cc --- rocksdb-5.15.10/db/db_range_del_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_range_del_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -191,7 +191,7 @@ std::vector> files; dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); - ASSERT_GT(files[0][0].smallest_seqno, 0); + ASSERT_GT(files[0][0].fd.smallest_seqno, 0); db_->ReleaseSnapshot(snapshot); } diff -Nru rocksdb-5.15.10/db/db_sst_test.cc rocksdb-5.17.2/db/db_sst_test.cc --- rocksdb-5.15.10/db/db_sst_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_sst_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -436,7 +436,7 @@ // deleted from first db_path were deleted using DeleteScheduler and // files in the second path were not. TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) { - int bg_delete_file = 0; + std::atomic bg_delete_file(0); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DeleteScheduler::DeleteTrashFile:DeleteFile", [&](void* /*arg*/) { bg_delete_file++; }); @@ -703,26 +703,19 @@ // When bg_error_ is set we will verify that the DB size is greater // than the limit. - std::vector max_space_limits_mbs = {1, 2, 4, 8, 10}; - decltype(max_space_limits_mbs)::value_type limit_mb_cb; - bool bg_error_set = false; - uint64_t total_sst_files_size = 0; - - std::atomic estimate_multiplier(1); - int reached_max_space_on_flush = 0; - int reached_max_space_on_compaction = 0; + std::vector max_space_limits_mbs = {1, 10}; + std::atomic bg_error_set(false); + + std::atomic reached_max_space_on_flush(0); + std::atomic reached_max_space_on_compaction(0); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached", [&](void* arg) { Status* bg_error = static_cast(arg); bg_error_set = true; - GetAllSSTFiles(&total_sst_files_size); reached_max_space_on_flush++; - // low limit for size calculated using sst files - ASSERT_GE(total_sst_files_size, limit_mb_cb * 1024 * 1024); // clear error to ensure compaction callback is called *bg_error = Status::OK(); - estimate_multiplier++; // used in the main loop assert }); rocksdb::SyncPoint::GetInstance()->SetCallBack( @@ -735,15 +728,11 @@ "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached", [&](void* /*arg*/) { bg_error_set = true; - GetAllSSTFiles(&total_sst_files_size); reached_max_space_on_compaction++; }); for (auto limit_mb : max_space_limits_mbs) { bg_error_set = false; - total_sst_files_size = 0; - estimate_multiplier = 1; - limit_mb_cb = limit_mb; rocksdb::SyncPoint::GetInstance()->ClearTrace(); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); std::shared_ptr sst_file_manager(NewSstFileManager(env_)); @@ -757,21 +746,17 @@ sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024); - int keys_written = 0; - uint64_t estimated_db_size = 0; + // It is easy to detect if the test is stuck in a loop. No need for + // complex termination logic. 
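
One small but easy-to-miss fix in DeleteSchedulerMultipleDBPaths above is making the bg_delete_file counter std::atomic: the DeleteScheduler sync-point callback increments it from a background thread while the test thread reads it, so a plain int would be a data race. A minimal standalone sketch of the same pattern, with ordinary std::thread workers standing in for the background delete scheduler:

    #include <atomic>
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main() {
      // Counter bumped from "background" threads, read afterwards by the main thread.
      std::atomic<int> bg_delete_file(0);

      std::vector<std::thread> workers;
      for (int i = 0; i < 4; ++i) {
        workers.emplace_back([&bg_delete_file]() {
          // Stand-in for the DeleteScheduler::DeleteTrashFile:DeleteFile callback.
          bg_delete_file.fetch_add(1, std::memory_order_relaxed);
        });
      }
      for (auto& t : workers) {
        t.join();  // joining synchronizes, so the final load is well defined
      }
      std::printf("trash files deleted: %d\n", bg_delete_file.load());
      return 0;
    }
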
while (true) { auto s = Put(RandomString(&rnd, 10), RandomString(&rnd, 50)); if (!s.ok()) { break; } - keys_written++; - // Check the estimated db size vs the db limit just to make sure we - // dont run into an infinite loop - estimated_db_size = keys_written * 60; // ~60 bytes per key - ASSERT_LT(estimated_db_size, - estimate_multiplier * limit_mb * 1024 * 1024 * 2); } ASSERT_TRUE(bg_error_set); + uint64_t total_sst_files_size = 0; + GetAllSSTFiles(&total_sst_files_size); ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } diff -Nru rocksdb-5.15.10/db/db_table_properties_test.cc rocksdb-5.17.2/db/db_table_properties_test.cc --- rocksdb-5.15.10/db/db_table_properties_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_table_properties_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -252,13 +252,14 @@ } TEST_F(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { - const int kNumKeys = 1000; - const int kWindowSize = 100; - const int kNumDelsTrigger = 90; + int kNumKeys = 1000; + int kWindowSize = 100; + int kNumDelsTrigger = 90; + std::shared_ptr compact_on_del = + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger); Options opts = CurrentOptions(); - opts.table_properties_collector_factories.emplace_back( - NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); + opts.table_properties_collector_factories.emplace_back(compact_on_del); Reopen(opts); // add an L1 file to prevent tombstones from dropping due to obsolescence @@ -280,6 +281,48 @@ dbfull()->TEST_WaitForCompact(); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); + + // Change the window size and deletion trigger and ensure new values take + // effect + kWindowSize = 50; + kNumDelsTrigger = 40; + static_cast + (compact_on_del.get())->SetWindowSize(kWindowSize); + static_cast + (compact_on_del.get())->SetDeletionTrigger(kNumDelsTrigger); + for (int i = 0; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + Delete(Key(i)); + } else { + Put(Key(i), "val"); + } + } + Flush(); + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + + // Change the window size to disable delete triggered compaction + kWindowSize = 0; + static_cast + (compact_on_del.get())->SetWindowSize(kWindowSize); + static_cast + (compact_on_del.get())->SetDeletionTrigger(kNumDelsTrigger); + for (int i = 0; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + Delete(Key(i)); + } else { + Put(Key(i), "val"); + } + } + Flush(); + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + } } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/db_test2.cc rocksdb-5.17.2/db/db_test2.cc --- rocksdb-5.15.10/db/db_test2.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_test2.cc 2018-11-12 19:57:32.000000000 +0000 @@ -2321,9 +2321,9 @@ options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW); // Include the explicit prefetch of the footer in direct I/O case. size_t direct_io_extra = use_direct_io ? 
512 * 1024 : 0; - ASSERT_GE(rate_limited_bytes, - static_cast(kNumKeysPerFile * kBytesPerKey * kNumL0Files + - direct_io_extra)); + ASSERT_GE( + rate_limited_bytes, + static_cast(kNumKeysPerFile * kBytesPerKey * kNumL0Files)); ASSERT_LT( rate_limited_bytes, static_cast(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files + @@ -2500,6 +2500,108 @@ rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBTest2, TraceAndReplay) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreatePutOperator(); + ReadOptions ro; + WriteOptions wo; + TraceOptions trace_opts; + EnvOptions env_opts; + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + Iterator* single_iter = nullptr; + + std::string trace_filename = dbname_ + "/rocksdb.trace"; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); + ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); + + ASSERT_OK(Put(0, "a", "1")); + ASSERT_OK(Merge(0, "b", "2")); + ASSERT_OK(Delete(0, "c")); + ASSERT_OK(SingleDelete(0, "d")); + ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f")); + + WriteBatch batch; + ASSERT_OK(batch.Put("f", "11")); + ASSERT_OK(batch.Merge("g", "12")); + ASSERT_OK(batch.Delete("h")); + ASSERT_OK(batch.SingleDelete("i")); + ASSERT_OK(batch.DeleteRange("j", "k")); + ASSERT_OK(db_->Write(wo, &batch)); + + single_iter = db_->NewIterator(ro); + single_iter->Seek("f"); + single_iter->SeekForPrev("g"); + delete single_iter; + + ASSERT_EQ("1", Get(0, "a")); + ASSERT_EQ("12", Get(0, "g")); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "rocksdb", "rocks")); + ASSERT_EQ("NOT_FOUND", Get(1, "leveldb")); + + ASSERT_OK(db_->EndTrace()); + // These should not get into the trace file as it is after EndTrace. + Put("hello", "world"); + Merge("foo", "bar"); + + // Open another db, replay, and verify the data + std::string value; + std::string dbname2 = test::TmpDir(env_) + "/db_replay"; + ASSERT_OK(DestroyDB(dbname2, options)); + + // Using a different name than db2, to pacify infer's use-after-lifetime + // warnings (http://fbinfer.com). 
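
Condensed from the TraceAndReplay test above, the tracing workflow is: attach a file-backed TraceWriter with DB::StartTrace, run the workload, call DB::EndTrace, then later hand a TraceReader to a Replayer that targets a second database with matching column families. The sketch below only rearranges calls that already appear in the test; the header locations are assumptions based on this source tree (Replayer in particular is an internal class here), and error handling between steps is abbreviated:

    #include <memory>
    #include <string>
    #include <vector>

    #include "rocksdb/db.h"
    #include "rocksdb/trace_reader_writer.h"  // assumed location of NewFileTrace{Writer,Reader}
    #include "util/trace_replay.h"            // assumed location of Replayer in this version

    rocksdb::Status TraceThenReplay(
        rocksdb::DB* source_db, rocksdb::DB* target_db,
        const std::vector<rocksdb::ColumnFamilyHandle*>& target_handles,
        rocksdb::Env* env, const std::string& trace_filename) {
      rocksdb::EnvOptions env_opts;
      rocksdb::TraceOptions trace_opts;

      // 1. Record: operations issued through source_db land in the trace file.
      std::unique_ptr<rocksdb::TraceWriter> trace_writer;
      rocksdb::Status s = rocksdb::NewFileTraceWriter(env, env_opts,
                                                      trace_filename, &trace_writer);
      if (s.ok()) s = source_db->StartTrace(trace_opts, std::move(trace_writer));

      // ... run the workload against source_db here ...

      if (s.ok()) s = source_db->EndTrace();

      // 2. Replay the recorded operations against target_db.
      std::unique_ptr<rocksdb::TraceReader> trace_reader;
      if (s.ok()) {
        s = rocksdb::NewFileTraceReader(env, env_opts, trace_filename, &trace_reader);
      }
      if (s.ok()) {
        rocksdb::Replayer replayer(target_db, target_handles, std::move(trace_reader));
        s = replayer.Replay();
      }
      return s;
    }
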
+ DB* db2_init = nullptr; + options.create_if_missing = true; + ASSERT_OK(DB::Open(options, dbname2, &db2_init)); + ColumnFamilyHandle* cf; + ASSERT_OK( + db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); + delete cf; + delete db2_init; + + DB* db2 = nullptr; + std::vector column_families; + ColumnFamilyOptions cf_options; + cf_options.merge_operator = MergeOperators::CreatePutOperator(); + column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); + column_families.push_back( + ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + std::vector handles; + ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + + env_->SleepForMicroseconds(100); + // Verify that the keys don't already exist + ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound()); + + std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); + Replayer replayer(db2, handles_, std::move(trace_reader)); + ASSERT_OK(replayer.Replay()); + + ASSERT_OK(db2->Get(ro, handles[0], "a", &value)); + ASSERT_EQ("1", value); + ASSERT_OK(db2->Get(ro, handles[0], "g", &value)); + ASSERT_EQ("12", value); + ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound()); + + ASSERT_OK(db2->Get(ro, handles[1], "foo", &value)); + ASSERT_EQ("bar", value); + ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value)); + ASSERT_EQ("rocks", value); + + for (auto handle : handles) { + delete handle; + } + delete db2; + ASSERT_OK(DestroyDB(dbname2, options)); +} + #endif // ROCKSDB_LITE TEST_F(DBTest2, PinnableSliceAndMmapReads) { @@ -2547,6 +2649,192 @@ #endif } +TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.no_block_cache = false; + bbto.cache_index_and_filter_blocks = false; + bbto.block_cache = NewLRUCache(100000); + bbto.block_size = 400; // small block size + options.table_factory.reset(new BlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + std::string v = RandomString(&rnd, 400); + + // Since v is the size of a block, each key should take a block + // of 400+ bytes. + Put("1", v); + Put("3", v); + Put("5", v); + Put("7", v); + ASSERT_OK(Flush()); + + ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); + + // Verify that iterators don't pin more than one data block in block cache + // at each time. + { + unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->SeekToFirst(); + + for (int i = 0; i < 4; i++) { + ASSERT_TRUE(iter->Valid()); + // Block cache should contain exactly one block. 
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + + iter->Seek("4"); + ASSERT_TRUE(iter->Valid()); + + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + + iter->Seek("3"); + ASSERT_TRUE(iter->Valid()); + + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + } + ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); + + // Test compaction case + Put("2", v); + Put("5", v); + Put("6", v); + Put("8", v); + ASSERT_OK(Flush()); + + // Clear existing data in block cache + bbto.block_cache->SetCapacity(0); + bbto.block_cache->SetCapacity(100000); + + // Verify compaction input iterators don't hold more than one data blocks at + // one time. + std::atomic finished(false); + std::atomic block_newed(0); + std::atomic block_destroyed(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "Block::Block:0", [&](void* /*arg*/) { + if (finished) { + return; + } + // Two iterators. At most 2 outstanding blocks. + EXPECT_GE(block_newed.load(), block_destroyed.load()); + EXPECT_LE(block_newed.load(), block_destroyed.load() + 1); + block_newed.fetch_add(1); + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "Block::~Block", [&](void* /*arg*/) { + if (finished) { + return; + } + // Two iterators. At most 2 outstanding blocks. + EXPECT_GE(block_newed.load(), block_destroyed.load() + 1); + EXPECT_LE(block_newed.load(), block_destroyed.load() + 2); + block_destroyed.fetch_add(1); + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run:BeforeVerify", + [&](void* /*arg*/) { finished = true; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Two input files. Each of them has 4 data blocks. + ASSERT_EQ(8, block_newed.load()); + ASSERT_EQ(8, block_destroyed.load()); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, TestBBTTailPrefetch) { + std::atomic called(false); + size_t expected_lower_bound = 512 * 1024; + size_t expected_higher_bound = 512 * 1024; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { + size_t* prefetch_size = static_cast(arg); + EXPECT_LE(expected_lower_bound, *prefetch_size); + EXPECT_GE(expected_higher_bound, *prefetch_size); + called = true; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + expected_lower_bound = 0; + expected_higher_bound = 8 * 1024; + + Put("1", "1"); + Put("9", "1"); + Flush(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + // Full compaction to make sure there is no L0 file after the open. 
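
Both IteratorPinnedMemory and TestBBTTailPrefetch lean on the same sync-point idiom: register a callback on a named point with SetCallBack, turn processing on, run the workload, then disable processing and clear the callbacks so later tests are unaffected. A compact sketch of that idiom, assuming a debug build (sync points compile away in release builds); the point name is the one used above:

    #include <atomic>
    #include <cstddef>

    #include "util/sync_point.h"  // header path inside this source tree

    void ObserveTailPrefetchLen() {
      std::atomic<bool> called(false);
      rocksdb::SyncPoint::GetInstance()->SetCallBack(
          "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
            size_t* prefetch_size = static_cast<size_t*>(arg);
            (void)prefetch_size;  // inspect or assert on the chosen prefetch length
            called = true;
          });
      rocksdb::SyncPoint::GetInstance()->EnableProcessing();

      // ... open the DB / flush / compact so BlockBasedTable::Open runs ...

      rocksdb::SyncPoint::GetInstance()->DisableProcessing();
      rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
    }
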
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_TRUE(called.load()); + called = false; + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + + std::atomic first_call(true); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { + size_t* prefetch_size = static_cast(arg); + if (first_call) { + EXPECT_EQ(4 * 1024, *prefetch_size); + first_call = false; + } else { + EXPECT_GE(4 * 1024, *prefetch_size); + } + called = true; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.max_file_opening_threads = 1; // one thread + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.max_open_files = -1; + Reopen(options); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + ASSERT_TRUE(called.load()); + called = false; + + // Parallel loading SST files + options.max_file_opening_threads = 16; + Reopen(options); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_TRUE(called.load()); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + } // namespace rocksdb int main(int argc, char** argv) { diff -Nru rocksdb-5.15.10/db/db_test.cc rocksdb-5.17.2/db/db_test.cc --- rocksdb-5.15.10/db/db_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -262,6 +262,196 @@ } } +TEST_F(DBTest, MixedSlowdownOptions) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; + std::atomic thread_num(0); + + std::function write_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = false; + ASSERT_OK(dbfull()->Put(wo, key, "bar")); + }; + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + ASSERT_NOK(dbfull()->Put(wo, key, "bar")); + }; + // Use a small number to ensure a large delay that is still effective + // when we do Put + // TODO(myabandeh): this is time dependent and could potentially make + // the test flaky + auto token = dbfull()->TEST_write_controler().GetDelayToken(1); + std::atomic sleep_count(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:BeginWriteStallDone", + [&](void* /*arg*/) { + sleep_count.fetch_add(1); + if (threads.empty()) { + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_slowdown_func); + } + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_no_slowdown_func); + } + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = false; + wo.disableWAL = false; + wo.no_slowdown = false; + dbfull()->Put(wo, "foo", "bar"); + // We need the 2nd write to trigger delay. This is because delay is + // estimated based on the last write size which is 0 for the first write. 
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); + token.reset(); + + for (auto& t : threads) { + t.join(); + } + ASSERT_GE(sleep_count.load(), 1); + + wo.no_slowdown = true; + ASSERT_OK(dbfull()->Put(wo, "foo3", "bar")); +} + +TEST_F(DBTest, MixedSlowdownOptionsInQueue) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; + std::atomic thread_num(0); + + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + ASSERT_NOK(dbfull()->Put(wo, key, "bar")); + }; + // Use a small number to ensure a large delay that is still effective + // when we do Put + // TODO(myabandeh): this is time dependent and could potentially make + // the test flaky + auto token = dbfull()->TEST_write_controler().GetDelayToken(1); + std::atomic sleep_count(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Sleep", + [&](void* /*arg*/) { + sleep_count.fetch_add(1); + if (threads.empty()) { + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_no_slowdown_func); + } + // Sleep for 2s to allow the threads to insert themselves into the + // write queue + env_->SleepForMicroseconds(3000000ULL); + } + }); + std::atomic wait_count(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Wait", + [&](void* /*arg*/) { wait_count.fetch_add(1); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = false; + wo.disableWAL = false; + wo.no_slowdown = false; + dbfull()->Put(wo, "foo", "bar"); + // We need the 2nd write to trigger delay. This is because delay is + // estimated based on the last write size which is 0 for the first write. 
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); + token.reset(); + + for (auto& t : threads) { + t.join(); + } + ASSERT_EQ(sleep_count.load(), 1); + ASSERT_GE(wait_count.load(), 0); +} + +TEST_F(DBTest, MixedSlowdownOptionsStop) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; + std::atomic thread_num(0); + + std::function write_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = false; + ASSERT_OK(dbfull()->Put(wo, key, "bar")); + }; + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + ASSERT_NOK(dbfull()->Put(wo, key, "bar")); + }; + std::function wakeup_writer = [&]() { + dbfull()->mutex_.Lock(); + dbfull()->bg_cv_.SignalAll(); + dbfull()->mutex_.Unlock(); + }; + // Use a small number to ensure a large delay that is still effective + // when we do Put + // TODO(myabandeh): this is time dependent and could potentially make + // the test flaky + auto token = dbfull()->TEST_write_controler().GetStopToken(); + std::atomic wait_count(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Wait", + [&](void* /*arg*/) { + wait_count.fetch_add(1); + if (threads.empty()) { + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_slowdown_func); + } + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_no_slowdown_func); + } + // Sleep for 2s to allow the threads to insert themselves into the + // write queue + env_->SleepForMicroseconds(3000000ULL); + } + token.reset(); + threads.emplace_back(wakeup_writer); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = false; + wo.disableWAL = false; + wo.no_slowdown = false; + dbfull()->Put(wo, "foo", "bar"); + // We need the 2nd write to trigger delay. This is because delay is + // estimated based on the last write size which is 0 for the first write. + ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); + token.reset(); + + for (auto& t : threads) { + t.join(); + } + ASSERT_GE(wait_count.load(), 1); + + wo.no_slowdown = true; + ASSERT_OK(dbfull()->Put(wo, "foo3", "bar")); +} #ifndef ROCKSDB_LITE TEST_F(DBTest, LevelLimitReopen) { @@ -2149,6 +2339,9 @@ } // namespace +#ifndef TRAVIS +// Disable this test temporarily on Travis as it fails intermittently. +// Github issue: #4151 TEST_F(DBTest, GroupCommitTest) { do { Options options = CurrentOptions(); @@ -2195,6 +2388,7 @@ ASSERT_GT(hist_data.average, 0.0); } while (ChangeOptions(kSkipNoSeekToLast)); } +#endif // TRAVIS namespace { typedef std::map KVMap; @@ -4327,7 +4521,7 @@ // Clean up memtable and L0. Block compaction threads. If continue to write // and flush memtables. 
We should see put stop after 8 memtable flushes // since level0_stop_writes_trigger = 8 - dbfull()->TEST_FlushMemTable(true); + dbfull()->TEST_FlushMemTable(true, true); dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); // Block compaction test::SleepingBackgroundTask sleeping_task_low; @@ -4340,7 +4534,7 @@ WriteOptions wo; while (count < 64) { ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - dbfull()->TEST_FlushMemTable(true); + dbfull()->TEST_FlushMemTable(true, true); count++; if (dbfull()->TEST_write_controler().IsStopped()) { sleeping_task_low.WakeUp(); @@ -4368,7 +4562,7 @@ count = 0; while (count < 64) { ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - dbfull()->TEST_FlushMemTable(true); + dbfull()->TEST_FlushMemTable(true, true); count++; if (dbfull()->TEST_write_controler().IsStopped()) { sleeping_task_low.WakeUp(); @@ -5508,7 +5702,7 @@ for (int i = 0; i < 72; i++) { Put(Key(i), std::string(5000, 'x')); if (i % 10 == 0) { - Flush(); + dbfull()->TEST_FlushMemTable(true, true); } } dbfull()->TEST_WaitForCompact(); @@ -5518,7 +5712,7 @@ for (int i = 0; i < 72; i++) { Put(Key(i), std::string(5000, 'x')); if (i % 10 == 0) { - Flush(); + dbfull()->TEST_FlushMemTable(true, true); } } dbfull()->TEST_WaitForCompact(); @@ -5537,7 +5731,7 @@ Put(Key(i), std::string(5000, 'x')); Put(Key(100 - i), std::string(5000, 'x')); // Flush the file. File size is around 30KB. - Flush(); + dbfull()->TEST_FlushMemTable(true, true); } ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed)); @@ -5572,7 +5766,7 @@ Put(Key(10 + i), std::string(5000, 'x')); Put(Key(90 - i), std::string(5000, 'x')); // Flush the file. File size is around 30KB. - Flush(); + dbfull()->TEST_FlushMemTable(true, true); } // Wake up sleep task to enable compaction to run and waits @@ -5593,7 +5787,7 @@ Put(Key(20 + i), std::string(5000, 'x')); Put(Key(80 - i), std::string(5000, 'x')); // Flush the file. File size is around 30KB. - Flush(); + dbfull()->TEST_FlushMemTable(true, true); } // Wake up sleep task to enable compaction to run and waits // for it to go to sleep state again to make sure one compaction diff -Nru rocksdb-5.15.10/db/db_test_util.cc rocksdb-5.17.2/db/db_test_util.cc --- rocksdb-5.15.10/db/db_test_util.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_test_util.cc 2018-11-12 19:57:32.000000000 +0000 @@ -449,15 +449,16 @@ options.prefix_extractor.reset(NewNoopTransform()); break; } - case kBlockBasedTableWithPartitionedIndexFormat3: { - table_options.format_version = 3; - // Format 3 changes the binary index format. Since partitioned index is a + case kBlockBasedTableWithPartitionedIndexFormat4: { + table_options.format_version = 4; + // Format 4 changes the binary index format. Since partitioned index is a // super-set of simple indexes, we are also using kTwoLevelIndexSearch to // test this format. table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; - // The top-level index in partition filters are also affected by format 3. + // The top-level index in partition filters are also affected by format 4. 
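
The renamed kBlockBasedTableWithPartitionedIndexFormat4 case bundles the options that exercise the new index layout: format_version 4 (delta-encoded index values), a two-level index, partitioned filters, and a non-default index_block_restart_interval. A hedged sketch of building those options directly, assembled from the exact fields this case sets:

    #include "rocksdb/filter_policy.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    rocksdb::Options MakeFormat4Options() {
      rocksdb::BlockBasedTableOptions table_options;
      table_options.format_version = 4;  // new binary index format
      table_options.index_type =
          rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
      table_options.partition_filters = true;
      table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
      table_options.index_block_restart_interval = 8;

      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }
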
table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); table_options.partition_filters = true; + table_options.index_block_restart_interval = 8; break; } case kBlockBasedTableWithIndexRestartInterval: { diff -Nru rocksdb-5.15.10/db/db_test_util.h rocksdb-5.17.2/db/db_test_util.h --- rocksdb-5.15.10/db/db_test_util.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_test_util.h 2018-11-12 19:57:32.000000000 +0000 @@ -109,8 +109,6 @@ // These will be used only if filter_policy is set bool partition_filters = false; uint64_t metadata_block_size = 1024; - BlockBasedTableOptions::IndexType index_type = - BlockBasedTableOptions::IndexType::kBinarySearch; // Used as a bit mask of individual enums in which to skip an XF test point int skip_policy = 0; @@ -317,6 +315,9 @@ } } uint64_t GetFileSize() override { return base_->GetFileSize(); } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } private: SpecialEnv* env_; @@ -370,6 +371,9 @@ bool IsSyncThreadSafe() const override { return env_->is_wal_sync_thread_safe_.load(); } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } private: SpecialEnv* env_; @@ -575,7 +579,7 @@ std::atomic is_wal_sync_thread_safe_{true}; - std::atomic compaction_readahead_size_; + std::atomic compaction_readahead_size_{}; }; class MockTimeEnv : public EnvWrapper { @@ -698,7 +702,7 @@ kLevelSubcompactions, kBlockBasedTableWithIndexRestartInterval, kBlockBasedTableWithPartitionedIndex, - kBlockBasedTableWithPartitionedIndexFormat3, + kBlockBasedTableWithPartitionedIndexFormat4, kPartitionedFilterWithNewTableReaderForCompactions, kUniversalSubcompactions, // This must be the last line diff -Nru rocksdb-5.15.10/db/db_universal_compaction_test.cc rocksdb-5.17.2/db/db_universal_compaction_test.cc --- rocksdb-5.15.10/db/db_universal_compaction_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_universal_compaction_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -1824,7 +1824,7 @@ port::Thread compact_files_thread([&]() { ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), default_cfh, - {first_sst_filename}, num_levels_ - 1)); + {first_sst_filename}, num_levels_ - 1)); }); TEST_SYNC_POINT( diff -Nru rocksdb-5.15.10/db/db_wal_test.cc rocksdb-5.17.2/db/db_wal_test.cc --- rocksdb-5.15.10/db/db_wal_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/db_wal_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -18,6 +18,15 @@ class DBWALTest : public DBTestBase { public: DBWALTest() : DBTestBase("/db_wal_test") {} + +#if defined(ROCKSDB_PLATFORM_POSIX) + uint64_t GetAllocatedFileSize(std::string file_name) { + struct stat sbuf; + int err = stat(file_name.c_str(), &sbuf); + assert(err == 0); + return sbuf.st_blocks * 512; + } +#endif }; // A SpecialEnv enriched to give more insight about deleted files @@ -815,7 +824,7 @@ unique_ptr file; ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options)); unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options)); + new WritableFileWriter(std::move(file), fname, env_options)); current_log_writer.reset( new log::Writer(std::move(file_writer), current_log_number, db_options.recycle_log_file_num > 0)); @@ -1329,6 +1338,99 @@ } } +// Tests that total log size is recovered if we set +// avoid_flush_during_recovery=true. +// Flush should trigger if max_total_wal_size is reached. 
+TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) { + class TestFlushListener : public EventListener { + public: + std::atomic count{0}; + + TestFlushListener() = default; + + void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override { + count++; + assert(FlushReason::kWriteBufferManager == flush_job_info.flush_reason); + } + }; + std::shared_ptr test_listener = + std::make_shared(); + + constexpr size_t kKB = 1024; + constexpr size_t kMB = 1024 * 1024; + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + options.max_total_wal_size = 1 * kMB; + options.listeners.push_back(test_listener); + // Have to open DB in multi-CF mode to trigger flush when + // max_total_wal_size is reached. + CreateAndReopenWithCF({"one"}, options); + // Write some keys and we will end up with one log file which is slightly + // smaller than 1MB. + std::string value_100k(100 * kKB, 'v'); + std::string value_300k(300 * kKB, 'v'); + ASSERT_OK(Put(0, "foo", "v1")); + for (int i = 0; i < 9; i++) { + ASSERT_OK(Put(1, "key" + ToString(i), value_100k)); + } + // Get log files before reopen. + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + uint64_t log_size_before = log_files_before[0]->SizeFileBytes(); + ASSERT_GT(log_size_before, 900 * kKB); + ASSERT_LT(log_size_before, 1 * kMB); + ReopenWithColumnFamilies({"default", "one"}, options); + // Write one more value to make log larger than 1MB. + ASSERT_OK(Put(1, "bar", value_300k)); + // Get log files again. A new log file will be opened. + VectorLogPtr log_files_after_reopen; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after_reopen)); + ASSERT_EQ(2, log_files_after_reopen.size()); + ASSERT_EQ(log_files_before[0]->LogNumber(), + log_files_after_reopen[0]->LogNumber()); + ASSERT_GT(log_files_after_reopen[0]->SizeFileBytes() + + log_files_after_reopen[1]->SizeFileBytes(), + 1 * kMB); + // Write one more key to trigger flush. + ASSERT_OK(Put(0, "foo", "v2")); + dbfull()->TEST_WaitForFlushMemTable(); + // Flushed two column families. + ASSERT_EQ(2, test_listener->count.load()); +} + +#if defined(ROCKSDB_PLATFORM_POSIX) +#if defined(ROCKSDB_FALLOCATE_PRESENT) +// Tests that we will truncate the preallocated space of the last log from +// previous. +TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + // The log file has preallocated space. + ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + Reopen(options); + VectorLogPtr log_files_after; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after)); + ASSERT_EQ(1, log_files_after.size()); + ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB); + // The preallocated space should be truncated. 
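
The two WAL tests above distinguish a log file's logical length from its allocated size: GetSortedWalFiles() reports the logical bytes written, while the GetAllocatedFileSize() helper uses stat() and st_blocks (always counted in 512-byte units) to observe the preallocated space that recovery is expected to truncate away. A POSIX-only sketch of that measurement, mirroring the helper:

    #include <sys/stat.h>

    #include <cstdint>
    #include <string>

    // Returns false if the file cannot be stat'ed. *allocated_size includes
    // preallocated blocks; *logical_size is the length readers observe.
    bool GetFileSizes(const std::string& file_name, uint64_t* logical_size,
                      uint64_t* allocated_size) {
      struct stat sbuf;
      if (stat(file_name.c_str(), &sbuf) != 0) {
        return false;
      }
      *logical_size = static_cast<uint64_t>(sbuf.st_size);
      *allocated_size = static_cast<uint64_t>(sbuf.st_blocks) * 512;
      return true;
    }
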
+ ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); +} +#endif // ROCKSDB_FALLOCATE_PRESENT +#endif // ROCKSDB_PLATFORM_POSIX + #endif // ROCKSDB_LITE TEST_F(DBWALTest, WalTermTest) { diff -Nru rocksdb-5.15.10/db/error_handler.cc rocksdb-5.17.2/db/error_handler.cc --- rocksdb-5.15.10/db/error_handler.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/error_handler.cc 2018-11-12 19:57:32.000000000 +0000 @@ -4,7 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). // #include "db/error_handler.h" +#include "db/db_impl.h" #include "db/event_helpers.h" +#include "util/sst_file_manager_impl.h" namespace rocksdb { @@ -33,7 +35,7 @@ // Errors during BG flush {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, Status::SubCode::kNoSpace, true), - Status::Severity::kSoftError}, + Status::Severity::kHardError}, {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, Status::SubCode::kNoSpace, false), Status::Severity::kNoError}, @@ -44,11 +46,11 @@ {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kIOError, Status::SubCode::kNoSpace, true), - Status::Severity::kFatalError}, + Status::Severity::kHardError}, {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kIOError, Status::SubCode::kNoSpace, false), - Status::Severity::kFatalError}, + Status::Severity::kHardError}, }; std::map, Status::Severity> @@ -118,6 +120,45 @@ Status::Severity::kFatalError}, }; +void ErrorHandler::CancelErrorRecovery() { +#ifndef ROCKSDB_LITE + db_mutex_->AssertHeld(); + + // We'll release the lock before calling sfm, so make sure no new + // recovery gets scheduled at that point + auto_recovery_ = false; + SstFileManagerImpl* sfm = reinterpret_cast( + db_options_.sst_file_manager.get()); + if (sfm) { + // This may or may not cancel a pending recovery + db_mutex_->Unlock(); + bool cancelled = sfm->CancelErrorRecovery(this); + db_mutex_->Lock(); + if (cancelled) { + recovery_in_prog_ = false; + } + } +#endif +} + +// This is the main function for looking at an error during a background +// operation and deciding the severity, and error recovery strategy. The high +// level algorithm is as follows - +// 1. Classify the severity of the error based on the ErrorSeverityMap, +// DefaultErrorSeverityMap and DefaultReasonMap defined earlier +// 2. Call a Status code specific override function to adjust the severity +// if needed. The reason for this is our ability to recover may depend on +// the exact options enabled in DBOptions +// 3. Determine if auto recovery is possible. A listener notification callback +// is called, which can disable the auto recovery even if we decide its +// feasible +// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control +// the actual recovery. If no sst file manager is specified in DBOptions, +// a default one is allocated during DB::Open(), so there will always be +// one. +// This can also get called as part of a recovery operation. In that case, we +// also track the error seperately in recovery_error_ so we can tell in the +// end whether recovery succeeded or not Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reason) { db_mutex_->AssertHeld(); @@ -125,6 +166,12 @@ return Status::OK(); } + // Check if recovery is currently in progress. 
If it is, we will save this + // error so we can check it at the end to see if recovery succeeded or not + if (recovery_in_prog_ && recovery_error_.ok()) { + recovery_error_ = bg_err; + } + bool paranoid = db_options_.paranoid_checks; Status::Severity sev = Status::Severity::kFatalError; Status new_bg_err; @@ -156,15 +203,143 @@ } new_bg_err = Status(bg_err, sev); + + bool auto_recovery = auto_recovery_; + if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) { + auto_recovery = false; + ; + } + + // Allow some error specific overrides + if (new_bg_err == Status::NoSpace()) { + new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery); + } + if (!new_bg_err.ok()) { Status s = new_bg_err; - EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s, db_mutex_); + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s, + db_mutex_, &auto_recovery); if (!s.ok() && (s.severity() > bg_error_.severity())) { bg_error_ = s; + } else { + // This error is less severe than previously encountered error. Don't + // take any further action + return bg_error_; } } + if (auto_recovery) { + recovery_in_prog_ = true; + + // Kick-off error specific recovery + if (bg_error_ == Status::NoSpace()) { + RecoverFromNoSpace(); + } + } return bg_error_; } +Status ErrorHandler::OverrideNoSpaceError(Status bg_error, + bool* auto_recovery) { +#ifndef ROCKSDB_LITE + if (bg_error.severity() >= Status::Severity::kFatalError) { + return bg_error; + } + + if (db_options_.sst_file_manager.get() == nullptr) { + // We rely on SFM to poll for enough disk space and recover + *auto_recovery = false; + return bg_error; + } + + if (db_options_.allow_2pc && + (bg_error.severity() <= Status::Severity::kSoftError)) { + // Don't know how to recover, as the contents of the current WAL file may + // be inconsistent, and it may be needed for 2PC. 
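// Illustrative aside (editor's sketch, not part of the diff): when automatic
// recovery is disabled or not possible, the background error keeps the DB in a
// stopped state until the application intervenes. A caller-side reaction could
// look roughly like this; the function name and the space-freeing step are
// hypothetical, while DB::Resume() is the manual counterpart of the automatic
// path exercised by the tests later in this patch.
#include "rocksdb/db.h"

rocksdb::Status RetryAfterFreeingSpace(rocksdb::DB* db) {
  // ... application frees disk space, fixes permissions, etc. ...
  // Resume() re-runs the pending flush/compaction work and clears the
  // background error on success; on failure the DB stays stopped.
  return db->Resume();
}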
If 2PC is not enabled, + // we can just flush the memtable and discard the log + *auto_recovery = false; + return Status(bg_error, Status::Severity::kFatalError); + } + + { + uint64_t free_space; + if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path, + &free_space) == Status::NotSupported()) { + *auto_recovery = false; + } + } + + return bg_error; +#else + (void)auto_recovery; + return Status(bg_error, Status::Severity::kFatalError); +#endif +} + +void ErrorHandler::RecoverFromNoSpace() { +#ifndef ROCKSDB_LITE + SstFileManagerImpl* sfm = + reinterpret_cast(db_options_.sst_file_manager.get()); + + // Inform SFM of the error, so it can kick-off the recovery + if (sfm) { + sfm->StartErrorRecovery(this, bg_error_); + } +#endif +} + +Status ErrorHandler::ClearBGError() { +#ifndef ROCKSDB_LITE + db_mutex_->AssertHeld(); + + // Signal that recovery succeeded + if (recovery_error_.ok()) { + Status old_bg_error = bg_error_; + bg_error_ = Status::OK(); + recovery_in_prog_ = false; + EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners, + old_bg_error, db_mutex_); + } + return recovery_error_; +#else + return bg_error_; +#endif +} + +Status ErrorHandler::RecoverFromBGError(bool is_manual) { +#ifndef ROCKSDB_LITE + InstrumentedMutexLock l(db_mutex_); + if (is_manual) { + // If its a manual recovery and there's a background recovery in progress + // return busy status + if (recovery_in_prog_) { + return Status::Busy(); + } + recovery_in_prog_ = true; + } + + if (bg_error_.severity() == Status::Severity::kSoftError) { + // Simply clear the background error and return + recovery_error_ = Status::OK(); + return ClearBGError(); + } + + // Reset recovery_error_. We will use this to record any errors that happen + // during the recovery process. While recovering, the only operations that + // can generate background errors should be the flush operations + recovery_error_ = Status::OK(); + Status s = db_->ResumeImpl(); + // For manual recover, shutdown, and fatal error cases, set + // recovery_in_prog_ to false. 
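// Illustrative aside (editor's sketch, not part of the diff): OverrideNoSpaceError
// above turns auto recovery off when the Env cannot report free space, since the
// SstFileManager then has no way to poll for space coming back. Probing that
// capability in isolation (the helper name is hypothetical):
#include <cstdint>
#include <string>
#include "rocksdb/env.h"

bool CanPollFreeSpace(rocksdb::Env* env, const std::string& path) {
  uint64_t free_bytes = 0;
  rocksdb::Status s = env->GetFreeSpace(path, &free_bytes);
  return !s.IsNotSupported();  // NotSupported => manual recovery only
}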
For automatic background recovery, leave it + // as is regardless of success or failure as it will be retried + if (is_manual || s.IsShutdownInProgress() || + bg_error_.severity() >= Status::Severity::kFatalError) { + recovery_in_prog_ = false; + } + return s; +#else + (void)is_manual; + return bg_error_; +#endif +} } diff -Nru rocksdb-5.15.10/db/error_handler.h rocksdb-5.17.2/db/error_handler.h --- rocksdb-5.15.10/db/error_handler.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/error_handler.h 2018-11-12 19:57:32.000000000 +0000 @@ -11,42 +11,65 @@ namespace rocksdb { +class DBImpl; + class ErrorHandler { public: - ErrorHandler(const ImmutableDBOptions& db_options, - InstrumentedMutex* db_mutex) - : db_options_(db_options), - bg_error_(Status::OK()), - db_mutex_(db_mutex) - {} - ~ErrorHandler() {} - - Status::Severity GetErrorSeverity(BackgroundErrorReason reason, - Status::Code code, Status::SubCode subcode); - - Status SetBGError(const Status& bg_err, BackgroundErrorReason reason); - - Status GetBGError() - { - return bg_error_; + ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options, + InstrumentedMutex* db_mutex) + : db_(db), + db_options_(db_options), + bg_error_(Status::OK()), + recovery_error_(Status::OK()), + db_mutex_(db_mutex), + auto_recovery_(false), + recovery_in_prog_(false) {} + ~ErrorHandler() {} + + void EnableAutoRecovery() { auto_recovery_ = true; } + + Status::Severity GetErrorSeverity(BackgroundErrorReason reason, + Status::Code code, + Status::SubCode subcode); + + Status SetBGError(const Status& bg_err, BackgroundErrorReason reason); + + Status GetBGError() { return bg_error_; } + + Status GetRecoveryError() { return recovery_error_; } + + Status ClearBGError(); + + bool IsDBStopped() { + return !bg_error_.ok() && + bg_error_.severity() >= Status::Severity::kHardError; } - void ClearBGError() { - bg_error_ = Status::OK(); + bool IsBGWorkStopped() { + return !bg_error_.ok() && + (bg_error_.severity() >= Status::Severity::kHardError || + !auto_recovery_); } - bool IsDBStopped() { - return !bg_error_.ok(); - } + bool IsRecoveryInProgress() { return recovery_in_prog_; } - bool IsBGWorkStopped() { - return !bg_error_.ok(); - } + Status RecoverFromBGError(bool is_manual = false); + void CancelErrorRecovery(); - private: + private: + DBImpl* db_; const ImmutableDBOptions& db_options_; Status bg_error_; + // A seperate Status variable used to record any errors during the + // recovery process from hard errors + Status recovery_error_; InstrumentedMutex* db_mutex_; + // A flag indicating whether automatic recovery from errors is enabled + bool auto_recovery_; + bool recovery_in_prog_; + + Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery); + void RecoverFromNoSpace(); }; } diff -Nru rocksdb-5.15.10/db/error_handler_test.cc rocksdb-5.17.2/db/error_handler_test.cc --- rocksdb-5.15.10/db/error_handler_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/error_handler_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -6,9 +6,12 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#ifndef ROCKSDB_LITE + #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" +#include "rocksdb/sst_file_manager.h" #include "util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) #include "util/sync_point.h" @@ -33,36 +36,139 @@ bool trig_io_error; }; +class ErrorHandlerListener : public EventListener { + public: + ErrorHandlerListener() + : mutex_(), + cv_(&mutex_), + no_auto_recovery_(false), + recovery_complete_(false), + file_creation_started_(false), + override_bg_error_(false), + file_count_(0), + fault_env_(nullptr) {} + + void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& /*ti*/) override { + InstrumentedMutexLock l(&mutex_); + file_creation_started_ = true; + if (file_count_ > 0) { + if (--file_count_ == 0) { + fault_env_->SetFilesystemActive(false, file_creation_error_); + file_creation_error_ = Status::OK(); + } + } + cv_.SignalAll(); + } + + void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/, + Status /*bg_error*/, + bool* auto_recovery) override { + if (*auto_recovery && no_auto_recovery_) { + *auto_recovery = false; + } + } + + void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override { + InstrumentedMutexLock l(&mutex_); + recovery_complete_ = true; + cv_.SignalAll(); + } + + bool WaitForRecovery(uint64_t /*abs_time_us*/) { + InstrumentedMutexLock l(&mutex_); + while (!recovery_complete_) { + cv_.Wait(/*abs_time_us*/); + } + if (recovery_complete_) { + recovery_complete_ = false; + return true; + } + return false; + } + + void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) { + InstrumentedMutexLock l(&mutex_); + while (!file_creation_started_) { + cv_.Wait(/*abs_time_us*/); + } + file_creation_started_ = false; + } + + void OnBackgroundError(BackgroundErrorReason /*reason*/, + Status* bg_error) override { + if (override_bg_error_) { + *bg_error = bg_error_; + override_bg_error_ = false; + } + } + + void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; } + + void OverrideBGError(Status bg_err) { + bg_error_ = bg_err; + override_bg_error_ = true; + } + + void InjectFileCreationError(FaultInjectionTestEnv* env, int file_count, + Status s) { + fault_env_ = env; + file_count_ = file_count; + file_creation_error_ = s; + } + + private: + InstrumentedMutex mutex_; + InstrumentedCondVar cv_; + bool no_auto_recovery_; + bool recovery_complete_; + bool file_creation_started_; + bool override_bg_error_; + int file_count_; + Status file_creation_error_; + Status bg_error_; + FaultInjectionTestEnv* fault_env_; +}; + TEST_F(DBErrorHandlingTest, FLushWriteError) { std::unique_ptr fault_env( new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); Options options = GetDefaultOptions(); options.create_if_missing = true; options.env = fault_env.get(); + options.listeners.emplace_back(listener); Status s; + + listener->EnableAutoRecovery(false); DestroyAndReopen(options); - Put(Key(0), "va;"); + Put(Key(0), "val"); SyncPoint::GetInstance()->SetCallBack( "FlushJob::Start", [&](void *) { fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kSoftError); + ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); fault_env->SetFilesystemActive(true); s = dbfull()->Resume(); ASSERT_EQ(s, Status::OK()); + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); 
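// Illustrative aside (editor's sketch, not part of the diff): outside the test
// harness, an application listener can use the two new callbacks the same way,
// for example to veto automatic recovery or to log its completion. The class
// name and the policy shown are hypothetical; the callback signatures follow
// the ones used by the test listener above.
#include <cstdio>
#include "rocksdb/listener.h"

class MyErrorListener : public rocksdb::EventListener {
 public:
  void OnErrorRecoveryBegin(rocksdb::BackgroundErrorReason /*reason*/,
                            rocksdb::Status bg_error,
                            bool* auto_recovery) override {
    // Leaving *auto_recovery false keeps the DB stopped until the
    // application calls DB::Resume() itself.
    if (bg_error.IsNoSpace()) {
      *auto_recovery = false;
    }
  }
  void OnErrorRecoveryCompleted(rocksdb::Status old_bg_error) override {
    fprintf(stderr, "recovered from: %s\n", old_bg_error.ToString().c_str());
  }
};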
Destroy(options); } TEST_F(DBErrorHandlingTest, CompactionWriteError) { std::unique_ptr fault_env( new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); Options options = GetDefaultOptions(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); options.env = fault_env.get(); Status s; DestroyAndReopen(options); @@ -72,6 +178,10 @@ s = Flush(); ASSERT_EQ(s, Status::OK()); + listener->OverrideBGError( + Status(Status::NoSpace(), Status::Severity::kHardError) + ); + listener->EnableAutoRecovery(false); rocksdb::SyncPoint::GetInstance()->LoadDependency( {{"FlushMemTableFinished", "BackgroundCallCompaction:0"}}); rocksdb::SyncPoint::GetInstance()->SetCallBack( @@ -85,7 +195,7 @@ ASSERT_EQ(s, Status::OK()); s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kSoftError); + ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError); fault_env->SetFilesystemActive(true); s = dbfull()->Resume(); @@ -129,6 +239,439 @@ Destroy(options); } +TEST_F(DBErrorHandlingTest, AutoRecoverFlushError) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + Put(Key(0), "val"); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_env->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + s = Put(Key(1), "val"); + ASSERT_EQ(s, Status::OK()); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Destroy(options); +} + +TEST_F(DBErrorHandlingTest, FailRecoverFlushError) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + Put(Key(0), "val"); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError); + // We should be able to shutdown the database while auto recovery is going + // on in the background + Close(); + DestroyDB(dbname_, options); +} + +TEST_F(DBErrorHandlingTest, WALWriteError) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + Random rnd(301); + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + { + WriteBatch batch; + + for (auto i = 0; i<100; ++i) { + 
batch.Put(Key(i), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + }; + + { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i<199; ++i) { + batch.Put(Key(i), RandomString(&rnd, 1024)); + } + + SyncPoint::GetInstance()->SetCallBack("WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(s, s.NoSpace()); + } + SyncPoint::GetInstance()->DisableProcessing(); + fault_env->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + for (auto i=0; i<199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Reopen(options); + for (auto i=0; i<199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +TEST_F(DBErrorHandlingTest, MultiCFWALWriteError) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + Random rnd(301); + + listener->EnableAutoRecovery(); + CreateAndReopenWithCF({"one", "two", "three"}, options); + + { + WriteBatch batch; + + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 100; ++j) { + batch.Put(handles_[i], Key(j), RandomString(&rnd, 1024)); + } + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + }; + + { + WriteBatch batch; + int write_error = 0; + + // Write to one CF + for (auto i = 100; i < 199; ++i) { + batch.Put(handles_[2], Key(i), RandomString(&rnd, 1024)); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_env->SetFilesystemActive(false, + Status::NoSpace("Out of space")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(s, s.NoSpace()); + } + SyncPoint::GetInstance()->DisableProcessing(); + fault_env->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + for (auto i = 1; i < 4; ++i) { + // Every CF should have been flushed + ASSERT_EQ(NumTableFilesAtLevel(0, i), 1); + } + + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 199; ++j) { + if (j < 100) { + ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); + } + } + } + ReopenWithColumnFamilies({"default", "one", "two", "three"}, options); + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 199; ++j) { + if (j < 100) { + ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); + } + } + } + Close(); +} + +TEST_F(DBErrorHandlingTest, MultiDBCompactionError) { + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); + std::vector> fault_env; + std::vector options; + std::vector> listener; + std::vector db; + std::shared_ptr sfm(NewSstFileManager(def_env)); + int kNumDbInstances = 
3; + Random rnd(301); + + for (auto i = 0; i < kNumDbInstances; ++i) { + listener.emplace_back(new ErrorHandlerListener()); + options.emplace_back(GetDefaultOptions()); + fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default())); + options[i].create_if_missing = true; + options[i].level0_file_num_compaction_trigger = 2; + options[i].writable_file_max_buffer_size = 32768; + options[i].env = fault_env[i].get(); + options[i].listeners.emplace_back(listener[i]); + options[i].sst_file_manager = sfm; + DB* dbptr; + char buf[16]; + + listener[i]->EnableAutoRecovery(); + // Setup for returning error for the 3rd SST, which would be level 1 + listener[i]->InjectFileCreationError(fault_env[i].get(), 3, + Status::NoSpace("Out of space")); + snprintf(buf, sizeof(buf), "_%d", i); + DestroyDB(dbname_ + std::string(buf), options[i]); + ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), + Status::OK()); + db.emplace_back(dbptr); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + for (auto j = 0; j <= 100; ++j) { + batch.Put(Key(j), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + } + + def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + // Write to one CF + for (auto j = 100; j < 199; ++j) { + batch.Put(Key(j), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + Status s = static_cast(db[i])->TEST_WaitForCompact(true); + ASSERT_EQ(s.severity(), Status::Severity::kSoftError); + fault_env[i]->SetFilesystemActive(true); + } + + def_env->SetFilesystemActive(true); + for (auto i = 0; i < kNumDbInstances; ++i) { + std::string prop; + ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); + EXPECT_TRUE(db[i]->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(0), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 0); + EXPECT_TRUE(db[i]->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(1), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 1); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + char buf[16]; + snprintf(buf, sizeof(buf), "_%d", i); + delete db[i]; + fault_env[i]->SetFilesystemActive(true); + if (getenv("KEEP_DB")) { + printf("DB is still at %s%s\n", dbname_.c_str(), buf); + } else { + Status s = DestroyDB(dbname_ + std::string(buf), options[i]); + } + } + options.clear(); + sfm.reset(); + delete def_env; +} + +TEST_F(DBErrorHandlingTest, MultiDBVariousErrors) { + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); + std::vector> fault_env; + std::vector options; + std::vector> listener; + std::vector db; + std::shared_ptr sfm(NewSstFileManager(def_env)); + int kNumDbInstances = 3; + Random rnd(301); + + for (auto i = 0; i < kNumDbInstances; ++i) { + listener.emplace_back(new ErrorHandlerListener()); + options.emplace_back(GetDefaultOptions()); + fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default())); + options[i].create_if_missing = true; + options[i].level0_file_num_compaction_trigger = 2; + options[i].writable_file_max_buffer_size = 32768; + options[i].env = fault_env[i].get(); + options[i].listeners.emplace_back(listener[i]); + options[i].sst_file_manager = sfm; + DB* dbptr; + char 
buf[16]; + + listener[i]->EnableAutoRecovery(); + switch (i) { + case 0: + // Setup for returning error for the 3rd SST, which would be level 1 + listener[i]->InjectFileCreationError(fault_env[i].get(), 3, + Status::NoSpace("Out of space")); + break; + case 1: + // Setup for returning error after the 1st SST, which would result + // in a hard error + listener[i]->InjectFileCreationError(fault_env[i].get(), 2, + Status::NoSpace("Out of space")); + break; + default: + break; + } + snprintf(buf, sizeof(buf), "_%d", i); + DestroyDB(dbname_ + std::string(buf), options[i]); + ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), + Status::OK()); + db.emplace_back(dbptr); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + for (auto j = 0; j <= 100; ++j) { + batch.Put(Key(j), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + } + + def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + // Write to one CF + for (auto j = 100; j < 199; ++j) { + batch.Put(Key(j), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + if (i != 1) { + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + } else { + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::NoSpace()); + } + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + Status s = static_cast(db[i])->TEST_WaitForCompact(true); + switch (i) { + case 0: + ASSERT_EQ(s.severity(), Status::Severity::kSoftError); + break; + case 1: + ASSERT_EQ(s.severity(), Status::Severity::kHardError); + break; + case 2: + ASSERT_EQ(s, Status::OK()); + break; + } + fault_env[i]->SetFilesystemActive(true); + } + + def_env->SetFilesystemActive(true); + for (auto i = 0; i < kNumDbInstances; ++i) { + std::string prop; + if (i < 2) { + ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); + } + if (i == 1) { + ASSERT_EQ(static_cast(db[i])->TEST_WaitForCompact(true), + Status::OK()); + } + EXPECT_TRUE(db[i]->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(0), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 0); + EXPECT_TRUE(db[i]->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(1), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 1); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + char buf[16]; + snprintf(buf, sizeof(buf), "_%d", i); + fault_env[i]->SetFilesystemActive(true); + delete db[i]; + if (getenv("KEEP_DB")) { + printf("DB is still at %s%s\n", dbname_.c_str(), buf); + } else { + DestroyDB(dbname_ + std::string(buf), options[i]); + } + } + options.clear(); + delete def_env; +} + } // namespace rocksdb int main(int argc, char** argv) { @@ -136,3 +679,13 @@ ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff -Nru rocksdb-5.15.10/db/event_helpers.cc rocksdb-5.17.2/db/event_helpers.cc --- rocksdb-5.15.10/db/event_helpers.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/event_helpers.cc 2018-11-12 19:57:32.000000000 +0000 @@ -40,8 +40,8 @@ void EventHelpers::NotifyOnBackgroundError( const std::vector>& listeners, - BackgroundErrorReason reason, Status* bg_error, - InstrumentedMutex* db_mutex) { + 
BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex, + bool* auto_recovery) { #ifndef ROCKSDB_LITE if (listeners.size() == 0U) { return; @@ -51,6 +51,9 @@ db_mutex->Unlock(); for (auto& listener : listeners) { listener->OnBackgroundError(reason, bg_error); + if (*auto_recovery) { + listener->OnErrorRecoveryBegin(reason, *bg_error, auto_recovery); + } } db_mutex->Lock(); #else @@ -58,6 +61,7 @@ (void)reason; (void)bg_error; (void)db_mutex; + (void)auto_recovery; #endif // ROCKSDB_LITE } @@ -167,4 +171,25 @@ #endif // !ROCKSDB_LITE } +void EventHelpers::NotifyOnErrorRecoveryCompleted( + const std::vector>& listeners, + Status old_bg_error, InstrumentedMutex* db_mutex) { +#ifndef ROCKSDB_LITE + if (listeners.size() == 0U) { + return; + } + db_mutex->AssertHeld(); + // release lock while notifying events + db_mutex->Unlock(); + for (auto& listener : listeners) { + listener->OnErrorRecoveryCompleted(old_bg_error); + } + db_mutex->Lock(); +#else + (void)listeners; + (void)old_bg_error; + (void)db_mutex; +#endif // ROCKSDB_LITE +} + } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/event_helpers.h rocksdb-5.17.2/db/event_helpers.h --- rocksdb-5.15.10/db/event_helpers.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/event_helpers.h 2018-11-12 19:57:32.000000000 +0000 @@ -28,7 +28,7 @@ static void NotifyOnBackgroundError( const std::vector>& listeners, BackgroundErrorReason reason, Status* bg_error, - InstrumentedMutex* db_mutex); + InstrumentedMutex* db_mutex, bool* auto_recovery); static void LogAndNotifyTableFileCreationFinished( EventLogger* event_logger, const std::vector>& listeners, @@ -41,6 +41,9 @@ uint64_t file_number, const std::string& file_path, const Status& status, const std::string& db_name, const std::vector>& listeners); + static void NotifyOnErrorRecoveryCompleted( + const std::vector>& listeners, + Status bg_error, InstrumentedMutex* db_mutex); private: static void LogAndNotifyTableFileCreation( diff -Nru rocksdb-5.15.10/db/external_sst_file_ingestion_job.cc rocksdb-5.17.2/db/external_sst_file_ingestion_job.cc --- rocksdb-5.15.10/db/external_sst_file_ingestion_job.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/external_sst_file_ingestion_job.cc 2018-11-12 19:57:32.000000000 +0000 @@ -29,7 +29,8 @@ namespace rocksdb { Status ExternalSstFileIngestionJob::Prepare( - const std::vector& external_files_paths, SuperVersion* sv) { + const std::vector& external_files_paths, + uint64_t next_file_number, SuperVersion* sv) { Status status; // Read the information of files we are ingesting @@ -90,7 +91,7 @@ // Copy/Move external files into DB for (IngestedFileInfo& f : files_to_ingest_) { - f.fd = FileDescriptor(versions_->NewFileNumber(), 0, f.file_size); + f.fd = FileDescriptor(next_file_number++, 0, f.file_size); const std::string path_outside_db = f.external_file_path; const std::string path_inside_db = @@ -343,7 +344,7 @@ file_to_ingest->global_seqno_offset = 0; return Status::Corruption("Was not able to find file global seqno field"); } - file_to_ingest->global_seqno_offset = offsets_iter->second; + file_to_ingest->global_seqno_offset = static_cast(offsets_iter->second); } else if (file_to_ingest->version == 1) { // SST file V1 should not have global seqno field assert(seqno_iter == uprops.end()); @@ -475,9 +476,9 @@ const SequenceNumber level_largest_seqno = (*max_element(level_files.begin(), level_files.end(), [](FileMetaData* f1, FileMetaData* f2) { - return f1->largest_seqno < f2->largest_seqno; + return f1->fd.largest_seqno < 
f2->fd.largest_seqno; })) - ->largest_seqno; + ->fd.largest_seqno; // should only assign seqno to current level's largest seqno when // the file fits if (level_largest_seqno != 0 && @@ -522,7 +523,7 @@ // at some upper level for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) { for (auto file : vstorage->LevelFiles(lvl)) { - if (file->smallest_seqno == 0) { + if (file->fd.smallest_seqno == 0) { return Status::InvalidArgument( "Can't ingest_behind file as despite allow_ingest_behind=true " "there are files with 0 seqno in database at upper levels!"); @@ -547,24 +548,27 @@ "field"); } - std::unique_ptr rwfile; - Status status = env_->NewRandomRWFile(file_to_ingest->internal_file_path, - &rwfile, env_options_); - if (!status.ok()) { - return status; + if (ingestion_options_.write_global_seqno) { + // Determine if we can write global_seqno to a given offset of file. + // If the file system does not support random write, then we should not. + // Otherwise we should. + std::unique_ptr rwfile; + Status status = env_->NewRandomRWFile(file_to_ingest->internal_file_path, + &rwfile, env_options_); + if (status.ok()) { + std::string seqno_val; + PutFixed64(&seqno_val, seqno); + status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val); + if (!status.ok()) { + return status; + } + } else if (!status.IsNotSupported()) { + return status; + } } - // Write the new seqno in the global sequence number field in the file - std::string seqno_val; - PutFixed64(&seqno_val, seqno); - status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val); - if (status.ok()) { - status = rwfile->Fsync(); - } - if (status.ok()) { - file_to_ingest->assigned_seqno = seqno; - } - return status; + file_to_ingest->assigned_seqno = seqno; + return Status::OK(); } bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( diff -Nru rocksdb-5.15.10/db/external_sst_file_ingestion_job.h rocksdb-5.17.2/db/external_sst_file_ingestion_job.h --- rocksdb-5.15.10/db/external_sst_file_ingestion_job.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/external_sst_file_ingestion_job.h 2018-11-12 19:57:32.000000000 +0000 @@ -89,7 +89,7 @@ // Prepare the job by copying external files into the DB. Status Prepare(const std::vector& external_files_paths, - SuperVersion* sv); + uint64_t next_file_number, SuperVersion* sv); // Check if we need to flush the memtable before running the ingestion job // This will be true if the files we are ingesting are overlapping with any diff -Nru rocksdb-5.15.10/db/flush_job.cc rocksdb-5.17.2/db/flush_job.cc --- rocksdb-5.15.10/db/flush_job.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/flush_job.cc 2018-11-12 19:57:32.000000000 +0000 @@ -78,6 +78,8 @@ return "Auto Compaction"; case FlushReason::kManualFlush: return "Manual Flush"; + case FlushReason::kErrorRecovery: + return "Error Recovery"; default: return "Invalid"; } @@ -371,8 +373,8 @@ s.ToString().c_str(), meta_.marked_for_compaction ? 
" (needs compaction)" : ""); - if (output_file_directory_ != nullptr) { - output_file_directory_->Fsync(); + if (s.ok() && output_file_directory_ != nullptr) { + s = output_file_directory_->Fsync(); } TEST_SYNC_POINT("FlushJob::WriteLevel0Table"); db_mutex_->Lock(); @@ -389,7 +391,7 @@ // Add file to L0 edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(), meta_.fd.GetFileSize(), meta_.smallest, meta_.largest, - meta_.smallest_seqno, meta_.largest_seqno, + meta_.fd.smallest_seqno, meta_.fd.largest_seqno, meta_.marked_for_compaction); } diff -Nru rocksdb-5.15.10/db/flush_job_test.cc rocksdb-5.17.2/db/flush_job_test.cc --- rocksdb-5.15.10/db/flush_job_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/flush_job_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -62,7 +62,7 @@ manifest, &file, env_->OptimizeForManifestWrite(env_options_)); ASSERT_OK(s); unique_ptr file_writer( - new WritableFileWriter(std::move(file), EnvOptions())); + new WritableFileWriter(std::move(file), manifest, EnvOptions())); { log::Writer log(std::move(file_writer), 0, false); std::string record; @@ -147,19 +147,20 @@ db_options_.statistics.get(), &event_logger, true); HistogramData hist; - FileMetaData fd; + FileMetaData file_meta; mutex_.Lock(); flush_job.PickMemTable(); - ASSERT_OK(flush_job.Run(nullptr, &fd)); + ASSERT_OK(flush_job.Run(nullptr, &file_meta)); mutex_.Unlock(); db_options_.statistics->histogramData(FLUSH_TIME, &hist); ASSERT_GT(hist.average, 0.0); - ASSERT_EQ(ToString(0), fd.smallest.user_key().ToString()); - ASSERT_EQ("9999a", - fd.largest.user_key().ToString()); // range tombstone end key - ASSERT_EQ(1, fd.smallest_seqno); - ASSERT_EQ(10000, fd.largest_seqno); // range tombstone seqnum 10000 + ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString()); + ASSERT_EQ( + "9999a", + file_meta.largest.user_key().ToString()); // range tombstone end key + ASSERT_EQ(1, file_meta.fd.smallest_seqno); + ASSERT_EQ(10000, file_meta.fd.largest_seqno); // range tombstone seqnum 10000 mock_table_factory_->AssertSingleFile(inserted_keys); job_context.Clean(); } diff -Nru rocksdb-5.15.10/db/forward_iterator.cc rocksdb-5.17.2/db/forward_iterator.cc --- rocksdb-5.15.10/db/forward_iterator.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/forward_iterator.cc 2018-11-12 19:57:32.000000000 +0000 @@ -916,21 +916,13 @@ uint32_t ForwardIterator::FindFileInRange( const std::vector& files, const Slice& internal_key, uint32_t left, uint32_t right) { - while (left < right) { - uint32_t mid = (left + right) / 2; - const FileMetaData* f = files[mid]; - if (cfd_->internal_comparator().InternalKeyComparator::Compare( - f->largest.Encode(), internal_key) < 0) { - // Key at "mid.largest" is < "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. 
- right = mid; - } - } - return right; + auto cmp = [&](const FileMetaData* f, const Slice& key) -> bool { + return cfd_->internal_comparator().InternalKeyComparator::Compare( + f->largest.Encode(), key) < 0; + }; + const auto &b = files.begin(); + return static_cast(std::lower_bound(b + left, + b + right, internal_key, cmp) - b); } void ForwardIterator::DeleteIterator(InternalIterator* iter, bool is_arena) { diff -Nru rocksdb-5.15.10/db/job_context.h rocksdb-5.17.2/db/job_context.h --- rocksdb-5.15.10/db/job_context.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/job_context.h 2018-11-12 19:57:32.000000000 +0000 @@ -35,6 +35,14 @@ explicit SuperVersionContext(bool create_superversion = false) : new_superversion(create_superversion ? new SuperVersion() : nullptr) {} + explicit SuperVersionContext(SuperVersionContext&& other) + : superversions_to_free(std::move(other.superversions_to_free)), +#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION + write_stall_notifications(std::move(other.write_stall_notifications)), +#endif + new_superversion(std::move(other.new_superversion)) { + } + void NewSuperVersion() { new_superversion = unique_ptr(new SuperVersion()); } @@ -98,8 +106,15 @@ } inline bool HaveSomethingToClean() const { + bool sv_have_sth = false; + for (const auto& sv_ctx : superversion_contexts) { + if (sv_ctx.HaveSomethingToDelete()) { + sv_have_sth = true; + break; + } + } return memtables_to_free.size() > 0 || logs_to_free.size() > 0 || - superversion_context.HaveSomethingToDelete(); + sv_have_sth; } // Structure to store information for candidate files to delete. @@ -142,7 +157,8 @@ // a list of memtables to be free autovector memtables_to_free; - SuperVersionContext superversion_context; + // contexts for installing superversions for multiple column families + std::vector superversion_contexts; autovector logs_to_free; @@ -158,13 +174,14 @@ size_t num_alive_log_files = 0; uint64_t size_log_to_delete = 0; - explicit JobContext(int _job_id, bool create_superversion = false) - : superversion_context(create_superversion) { + explicit JobContext(int _job_id, bool create_superversion = false) { job_id = _job_id; manifest_file_number = 0; pending_manifest_file_number = 0; log_number = 0; prev_log_number = 0; + superversion_contexts.emplace_back( + SuperVersionContext(create_superversion)); } // For non-empty JobContext Clean() has to be called at least once before @@ -173,7 +190,9 @@ // doing potentially slow Clean() with locked DB mutex. 
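// Illustrative aside (editor's sketch, not part of the diff): the
// FindFileInRange change above replaces a hand-rolled binary search with
// std::lower_bound over the [left, right) index range. The same pattern in
// isolation, with a hypothetical File type and plain string comparison
// standing in for FileMetaData and the InternalKeyComparator:
#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

struct File { std::string largest; };

uint32_t FindFileInRangeSketch(const std::vector<const File*>& files,
                               const std::string& key, uint32_t left,
                               uint32_t right) {
  auto cmp = [](const File* f, const std::string& k) {
    return f->largest < k;  // stand-in for Compare(f->largest, key) < 0
  };
  const auto b = files.begin();
  return static_cast<uint32_t>(
      std::lower_bound(b + left, b + right, key, cmp) - b);
}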
void Clean() { // free superversions - superversion_context.Clean(); + for (auto& sv_context : superversion_contexts) { + sv_context.Clean(); + } // free pending memtables for (auto m : memtables_to_free) { delete m; diff -Nru rocksdb-5.15.10/db/listener_test.cc rocksdb-5.17.2/db/listener_test.cc --- rocksdb-5.15.10/db/listener_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/listener_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -417,7 +417,9 @@ for (int i = 0; static_cast(cf_meta.file_count) < kSlowdownTrigger * 10; ++i) { Put(1, ToString(i), std::string(10000, 'x'), WriteOptions()); - db_->Flush(FlushOptions(), handles_[1]); + FlushOptions fo; + fo.allow_write_stall = true; + db_->Flush(fo, handles_[1]); db_->GetColumnFamilyMetaData(handles_[1], &cf_meta); } ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9); @@ -880,10 +882,13 @@ ASSERT_EQ(1, listener->counter()); // trigger flush so compaction is triggered again; this time it succeeds + // The previous failed compaction may get retried automatically, so we may + // be left with 0 or 1 files in level 1, depending on when the retry gets + // scheduled ASSERT_OK(Put("key0", "val")); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_LE(1, NumTableFilesAtLevel(0)); } } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/log_format.h rocksdb-5.17.2/db/log_format.h --- rocksdb-5.15.10/db/log_format.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/log_format.h 2018-11-12 19:57:32.000000000 +0000 @@ -37,9 +37,9 @@ // Header is checksum (4 bytes), length (2 bytes), type (1 byte) static const int kHeaderSize = 4 + 2 + 1; -// Recyclable header is checksum (4 bytes), type (1 byte), log number -// (4 bytes), length (2 bytes). -static const int kRecyclableHeaderSize = 4 + 1 + 4 + 2; +// Recyclable header is checksum (4 bytes), length (2 bytes), type (1 byte), +// log number (4 bytes). +static const int kRecyclableHeaderSize = 4 + 2 + 1 + 4; } // namespace log } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/log_reader.cc rocksdb-5.17.2/db/log_reader.cc --- rocksdb-5.15.10/db/log_reader.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/log_reader.cc 2018-11-12 19:57:32.000000000 +0000 @@ -24,7 +24,7 @@ Reader::Reader(std::shared_ptr info_log, unique_ptr&& _file, Reporter* reporter, - bool checksum, uint64_t initial_offset, uint64_t log_num) + bool checksum, uint64_t log_num) : info_log_(info_log), file_(std::move(_file)), reporter_(reporter), @@ -36,7 +36,6 @@ eof_offset_(0), last_record_offset_(0), end_of_buffer_offset_(0), - initial_offset_(initial_offset), log_number_(log_num), recycled_(false) {} @@ -44,29 +43,6 @@ delete[] backing_store_; } -bool Reader::SkipToInitialBlock() { - size_t initial_offset_in_block = initial_offset_ % kBlockSize; - uint64_t block_start_location = initial_offset_ - initial_offset_in_block; - - // Don't search a block if we'd be in the trailer - if (initial_offset_in_block > kBlockSize - 6) { - block_start_location += kBlockSize; - } - - end_of_buffer_offset_ = block_start_location; - - // Skip to start of first block that can contain the initial record - if (block_start_location > 0) { - Status skip_status = file_->Skip(block_start_location); - if (!skip_status.ok()) { - ReportDrop(static_cast(block_start_location), skip_status); - return false; - } - } - - return true; -} - // For kAbsoluteConsistency, on clean shutdown we don't expect any error // in the log files. 
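// Illustrative aside (editor's sketch, not part of the diff): the log_format.h
// change above reorders the documented recyclable header to match what the
// writer actually emits: checksum (4B), length (2B), type (1B), log number (4B),
// i.e. 11 bytes. Laying that out by hand, assuming a little-endian host so that
// memcpy matches the fixed-width little-endian encoding used by util/coding.h:
#include <cstdint>
#include <cstring>
#include <string>

std::string EncodeRecyclableHeaderSketch(uint32_t crc, uint16_t length,
                                         uint8_t type, uint32_t log_number) {
  std::string h(11, '\0');
  std::memcpy(&h[0], &crc, 4);         // checksum
  std::memcpy(&h[4], &length, 2);      // payload length
  h[6] = static_cast<char>(type);      // record type
  std::memcpy(&h[7], &log_number, 4);  // owning log number
  return h;  // 4 + 2 + 1 + 4 == kRecyclableHeaderSize
}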
For other modes, we can ignore only incomplete records // in the last log file, which are presumably due to a write in progress @@ -76,12 +52,6 @@ // restrict the inconsistency to only the last log bool Reader::ReadRecord(Slice* record, std::string* scratch, WALRecoveryMode wal_recovery_mode) { - if (last_record_offset_ < initial_offset_) { - if (!SkipToInitialBlock()) { - return false; - } - } - scratch->clear(); record->clear(); bool in_fragmented_record = false; @@ -299,8 +269,7 @@ } void Reader::ReportDrop(size_t bytes, const Status& reason) { - if (reporter_ != nullptr && - end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) { + if (reporter_ != nullptr) { reporter_->Corruption(bytes, reason); } } @@ -317,7 +286,7 @@ read_error_ = true; *error = kEof; return false; - } else if (buffer_.size() < (size_t)kBlockSize) { + } else if (buffer_.size() < static_cast(kBlockSize)) { eof_ = true; eof_offset_ = buffer_.size(); } @@ -342,7 +311,7 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) { while (true) { // We need at least the minimum header size - if (buffer_.size() < (size_t)kHeaderSize) { + if (buffer_.size() < static_cast(kHeaderSize)) { int r; if (!ReadMore(drop_size, &r)) { return r; @@ -363,7 +332,7 @@ } header_size = kRecyclableHeaderSize; // We need enough for the larger header - if (buffer_.size() < (size_t)kRecyclableHeaderSize) { + if (buffer_.size() < static_cast(kRecyclableHeaderSize)) { int r; if (!ReadMore(drop_size, &r)) { return r; @@ -417,13 +386,6 @@ buffer_.remove_prefix(header_size + length); - // Skip physical record that started before initial_offset_ - if (end_of_buffer_offset_ - buffer_.size() - header_size - length < - initial_offset_) { - result->clear(); - return kBadRecord; - } - *result = Slice(header + header_size, length); return type; } diff -Nru rocksdb-5.15.10/db/log_reader.h rocksdb-5.17.2/db/log_reader.h --- rocksdb-5.15.10/db/log_reader.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/log_reader.h 2018-11-12 19:57:32.000000000 +0000 @@ -50,14 +50,10 @@ // live while this Reader is in use. // // If "checksum" is true, verify checksums if available. - // - // The Reader will start reading at the first record located at physical - // position >= initial_offset within the file. Reader(std::shared_ptr info_log, - // @lint-ignore TXT2 T25377293 Grandfathered in - unique_ptr&& file, - Reporter* reporter, bool checksum, uint64_t initial_offset, - uint64_t log_num); + // @lint-ignore TXT2 T25377293 Grandfathered in + unique_ptr&& file, Reporter* reporter, + bool checksum, uint64_t log_num); ~Reader(); @@ -108,9 +104,6 @@ // Offset of the first location past the end of buffer_. uint64_t end_of_buffer_offset_; - // Offset at which to start looking for the first record to return - uint64_t const initial_offset_; - // which log number this is uint64_t const log_number_; @@ -124,7 +117,6 @@ // Currently there are three situations in which this happens: // * The record has an invalid CRC (ReadPhysicalRecord reports a drop) // * The record is a 0-length record (No drop is reported) - // * The record is below constructor's initial_offset (No drop is reported) kBadRecord = kMaxRecordType + 2, // Returned when we fail to read a valid header. kBadHeader = kMaxRecordType + 3, @@ -136,11 +128,6 @@ kBadRecordChecksum = kMaxRecordType + 6, }; - // Skips all blocks that are completely before "initial_offset_". - // - // Returns true on success. Handles reporting. 
- bool SkipToInitialBlock(); - // Return type, or one of the preceding special values unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size); diff -Nru rocksdb-5.15.10/db/log_test.cc rocksdb-5.17.2/db/log_test.cc --- rocksdb-5.15.10/db/log_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/log_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -159,12 +159,12 @@ LogTest() : reader_contents_(), dest_holder_(test::GetWritableFileWriter( - new test::StringSink(&reader_contents_))), + new test::StringSink(&reader_contents_), "" /* don't care */)), source_holder_(test::GetSequentialFileReader( new StringSource(reader_contents_), "" /* file name */)), writer_(std::move(dest_holder_), 123, GetParam()), - reader_(nullptr, std::move(source_holder_), &report_, true /*checksum*/, - 0 /*initial_offset*/, 123) { + reader_(nullptr, std::move(source_holder_), &report_, + true /* checksum */, 123 /* log_number */) { int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; initial_offset_last_record_offsets_[0] = 0; initial_offset_last_record_offsets_[1] = header_size + 10000; @@ -266,36 +266,6 @@ } } - void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { - WriteInitialOffsetLog(); - unique_ptr file_reader(test::GetSequentialFileReader( - new StringSource(reader_contents_), "" /* fname */)); - unique_ptr offset_reader( - new Reader(nullptr, std::move(file_reader), &report_, - true /*checksum*/, WrittenBytes() + offset_past_end, 123)); - Slice record; - std::string scratch; - ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch)); - } - - void CheckInitialOffsetRecord(uint64_t initial_offset, - int expected_record_offset) { - WriteInitialOffsetLog(); - unique_ptr file_reader(test::GetSequentialFileReader( - new StringSource(reader_contents_), "" /* fname */)); - unique_ptr offset_reader( - new Reader(nullptr, std::move(file_reader), &report_, - true /*checksum*/, initial_offset, 123)); - Slice record; - std::string scratch; - ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); - ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset], - record.size()); - ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], - offset_reader->LastRecordOffset()); - ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); - } - }; size_t LogTest::initial_offset_record_sizes_[] = @@ -590,55 +560,6 @@ } } -TEST_P(LogTest, ReadStart) { CheckInitialOffsetRecord(0, 0); } - -TEST_P(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); } - -TEST_P(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); } - -TEST_P(LogTest, ReadSecondStart) { - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; - CheckInitialOffsetRecord(10000 + header_size, 1); -} - -TEST_P(LogTest, ReadThirdOneOff) { - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; - CheckInitialOffsetRecord(10000 + header_size + 1, 2); -} - -TEST_P(LogTest, ReadThirdStart) { - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; - CheckInitialOffsetRecord(20000 + 2 * header_size, 2); -} - -TEST_P(LogTest, ReadFourthOneOff) { - int header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; - CheckInitialOffsetRecord(20000 + 2 * header_size + 1, 3); -} - -TEST_P(LogTest, ReadFourthFirstBlockTrailer) { - CheckInitialOffsetRecord(log::kBlockSize - 4, 3); -} - -TEST_P(LogTest, ReadFourthMiddleBlock) { - CheckInitialOffsetRecord(log::kBlockSize + 1, 3); -} - -TEST_P(LogTest, ReadFourthLastBlock) { - CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3); -} - -TEST_P(LogTest, ReadFourthStart) { - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; - CheckInitialOffsetRecord( - 2 * (header_size + 1000) + (2 * log::kBlockSize - 1000) + 3 * header_size, - 3); -} - -TEST_P(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); } - -TEST_P(LogTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); } - TEST_P(LogTest, ClearEofSingleBlock) { Write("foo"); Write("bar"); @@ -718,7 +639,8 @@ Write("xxxxxxxxxxxxxxxx"); } unique_ptr dest_holder(test::GetWritableFileWriter( - new test::OverwritingStringSink(get_reader_contents()))); + new test::OverwritingStringSink(get_reader_contents()), + "" /* don't care */)); Writer recycle_writer(std::move(dest_holder), 123, true); recycle_writer.AddRecord(Slice("foooo")); recycle_writer.AddRecord(Slice("bar")); diff -Nru rocksdb-5.15.10/db/malloc_stats.cc rocksdb-5.17.2/db/malloc_stats.cc --- rocksdb-5.15.10/db/malloc_stats.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/malloc_stats.cc 2018-11-12 19:57:32.000000000 +0000 @@ -18,9 +18,11 @@ #ifdef ROCKSDB_JEMALLOC #ifdef __FreeBSD__ #include -#define je_malloc_stats_print malloc_stats_print #else #include "jemalloc/jemalloc.h" +#ifdef JEMALLOC_NO_RENAME +#define malloc_stats_print je_malloc_stats_print +#endif #endif typedef struct { @@ -48,7 +50,7 @@ std::unique_ptr buf{new char[kMallocStatusLen + 1]}; mstat.cur = buf.get(); mstat.end = buf.get() + kMallocStatusLen; - je_malloc_stats_print(GetJemallocStatus, &mstat, ""); + malloc_stats_print(GetJemallocStatus, &mstat, ""); stats->append(buf.get()); } #else diff -Nru rocksdb-5.15.10/db/memtable.h rocksdb-5.17.2/db/memtable.h --- rocksdb-5.15.10/db/memtable.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/memtable.h 2018-11-12 19:57:32.000000000 +0000 @@ -34,7 +34,6 @@ class Mutex; class MemTableIterator; class MergeContext; -class InternalIterator; struct ImmutableMemTableOptions { explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions, @@ -337,6 +336,14 @@ mem_tracker_.DoneAllocating(); } + // Notify the underlying storage that all data it contained has been + // persisted. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. + void MarkFlushed() { + table_->MarkFlushed(); + } + // return true if the current MemTableRep supports merge operator. 
bool IsMergeOperatorSupported() const { return table_->IsMergeOperatorSupported(); diff -Nru rocksdb-5.15.10/db/memtable_list.cc rocksdb-5.17.2/db/memtable_list.cc --- rocksdb-5.15.10/db/memtable_list.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/memtable_list.cc 2018-11-12 19:57:32.000000000 +0000 @@ -248,6 +248,7 @@ assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable memlist_.remove(m); + m->MarkFlushed(); if (max_write_buffer_number_to_maintain_ > 0) { memlist_history_.push_front(m); TrimHistory(to_delete); diff -Nru rocksdb-5.15.10/db/merge_helper.h rocksdb-5.17.2/db/merge_helper.h --- rocksdb-5.15.10/db/merge_helper.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/merge_helper.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef MERGE_HELPER_H -#define MERGE_HELPER_H +#pragma once #include #include @@ -26,7 +25,6 @@ class Logger; class MergeOperator; class Statistics; -class InternalIterator; class MergeHelper { public: @@ -194,5 +192,3 @@ }; } // namespace rocksdb - -#endif diff -Nru rocksdb-5.15.10/db/obsolete_files_test.cc rocksdb-5.17.2/db/obsolete_files_test.cc --- rocksdb-5.15.10/db/obsolete_files_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/obsolete_files_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -227,16 +227,24 @@ } ASSERT_OK(dbi->EnableFileDeletions(true /* force */)); ASSERT_EQ(optsfiles_nums.size(), optsfiles_keep.size()); - int size = static_cast(optsfiles_nums.size()); - int kept_opts_files_count = 0; - for (int i = 0; i != size; ++i) { - if (optsfiles_keep[i]) { - ++kept_opts_files_count; - } - } - ASSERT_EQ(2, kept_opts_files_count); CloseDB(); + + std::vector files; + int opts_file_count = 0; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& file : files) { + uint64_t file_num; + Slice dummy_info_log_name_prefix; + FileType type; + WalFileType log_type; + if (ParseFileName(file, &file_num, dummy_info_log_name_prefix, &type, + &log_type) && + type == kOptionsFile) { + opts_file_count++; + } + } + ASSERT_EQ(2, opts_file_count); } } //namespace rocksdb diff -Nru rocksdb-5.15.10/db/perf_context_test.cc rocksdb-5.17.2/db/perf_context_test.cc --- rocksdb-5.15.10/db/perf_context_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/perf_context_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -469,7 +469,7 @@ ASSERT_GT(hist_num_memtable_checked.Average(), 0); // In read-only mode Get(), no super version operation is needed ASSERT_EQ(hist_get_post_process.Average(), 0); - ASSERT_EQ(hist_get_snapshot.Average(), 0); + ASSERT_GT(hist_get_snapshot.Average(), 0); ASSERT_GT(hist_mget.Average(), 0); ASSERT_GT(hist_mget_snapshot.Average(), 0); diff -Nru rocksdb-5.15.10/db/range_del_aggregator_bench.cc rocksdb-5.17.2/db/range_del_aggregator_bench.cc --- rocksdb-5.15.10/db/range_del_aggregator_bench.cc 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/db/range_del_aggregator_bench.cc 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,244 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include +#include +#include +#include +#include +#include +#include + +#include "db/range_del_aggregator.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/random.h" +#include "util/stop_watch.h" +#include "util/testutil.h" + +#include "util/gflags_compat.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_int32(num_range_tombstones, 1000, "number of range tombstones created"); + +DEFINE_int32(num_runs, 10000, "number of test runs"); + +DEFINE_int32(tombstone_start_upper_bound, 1000, + "exclusive upper bound on range tombstone start keys"); + +DEFINE_int32(should_delete_upper_bound, 1000, + "exclusive upper bound on keys passed to ShouldDelete"); + +DEFINE_double(tombstone_width_mean, 100.0, "average range tombstone width"); + +DEFINE_double(tombstone_width_stddev, 0.0, + "standard deviation of range tombstone width"); + +DEFINE_bool(use_collapsed, true, "use the collapsed range tombstone map"); + +DEFINE_int32(seed, 0, "random number generator seed"); + +DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run"); + +DEFINE_int32(add_tombstones_per_run, 1, + "number of AddTombstones calls per run"); + +namespace { + +struct Stats { + uint64_t time_add_tombstones = 0; + uint64_t time_first_should_delete = 0; + uint64_t time_rest_should_delete = 0; +}; + +std::ostream& operator<<(std::ostream& os, const Stats& s) { + std::ios fmt_holder(nullptr); + fmt_holder.copyfmt(os); + + os << std::left; + os << std::setw(25) << "AddTombstones: " + << s.time_add_tombstones / + (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3) + << " us\n"; + os << std::setw(25) << "ShouldDelete (first): " + << s.time_first_should_delete / (FLAGS_num_runs * 1.0e3) << " us\n"; + if (FLAGS_should_deletes_per_run > 1) { + os << std::setw(25) << "ShouldDelete (rest): " + << s.time_rest_should_delete / + ((FLAGS_should_deletes_per_run - 1) * FLAGS_num_runs * 1.0e3) + << " us\n"; + } + + os.copyfmt(fmt_holder); + return os; +} + +} // anonymous namespace + +namespace rocksdb { + +namespace { + +// A wrapper around RangeTombstones and the underlying data of its start and end +// keys. 
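// Illustrative aside (editor's sketch, not part of the diff): the Stats printer
// above saves and restores the stream's formatting state so that std::left and
// std::setw do not leak to the caller. The same idiom in isolation, with a
// hypothetical function name:
#include <iomanip>
#include <iostream>

void PrintAligned(std::ostream& os, double micros) {
  std::ios saved(nullptr);
  saved.copyfmt(os);  // snapshot flags, width, fill, precision
  os << std::left << std::setw(25) << "ShouldDelete:" << micros << " us\n";
  os.copyfmt(saved);  // restore the caller's formatting state
}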
+struct PersistentRangeTombstone { + std::string start_key; + std::string end_key; + RangeTombstone tombstone; + + PersistentRangeTombstone(std::string start, std::string end, + SequenceNumber seq) + : start_key(std::move(start)), end_key(std::move(end)) { + tombstone = RangeTombstone(start_key, end_key, seq); + } + + PersistentRangeTombstone() = default; + + PersistentRangeTombstone(const PersistentRangeTombstone& t) { *this = t; } + + PersistentRangeTombstone& operator=(const PersistentRangeTombstone& t) { + start_key = t.start_key; + end_key = t.end_key; + tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_); + + return *this; + } + + PersistentRangeTombstone(PersistentRangeTombstone&& t) noexcept { *this = t; } + + PersistentRangeTombstone& operator=(PersistentRangeTombstone&& t) { + start_key = std::move(t.start_key); + end_key = std::move(t.end_key); + tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_); + + return *this; + } +}; + +struct TombstoneStartKeyComparator { + explicit TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {} + + bool operator()(const RangeTombstone& a, const RangeTombstone& b) const { + return cmp->Compare(a.start_key_, b.start_key_) < 0; + } + + const Comparator* cmp; +}; + +std::unique_ptr MakeRangeDelIterator( + const std::vector& range_dels) { + std::vector keys, values; + for (const auto& range_del : range_dels) { + auto key_and_value = range_del.tombstone.Serialize(); + keys.push_back(key_and_value.first.Encode().ToString()); + values.push_back(key_and_value.second.ToString()); + } + return std::unique_ptr( + new test::VectorIterator(keys, values)); +} + +// convert long to a big-endian slice key +static std::string Key(int64_t val) { + std::string little_endian_key; + std::string big_endian_key; + PutFixed64(&little_endian_key, val); + assert(little_endian_key.size() == sizeof(val)); + big_endian_key.resize(sizeof(val)); + for (size_t i = 0; i < sizeof(val); ++i) { + big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i]; + } + return big_endian_key; +} + +} // anonymous namespace + +} // namespace rocksdb + +int main(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + + Stats stats; + rocksdb::Random64 rnd(FLAGS_seed); + std::default_random_engine random_gen(FLAGS_seed); + std::normal_distribution normal_dist(FLAGS_tombstone_width_mean, + FLAGS_tombstone_width_stddev); + std::vector > + all_persistent_range_tombstones(FLAGS_add_tombstones_per_run); + for (int i = 0; i < FLAGS_add_tombstones_per_run; i++) { + all_persistent_range_tombstones[i] = + std::vector( + FLAGS_num_range_tombstones); + } + auto mode = FLAGS_use_collapsed + ? rocksdb::RangeDelPositioningMode::kForwardTraversal + : rocksdb::RangeDelPositioningMode::kFullScan; + + for (int i = 0; i < FLAGS_num_runs; i++) { + auto icmp = rocksdb::InternalKeyComparator(rocksdb::BytewiseComparator()); + rocksdb::RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, + FLAGS_use_collapsed); + + for (auto& persistent_range_tombstones : all_persistent_range_tombstones) { + // TODO(abhimadan): consider whether creating the range tombstones right + // before AddTombstones is artificially warming the cache compared to + // real workloads. 
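// Illustrative aside (editor's sketch, not part of the diff): the benchmark's
// Key() helper above reverses a little-endian fixed-64 encoding so that the
// BytewiseComparator orders keys numerically. The same idea written directly,
// without relying on util/coding.h (function name is hypothetical):
#include <cstdint>
#include <string>

std::string BigEndianKey(uint64_t v) {
  std::string key(8, '\0');
  for (int i = 7; i >= 0; --i) {  // most significant byte first
    key[7 - i] = static_cast<char>((v >> (8 * i)) & 0xff);
  }
  return key;
}
// BigEndianKey(1) < BigEndianKey(2) < ... under memcmp/bytewise comparison.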
+ for (int j = 0; j < FLAGS_num_range_tombstones; j++) { + uint64_t start = rnd.Uniform(FLAGS_tombstone_start_upper_bound); + uint64_t end = start + std::max(1.0, normal_dist(random_gen)); + persistent_range_tombstones[j] = rocksdb::PersistentRangeTombstone( + rocksdb::Key(start), rocksdb::Key(end), j); + } + + auto range_del_iter = + rocksdb::MakeRangeDelIterator(persistent_range_tombstones); + rocksdb::StopWatchNano stop_watch_add_tombstones(rocksdb::Env::Default(), + true /* auto_start */); + range_del_agg.AddTombstones(std::move(range_del_iter)); + stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos(); + } + + rocksdb::ParsedInternalKey parsed_key; + parsed_key.sequence = FLAGS_num_range_tombstones / 2; + parsed_key.type = rocksdb::kTypeValue; + + uint64_t first_key = rnd.Uniform(FLAGS_should_delete_upper_bound - + FLAGS_should_deletes_per_run + 1); + + for (int j = 0; j < FLAGS_should_deletes_per_run; j++) { + std::string key_string = rocksdb::Key(first_key + j); + parsed_key.user_key = key_string; + + rocksdb::StopWatchNano stop_watch_should_delete(rocksdb::Env::Default(), + true /* auto_start */); + range_del_agg.ShouldDelete(parsed_key, mode); + uint64_t call_time = stop_watch_should_delete.ElapsedNanos(); + + if (j == 0) { + stats.time_first_should_delete += call_time; + } else { + stats.time_rest_should_delete += call_time; + } + } + } + + std::cout << "=========================\n" + << "Results:\n" + << "=========================\n" + << stats; + + return 0; +} + +#endif // GFLAGS diff -Nru rocksdb-5.15.10/db/range_del_aggregator.cc rocksdb-5.17.2/db/range_del_aggregator.cc --- rocksdb-5.15.10/db/range_del_aggregator.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/range_del_aggregator.cc 2018-11-12 19:57:32.000000000 +0000 @@ -76,7 +76,9 @@ return false; } - void AddTombstone(RangeTombstone tombstone) override { rep_.emplace(tombstone); } + void AddTombstone(RangeTombstone tombstone) override { + rep_.emplace(tombstone); + } size_t Size() const override { return rep_.size(); } @@ -171,7 +173,9 @@ const Comparator* ucmp_; public: - CollapsedRangeDelMap(const Comparator* ucmp) : ucmp_(ucmp) { + explicit CollapsedRangeDelMap(const Comparator* ucmp) + : rep_(stl_wrappers::LessOfComparator(ucmp)), + ucmp_(ucmp) { InvalidatePosition(); } @@ -265,22 +269,36 @@ // 2: c--- OR 2: c--- OR 2: c--- OR 2: c------ // 1: A--C 1: 1: A------ 1: C------ // ^ ^ ^ ^ - // Insert a new transition at the new tombstone's start point, or raise - // the existing transition at that point to the new tombstone's seqno. end_seq = prev_seq(); - rep_[t.start_key_] = t.seq_; // operator[] will overwrite existing entry + Rep::iterator pit; + if (it != rep_.begin() && (pit = std::prev(it)) != rep_.begin() && + ucmp_->Compare(pit->first, t.start_key_) == 0 && std::prev(pit)->second == t.seq_) { + // The new tombstone starts at the end of an existing tombstone with an + // identical seqno: + // + // 3: + // 2: A--C--- + // 1: + // ^ + // Merge the tombstones by removing the existing tombstone's end key. + it = rep_.erase(std::prev(it)); + } else { + // Insert a new transition at the new tombstone's start point, or raise + // the existing transition at that point to the new tombstone's seqno. 
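The ASCII diagrams in the surrounding comments describe the collapsed map as a sorted set of transition points: each entry maps a user key to the seqno of the tombstone in effect from that key up to the next entry, with seqno 0 meaning "not covered". A toy model of that representation and of the lookup ShouldDelete performs is sketched here; it is illustrative only, the real CollapsedRangeDelMap handles many more cases and uses the column family's comparator:

#include <cstdint>
#include <iostream>
#include <iterator>
#include <map>
#include <string>

// Transition map: key -> seqno in effect starting at that key (0 = none).
using TransitionMap = std::map<std::string, uint64_t>;

// Seqno of the tombstone covering `key`, or 0 if no tombstone covers it.
uint64_t CoveringSeqno(const TransitionMap& m, const std::string& key) {
  auto it = m.upper_bound(key);      // first transition strictly after key
  if (it == m.begin()) return 0;     // key sorts before every tombstone
  return std::prev(it)->second;      // transition that applies at key
}

int main() {
  // Tombstones [b, d) @ seqno 10 and [c, f) @ seqno 5 collapse to:
  TransitionMap m{{"b", 10}, {"d", 5}, {"f", 0}};
  std::cout << CoveringSeqno(m, "a") << "\n";  // 0  (before all tombstones)
  std::cout << CoveringSeqno(m, "c") << "\n";  // 10
  std::cout << CoveringSeqno(m, "e") << "\n";  // 5
  std::cout << CoveringSeqno(m, "f") << "\n";  // 0  (end keys are exclusive)
}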
+ rep_[t.start_key_] = t.seq_; // operator[] will overwrite existing entry + } } else { // The new tombstone's start point is covered by an existing tombstone: // - // 3: A----- OR 3: C------ - // 2: c--- 2: c------ - // ^ ^ + // 3: A----- OR 3: C------ OR + // 2: c--- 2: c------ 2: C------ + // ^ ^ ^ // Do nothing. } // Look at all the existing transitions that overlap the new tombstone. while (it != rep_.end() && ucmp_->Compare(it->first, t.end_key_) < 0) { - if (t.seq_ > it->second) { + if (t.seq_ >= it->second) { // The transition is to an existing tombstone that the new tombstone // covers. Save the covered tombstone's seqno. We'll need to return to // it if the new tombstone ends before the existing tombstone. @@ -324,15 +342,29 @@ } if (t.seq_ == prev_seq()) { - // The new tombstone is unterminated in the map: - // - // 3: OR 3: --G OR 3: --G K-- - // 2: C-------k 2: G---k 2: G---k - // ^ ^ ^ - // End it now, returning to the last seqno we covered. Because end keys - // are exclusive, if there's an existing transition at t.end_key_, it - // takes precedence over the transition that we install here. - rep_.emplace(t.end_key_, end_seq); // emplace is a noop if existing entry + // The new tombstone is unterminated in the map. + if (it != rep_.end() && t.seq_ == it->second && ucmp_->Compare(it->first, t.end_key_) == 0) { + // The new tombstone ends at the start of another tombstone with an + // identical seqno. Merge the tombstones by removing the existing + // tombstone's start key. + rep_.erase(it); + } else if (end_seq == prev_seq() || (it != rep_.end() && end_seq == it->second)) { + // The new tombstone is implicitly ended because its end point is + // contained within an existing tombstone with the same seqno: + // + // 2: ---k--N + // ^ + } else { + // The new tombstone needs an explicit end point. + // + // 3: OR 3: --G OR 3: --G K-- + // 2: C-------k 2: G---k 2: G---k + // ^ ^ ^ + // Install one that returns to the last seqno we covered. Because end + // keys are exclusive, if there's an existing transition at t.end_key_, + // it takes precedence over the transition that we install here. + rep_.emplace(t.end_key_, end_seq); // emplace is a noop if existing entry + } } else { // The new tombstone is implicitly ended because its end point is covered // by an existing tombstone with a higher seqno. @@ -478,22 +510,22 @@ } } if (largest != nullptr) { - // This is subtly correct despite the discrepancy between - // FileMetaData::largest being inclusive while RangeTombstone::end_key_ - // is exclusive. A tombstone will only extend past the bounds of an - // sstable if its end-key is the largest key in the table. If that - // occurs, the largest key for the table is set based on the smallest - // key in the next table in the level. In that case, largest->user_key() - // is not actually a key in the current table and thus we can use it as - // the exclusive end-key for the tombstone. - if (icmp_.user_comparator()->Compare( - tombstone.end_key_, largest->user_key()) > 0) { - // The largest key should be a tombstone sentinel key. - assert(GetInternalKeySeqno(largest->Encode()) == kMaxSequenceNumber); + // To safely truncate the range tombstone's end key, it must extend past + // the largest key in the sstable (which may have been extended to the + // smallest key in the next sstable), and largest must be a tombstone + // sentinel key. 
A range tombstone may straddle two sstables and not be + // the tombstone sentinel key in the first sstable if a user-key also + // straddles the sstables (possible if there is a snapshot between the + // two versions of the user-key), in which case we cannot truncate the + // range tombstone. + if (icmp_.user_comparator()->Compare(tombstone.end_key_, + largest->user_key()) > 0 && + GetInternalKeySeqno(largest->Encode()) == kMaxSequenceNumber) { tombstone.end_key_ = largest->user_key(); } } - GetRangeDelMap(tombstone.seq_).AddTombstone(std::move(tombstone)); + auto seq = tombstone.seq_; + GetRangeDelMap(seq).AddTombstone(std::move(tombstone)); input->Next(); } if (!first_iter) { diff -Nru rocksdb-5.15.10/db/range_del_aggregator_test.cc rocksdb-5.17.2/db/range_del_aggregator_test.cc --- rocksdb-5.15.10/db/range_del_aggregator_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/range_del_aggregator_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -27,7 +27,7 @@ kReverse, }; -static auto icmp = InternalKeyComparator(BytewiseComparator()); +static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator()); void AddTombstones(RangeDelAggregator* range_del_agg, const std::vector& range_dels, @@ -66,8 +66,8 @@ const std::vector& range_dels_in, const std::vector& expected_points, const std::vector& expected_collapsed_range_dels, - const InternalKey* smallest = nullptr, - const InternalKey* largest = nullptr) { + const InternalKey* smallest = nullptr, const InternalKey* largest = nullptr, + const InternalKeyComparator& icmp = bytewise_icmp) { // Test same result regardless of which order the range deletions are added // and regardless of collapsed mode. for (bool collapsed : {false, true}) { @@ -164,6 +164,14 @@ {{"a", "b", 5}, {"b", "c", 10}, {"c", "d", 5}}); } +TEST_F(RangeDelAggregatorTest, OverlapAboveMiddleReverse) { + VerifyRangeDels({{"d", "a", 5}, {"c", "b", 10}}, + {{"z", 0}, {"d", 5}, {"c", 10}, {"b", 5}, {"a", 0}}, + {{"d", "c", 5}, {"c", "b", 10}, {"b", "a", 5}}, + nullptr /* smallest */, nullptr /* largest */, + InternalKeyComparator(ReverseBytewiseComparator())); +} + TEST_F(RangeDelAggregatorTest, OverlapFully) { VerifyRangeDels({{"a", "d", 10}, {"b", "c", 5}}, {{" ", 0}, {"a", 10}, {"d", 0}}, {{"a", "d", 10}}); @@ -200,6 +208,30 @@ {{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}); } +TEST_F(RangeDelAggregatorTest, IdenticalSameSeqNo) { + VerifyRangeDels({{"a", "b", 5}, {"a", "b", 5}}, + {{" ", 0}, {"a", 5}, {"b", 0}}, + {{"a", "b", 5}}); +} + +TEST_F(RangeDelAggregatorTest, ContiguousSameSeqNo) { + VerifyRangeDels({{"a", "b", 5}, {"b", "c", 5}}, + {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 0}}, + {{"a", "c", 5}}); +} + +TEST_F(RangeDelAggregatorTest, OverlappingSameSeqNo) { + VerifyRangeDels({{"a", "c", 5}, {"b", "d", 5}}, + {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}}, + {{"a", "d", 5}}); +} + +TEST_F(RangeDelAggregatorTest, CoverSameSeqNo) { + VerifyRangeDels({{"a", "d", 5}, {"b", "c", 5}}, + {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}}, + {{"a", "d", 5}}); +} + // Note the Cover* tests also test cases where tombstones are inserted under a // larger one when VerifyRangeDels() runs them in reverse TEST_F(RangeDelAggregatorTest, CoverMultipleFromLeft) { @@ -235,14 +267,14 @@ TEST_F(RangeDelAggregatorTest, MergingIteratorAllEmptyStripes) { for (bool collapsed : {true, false}) { - RangeDelAggregator range_del_agg(icmp, {1, 2}, collapsed); + RangeDelAggregator range_del_agg(bytewise_icmp, {1, 2}, collapsed); VerifyRangeDelIter(range_del_agg.NewIterator().get(), 
{}); } } TEST_F(RangeDelAggregatorTest, MergingIteratorOverlappingStripes) { for (bool collapsed : {true, false}) { - RangeDelAggregator range_del_agg(icmp, {5, 15, 25, 35}, collapsed); + RangeDelAggregator range_del_agg(bytewise_icmp, {5, 15, 25, 35}, collapsed); AddTombstones( &range_del_agg, {{"d", "e", 10}, {"aa", "b", 20}, {"c", "d", 30}, {"a", "b", 10}}); @@ -253,7 +285,8 @@ } TEST_F(RangeDelAggregatorTest, MergingIteratorSeek) { - RangeDelAggregator range_del_agg(icmp, {5, 15}, true /* collapsed */); + RangeDelAggregator range_del_agg(bytewise_icmp, {5, 15}, + true /* collapsed */); AddTombstones(&range_del_agg, {{"a", "c", 10}, {"b", "c", 11}, {"f", "g", 10}, @@ -300,6 +333,21 @@ &smallest, &largest); } +TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateTombstones) { + const InternalKey smallest("b", 1, kTypeRangeDeletion); + const InternalKey largest( + "e", 3, // could happen if "e" is in consecutive sstables + kTypeValue); + VerifyRangeDels( + {{"a", "c", 10}, {"d", "f", 10}}, + {{"a", 10, true}, // truncated + {"b", 10, false}, // not truncated + {"d", 10, false}, // not truncated + {"e", 10, false}}, // not truncated + {{"b", "c", 10}, {"d", "f", 10}}, + &smallest, &largest); +} + } // namespace rocksdb int main(int argc, char** argv) { diff -Nru rocksdb-5.15.10/db/repair.cc rocksdb-5.17.2/db/repair.cc --- rocksdb-5.15.10/db/repair.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/repair.cc 2018-11-12 19:57:32.000000000 +0000 @@ -353,7 +353,7 @@ // propagating bad information (like overly large sequence // numbers). log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter, - true /*enable checksum*/, 0 /*initial_offset*/, log); + true /*enable checksum*/, log); // Initialize per-column family memtables for (auto* cfd : *vset_.GetColumnFamilySet()) { diff -Nru rocksdb-5.15.10/db/repair_test.cc rocksdb-5.17.2/db/repair_test.cc --- rocksdb-5.15.10/db/repair_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/repair_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -74,7 +74,7 @@ Close(); ASSERT_OK(env_->FileExists(manifest_path)); - CreateFile(env_, manifest_path, "blah"); + CreateFile(env_, manifest_path, "blah", false /* use_fsync */); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); Reopen(CurrentOptions()); @@ -153,7 +153,7 @@ Flush(); auto sst_path = GetFirstSstPath(); ASSERT_FALSE(sst_path.empty()); - CreateFile(env_, sst_path, "blah"); + CreateFile(env_, sst_path, "blah", false /* use_fsync */); Close(); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); diff -Nru rocksdb-5.15.10/db/snapshot_checker.h rocksdb-5.17.2/db/snapshot_checker.h --- rocksdb-5.15.10/db/snapshot_checker.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/snapshot_checker.h 2018-11-12 19:57:32.000000000 +0000 @@ -19,8 +19,9 @@ class DisableGCSnapshotChecker : public SnapshotChecker { public: virtual ~DisableGCSnapshotChecker() {} - virtual bool IsInSnapshot(SequenceNumber /*sequence*/, - SequenceNumber /*snapshot_sequence*/) const override { + virtual bool IsInSnapshot( + SequenceNumber /*sequence*/, + SequenceNumber /*snapshot_sequence*/) const override { // By returning false, we prevent all the values from being GCed return false; } diff -Nru rocksdb-5.15.10/db/table_cache.cc rocksdb-5.17.2/db/table_cache.cc --- rocksdb-5.15.10/db/table_cache.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/table_cache.cc 2018-11-12 19:57:32.000000000 +0000 @@ -120,7 +120,7 @@ s = ioptions_.table_factory->NewTableReader( 
TableReaderOptions(ioptions_, prefix_extractor, env_options, internal_comparator, skip_filters, immortal_tables_, - level), + level, fd.largest_seqno), std::move(file_reader), fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); @@ -238,7 +238,7 @@ if (s.ok()) { if (options.table_filter && !options.table_filter(*table_reader->GetTableProperties())) { - result = NewEmptyInternalIterator(arena); + result = NewEmptyInternalIterator(arena); } else { result = table_reader->NewIterator(options, prefix_extractor, arena, skip_filters, for_compaction); @@ -279,7 +279,7 @@ } if (!s.ok()) { assert(result == nullptr); - result = NewErrorInternalIterator(s, arena); + result = NewErrorInternalIterator(s, arena); } return result; } diff -Nru rocksdb-5.15.10/db/table_cache.h rocksdb-5.17.2/db/table_cache.h --- rocksdb-5.15.10/db/table_cache.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/table_cache.h 2018-11-12 19:57:32.000000000 +0000 @@ -31,7 +31,6 @@ struct FileDescriptor; class GetContext; class HistogramImpl; -class InternalIterator; class TableCache { public: diff -Nru rocksdb-5.15.10/db/table_properties_collector_test.cc rocksdb-5.17.2/db/table_properties_collector_test.cc --- rocksdb-5.15.10/db/table_properties_collector_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/table_properties_collector_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -46,7 +46,8 @@ std::unique_ptr* writable, std::unique_ptr* builder) { unique_ptr wf(new test::StringSink); - writable->reset(new WritableFileWriter(std::move(wf), EnvOptions())); + writable->reset( + new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); int unknown_level = -1; builder->reset(NewTableBuilder( ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories, diff -Nru rocksdb-5.15.10/db/transaction_log_impl.cc rocksdb-5.17.2/db/transaction_log_impl.cc --- rocksdb-5.15.10/db/transaction_log_impl.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/transaction_log_impl.cc 2018-11-12 19:57:32.000000000 +0000 @@ -104,7 +104,7 @@ if (files_->size() <= startFileIndex) { return; } - Status s = OpenLogReader(files_->at(startFileIndex).get()); + Status s = OpenLogReader(files_->at(static_cast(startFileIndex)).get()); if (!s.ok()) { currentStatus_ = s; reporter_.Info(currentStatus_.ToString().c_str()); @@ -312,9 +312,9 @@ return s; } assert(file); - currentLogReader_.reset(new log::Reader( - options_->info_log, std::move(file), &reporter_, - read_options_.verify_checksums_, 0, logFile->LogNumber())); + currentLogReader_.reset( + new log::Reader(options_->info_log, std::move(file), &reporter_, + read_options_.verify_checksums_, logFile->LogNumber())); return Status::OK(); } } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/version_builder.cc rocksdb-5.17.2/db/version_builder.cc --- rocksdb-5.15.10/db/version_builder.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_builder.cc 2018-11-12 19:57:32.000000000 +0000 @@ -35,11 +35,11 @@ namespace rocksdb { bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { - if (a->largest_seqno != b->largest_seqno) { - return a->largest_seqno > b->largest_seqno; + if (a->fd.largest_seqno != b->fd.largest_seqno) { + return a->fd.largest_seqno > b->fd.largest_seqno; } - if (a->smallest_seqno != b->smallest_seqno) { - return a->smallest_seqno > b->smallest_seqno; + if (a->fd.smallest_seqno != b->fd.smallest_seqno) { + return a->fd.smallest_seqno > b->fd.smallest_seqno; } 
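A recurring change in this patch is that smallest_seqno and largest_seqno move from FileMetaData onto FileDescriptor, so code such as the comparator being rewritten here reads them through fd. The boundary-update idiom that maintains them (see the UpdateBoundaries changes later in version_edit.h) starts smallest at the maximum possible value and largest at 0, so the first update sets both. A minimal sketch with simplified stand-in types, not the real definitions:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>

struct Fd {
  uint64_t smallest_seqno = std::numeric_limits<uint64_t>::max();
  uint64_t largest_seqno = 0;
};

// Fold one entry's seqno into the file's bounds, as UpdateBoundaries does.
void UpdateSeqnoBounds(Fd& fd, uint64_t seqno) {
  fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
  fd.largest_seqno = std::max(fd.largest_seqno, seqno);
}

int main() {
  Fd fd;
  for (uint64_t s : {7u, 3u, 9u}) UpdateSeqnoBounds(fd, s);
  assert(fd.smallest_seqno == 3 && fd.largest_seqno == 9);
}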
// Break ties by file number return a->fd.GetNumber() > b->fd.GetNumber(); @@ -162,22 +162,24 @@ abort(); } - if (f2->smallest_seqno == f2->largest_seqno) { + if (f2->fd.smallest_seqno == f2->fd.largest_seqno) { // This is an external file that we ingested - SequenceNumber external_file_seqno = f2->smallest_seqno; - if (!(external_file_seqno < f1->largest_seqno || + SequenceNumber external_file_seqno = f2->fd.smallest_seqno; + if (!(external_file_seqno < f1->fd.largest_seqno || external_file_seqno == 0)) { - fprintf(stderr, "L0 file with seqno %" PRIu64 " %" PRIu64 - " vs. file with global_seqno %" PRIu64 "\n", - f1->smallest_seqno, f1->largest_seqno, + fprintf(stderr, + "L0 file with seqno %" PRIu64 " %" PRIu64 + " vs. file with global_seqno %" PRIu64 "\n", + f1->fd.smallest_seqno, f1->fd.largest_seqno, external_file_seqno); abort(); } - } else if (f1->smallest_seqno <= f2->smallest_seqno) { - fprintf(stderr, "L0 files seqno %" PRIu64 " %" PRIu64 - " vs. %" PRIu64 " %" PRIu64 "\n", - f1->smallest_seqno, f1->largest_seqno, f2->smallest_seqno, - f2->largest_seqno); + } else if (f1->fd.smallest_seqno <= f2->fd.smallest_seqno) { + fprintf(stderr, + "L0 files seqno %" PRIu64 " %" PRIu64 " vs. %" PRIu64 + " %" PRIu64 "\n", + f1->fd.smallest_seqno, f1->fd.largest_seqno, + f2->fd.smallest_seqno, f2->fd.largest_seqno); abort(); } } else { @@ -322,8 +324,6 @@ // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. const auto& base_files = base_vstorage_->LevelFiles(level); - auto base_iter = base_files.begin(); - auto base_end = base_files.end(); const auto& unordered_added_files = levels_[level].added_files; vstorage->Reserve(level, base_files.size() + unordered_added_files.size()); @@ -337,30 +337,27 @@ std::sort(added_files.begin(), added_files.end(), cmp); #ifndef NDEBUG - FileMetaData* prev_file = nullptr; -#endif - + FileMetaData* prev_added_file = nullptr; for (const auto& added : added_files) { -#ifndef NDEBUG - if (level > 0 && prev_file != nullptr) { + if (level > 0 && prev_added_file != nullptr) { assert(base_vstorage_->InternalComparator()->Compare( - prev_file->smallest, added->smallest) <= 0); + prev_added_file->smallest, added->smallest) <= 0); } - prev_file = added; + prev_added_file = added; + } #endif - // Add all smaller files listed in base_ - for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); - base_iter != bpos; ++base_iter) { - MaybeAddFile(vstorage, level, *base_iter); + auto base_iter = base_files.begin(); + auto base_end = base_files.end(); + auto added_iter = added_files.begin(); + auto added_end = added_files.end(); + while (added_iter != added_end || base_iter != base_end) { + if (base_iter == base_end || + (added_iter != added_end && cmp(*added_iter, *base_iter))) { + MaybeAddFile(vstorage, level, *added_iter++); + } else { + MaybeAddFile(vstorage, level, *base_iter++); } - - MaybeAddFile(vstorage, level, added); - } - - // Add remaining base files - for (; base_iter != base_end; ++base_iter) { - MaybeAddFile(vstorage, level, *base_iter); } } @@ -382,7 +379,7 @@ } std::atomic next_file_meta_idx(0); - std::function load_handlers_func = [&]() { + std::function load_handlers_func([&]() { while (true) { size_t file_idx = next_file_meta_idx.fetch_add(1); if (file_idx >= files_meta.size()) { @@ -403,7 +400,7 @@ file_meta->table_reader_handle); } } - }; + }); std::vector threads; for (int i = 1; i < max_threads; i++) { diff -Nru rocksdb-5.15.10/db/version_builder_test.cc 
rocksdb-5.17.2/db/version_builder_test.cc --- rocksdb-5.15.10/db/version_builder_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_builder_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -63,8 +63,8 @@ f->fd = FileDescriptor(file_number, path_id, file_size); f->smallest = GetInternalKey(smallest, smallest_seq); f->largest = GetInternalKey(largest, largest_seq); - f->smallest_seqno = smallest_seqno; - f->largest_seqno = largest_seqno; + f->fd.smallest_seqno = smallest_seqno; + f->fd.largest_seqno = largest_seqno; f->compensated_file_size = file_size; f->refs = 0; f->num_entries = num_entries; diff -Nru rocksdb-5.15.10/db/version_edit.cc rocksdb-5.17.2/db/version_edit.cc --- rocksdb-5.15.10/db/version_edit.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_edit.cc 2018-11-12 19:57:32.000000000 +0000 @@ -40,13 +40,15 @@ kColumnFamilyAdd = 201, kColumnFamilyDrop = 202, kMaxColumnFamily = 203, + + kInAtomicGroup = 300, }; enum CustomTag : uint32_t { kTerminate = 1, // The end of customized fields kNeedCompaction = 2, // Since Manifest is not entirely currently forward-compatible, and the only - // forward-compatbile part is the CutsomtTag of kNewFile, we currently encode + // forward-compatible part is the CutsomtTag of kNewFile, we currently encode // kMinLogNumberToKeep as part of a CustomTag as a hack. This should be // removed when manifest becomes forward-comptabile. kMinLogNumberToKeepHack = 3, @@ -83,6 +85,8 @@ is_column_family_add_ = 0; is_column_family_drop_ = 0; column_family_name_.clear(); + is_in_atomic_group_ = false; + remaining_entries_ = 0; } bool VersionEdit::EncodeTo(std::string* dst) const { @@ -135,7 +139,7 @@ PutVarint64(dst, f.fd.GetFileSize()); PutLengthPrefixedSlice(dst, f.smallest.Encode()); PutLengthPrefixedSlice(dst, f.largest.Encode()); - PutVarint64Varint64(dst, f.smallest_seqno, f.largest_seqno); + PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno); if (has_customized_fields) { // Customized fields' format: // +-----------------------------+ @@ -200,6 +204,11 @@ if (is_column_family_drop_) { PutVarint32(dst, kColumnFamilyDrop); } + + if (is_in_atomic_group_) { + PutVarint32(dst, kInAtomicGroup); + PutVarint32(dst, remaining_entries_); + } return true; } @@ -233,14 +242,16 @@ uint64_t number; uint32_t path_id = 0; uint64_t file_size; + SequenceNumber smallest_seqno; + SequenceNumber largest_seqno; // Since this is the only forward-compatible part of the code, we hack new // extension into this record. When we do, we set this boolean to distinguish // the record from the normal NewFile records. if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) && GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) && GetInternalKey(input, &f.largest) && - GetVarint64(input, &f.smallest_seqno) && - GetVarint64(input, &f.largest_seqno)) { + GetVarint64(input, &smallest_seqno) && + GetVarint64(input, &largest_seqno)) { // See comments in VersionEdit::EncodeTo() for format of customized fields while (true) { uint32_t custom_tag; @@ -272,7 +283,7 @@ break; case kMinLogNumberToKeepHack: // This is a hack to encode kMinLogNumberToKeep in a - // forward-compatbile fashion. + // forward-compatible fashion. 
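The kInAtomicGroup record added above in EncodeTo is written with the same varint scheme as the other tags: PutVarint32 of the tag, then of remaining_entries_. For readers unfamiliar with that encoding, here is a self-contained LEB128-style varint32 writer and reader in the spirit of util/coding.h; it is an illustrative sketch, not the RocksDB implementation:

#include <cassert>
#include <cstdint>
#include <string>

// Append v as a base-128 varint: 7 payload bits per byte, high bit set on
// every byte except the last.
void PutVarint32(std::string* dst, uint32_t v) {
  while (v >= 0x80) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

// Decode a varint32 starting at *pos; advances *pos past the encoded bytes.
bool GetVarint32(const std::string& src, size_t* pos, uint32_t* out) {
  uint32_t result = 0;
  for (int shift = 0; shift <= 28 && *pos < src.size(); shift += 7) {
    uint32_t byte = static_cast<unsigned char>(src[(*pos)++]);
    result |= (byte & 0x7f) << shift;
    if ((byte & 0x80) == 0) {
      *out = result;
      return true;
    }
  }
  return false;  // truncated or malformed
}

int main() {
  const uint32_t kInAtomicGroup = 300;  // tag value introduced by this patch
  std::string record;
  PutVarint32(&record, kInAtomicGroup);
  PutVarint32(&record, 3 /* remaining entries */);

  size_t pos = 0;
  uint32_t tag = 0, remaining = 0;
  assert(GetVarint32(record, &pos, &tag) && tag == kInAtomicGroup);
  assert(GetVarint32(record, &pos, &remaining) && remaining == 3);
}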
if (!GetFixed64(&field, &min_log_number_to_keep_)) { return "deleted log number malformatted"; } @@ -289,7 +300,8 @@ } else { return "new-file4 entry"; } - f.fd = FileDescriptor(number, path_id, file_size); + f.fd = + FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno); new_files_.push_back(std::make_pair(level, f)); return nullptr; } @@ -409,13 +421,16 @@ case kNewFile2: { uint64_t number; uint64_t file_size; + SequenceNumber smallest_seqno; + SequenceNumber largest_seqno; if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && GetVarint64(&input, &file_size) && GetInternalKey(&input, &f.smallest) && GetInternalKey(&input, &f.largest) && - GetVarint64(&input, &f.smallest_seqno) && - GetVarint64(&input, &f.largest_seqno)) { - f.fd = FileDescriptor(number, 0, file_size); + GetVarint64(&input, &smallest_seqno) && + GetVarint64(&input, &largest_seqno)) { + f.fd = FileDescriptor(number, 0, file_size, smallest_seqno, + largest_seqno); new_files_.push_back(std::make_pair(level, f)); } else { if (!msg) { @@ -429,13 +444,16 @@ uint64_t number; uint32_t path_id; uint64_t file_size; + SequenceNumber smallest_seqno; + SequenceNumber largest_seqno; if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) && GetInternalKey(&input, &f.smallest) && GetInternalKey(&input, &f.largest) && - GetVarint64(&input, &f.smallest_seqno) && - GetVarint64(&input, &f.largest_seqno)) { - f.fd = FileDescriptor(number, path_id, file_size); + GetVarint64(&input, &smallest_seqno) && + GetVarint64(&input, &largest_seqno)) { + f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno, + largest_seqno); new_files_.push_back(std::make_pair(level, f)); } else { if (!msg) { @@ -473,6 +491,15 @@ is_column_family_drop_ = true; break; + case kInAtomicGroup: + is_in_atomic_group_ = true; + if (!GetVarint32(&input, &remaining_entries_)) { + if (!msg) { + msg = "remaining entries"; + } + } + break; + default: msg = "unknown tag"; break; @@ -551,6 +578,11 @@ r.append("\n MaxColumnFamily: "); AppendNumberTo(&r, max_column_family_); } + if (is_in_atomic_group_) { + r.append("\n AtomicGroup: "); + AppendNumberTo(&r, remaining_entries_); + r.append(" entries remains"); + } r.append("\n}\n"); return r; } @@ -623,6 +655,9 @@ if (has_min_log_number_to_keep_) { jw << "MinLogNumberToKeep" << min_log_number_to_keep_; } + if (is_in_atomic_group_) { + jw << "AtomicGroup" << remaining_entries_; + } jw.EndObject(); diff -Nru rocksdb-5.15.10/db/version_edit.h rocksdb-5.17.2/db/version_edit.h --- rocksdb-5.15.10/db/version_edit.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_edit.h 2018-11-12 19:57:32.000000000 +0000 @@ -27,7 +27,7 @@ extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); // A copyable structure contains information needed to read data from an SST -// file. It can contains a pointer to a table reader opened for the file, or +// file. It can contain a pointer to a table reader opened for the file, or // file number and size, which can be used to create a new table reader for it. // The behavior is undefined when a copied of the structure is used when the // file is not in any live version any more. 
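Putting the atomic-group pieces together: each edit in a group records how many group members are still to come, and the recovery path (see the Recover() changes later in this diff) buffers edits until the count reaches zero, rejecting groups whose counts do not line up. A simplified model of that buffering, with a placeholder Edit type standing in for VersionEdit:

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Placeholder for VersionEdit: only the atomic-group bookkeeping fields.
struct Edit {
  bool in_atomic_group = false;
  uint32_t remaining_entries = 0;  // entries still expected after this one
  std::string payload;
};

// Collects edits until a whole atomic group has arrived, then applies them
// together. Returns false on an inconsistent group, which mirrors the
// Corruption status returned by Recover().
class GroupReplayer {
 public:
  template <typename ApplyFn>
  bool Feed(Edit edit, const ApplyFn& apply) {
    if (!edit.in_atomic_group) {
      if (!buffer_.empty()) return false;      // group interrupted
      apply(edit);
      return true;
    }
    if (buffer_.empty()) {
      expected_ = edit.remaining_entries + 1;  // first member fixes the size
    }
    buffer_.push_back(std::move(edit));
    if (buffer_.size() + buffer_.back().remaining_entries != expected_) {
      return false;                            // counts do not line up
    }
    if (buffer_.size() == expected_) {         // group complete: apply all
      for (auto& e : buffer_) apply(e);
      buffer_.clear();
    }
    return true;
  }

 private:
  std::vector<Edit> buffer_;
  uint32_t expected_ = 0;
};

int main() {
  GroupReplayer replayer;
  int applied = 0;
  auto apply = [&](const Edit&) { ++applied; };

  // A two-member group: remaining_entries counts down 1, 0.
  assert(replayer.Feed({true, 1, "cf1 edit"}, apply));
  assert(applied == 0);                        // buffered, not yet applied
  assert(replayer.Feed({true, 0, "cf2 edit"}, apply));
  assert(applied == 2);                        // applied together
}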
@@ -36,18 +36,28 @@ TableReader* table_reader; uint64_t packed_number_and_path_id; uint64_t file_size; // File size in bytes + SequenceNumber smallest_seqno; // The smallest seqno in this file + SequenceNumber largest_seqno; // The largest seqno in this file FileDescriptor() : FileDescriptor(0, 0, 0) {} FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size) + : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {} + + FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size, + SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno) : table_reader(nullptr), packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)), - file_size(_file_size) {} + file_size(_file_size), + smallest_seqno(_smallest_seqno), + largest_seqno(_largest_seqno) {} FileDescriptor& operator=(const FileDescriptor& fd) { table_reader = fd.table_reader; packed_number_and_path_id = fd.packed_number_and_path_id; file_size = fd.file_size; + smallest_seqno = fd.smallest_seqno; + largest_seqno = fd.largest_seqno; return *this; } @@ -77,8 +87,6 @@ FileDescriptor fd; InternalKey smallest; // Smallest internal key served by table InternalKey largest; // Largest internal key served by table - SequenceNumber smallest_seqno; // The smallest seqno in this file - SequenceNumber largest_seqno; // The largest seqno in this file // Needs to be disposed when refs becomes 0. Cache::Handle* table_reader_handle; @@ -108,9 +116,7 @@ // file. FileMetaData() - : smallest_seqno(kMaxSequenceNumber), - largest_seqno(0), - table_reader_handle(nullptr), + : table_reader_handle(nullptr), compensated_file_size(0), num_entries(0), num_deletions(0), @@ -128,8 +134,8 @@ smallest.DecodeFrom(key); } largest.DecodeFrom(key); - smallest_seqno = std::min(smallest_seqno, seqno); - largest_seqno = std::max(largest_seqno, seqno); + fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); + fd.largest_seqno = std::max(fd.largest_seqno, seqno); } // Unlike UpdateBoundaries, ranges do not need to be presented in any @@ -143,8 +149,8 @@ if (largest.size() == 0 || icmp.Compare(largest, end) < 0) { largest = end; } - smallest_seqno = std::min(smallest_seqno, seqno); - largest_seqno = std::max(largest_seqno, seqno); + fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); + fd.largest_seqno = std::max(fd.largest_seqno, seqno); } }; @@ -233,17 +239,18 @@ bool marked_for_compaction) { assert(smallest_seqno <= largest_seqno); FileMetaData f; - f.fd = FileDescriptor(file, file_path_id, file_size); + f.fd = FileDescriptor(file, file_path_id, file_size, smallest_seqno, + largest_seqno); f.smallest = smallest; f.largest = largest; - f.smallest_seqno = smallest_seqno; - f.largest_seqno = largest_seqno; + f.fd.smallest_seqno = smallest_seqno; + f.fd.largest_seqno = largest_seqno; f.marked_for_compaction = marked_for_compaction; new_files_.emplace_back(level, std::move(f)); } void AddFile(int level, const FileMetaData& f) { - assert(f.smallest_seqno <= f.largest_seqno); + assert(f.fd.smallest_seqno <= f.fd.largest_seqno); new_files_.emplace_back(level, f); } @@ -293,6 +300,11 @@ return new_files_; } + void MarkAtomicGroup(uint32_t remaining_entries) { + is_in_atomic_group_ = true; + remaining_entries_ = remaining_entries; + } + std::string DebugString(bool hex_key = false) const; std::string DebugJSON(int edit_num, bool hex_key = false) const; @@ -322,7 +334,7 @@ DeletedFileSet deleted_files_; std::vector> new_files_; - // Each version edit record should have column_family_id set + // Each version edit record should have 
column_family_ set // If it's not set, it is default (0) uint32_t column_family_; // a version edit can be either column_family add or @@ -331,6 +343,9 @@ bool is_column_family_drop_; bool is_column_family_add_; std::string column_family_name_; + + bool is_in_atomic_group_; + uint32_t remaining_entries_; }; } // namespace rocksdb diff -Nru rocksdb-5.15.10/db/version_edit_test.cc rocksdb-5.17.2/db/version_edit_test.cc --- rocksdb-5.15.10/db/version_edit_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_edit_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -191,6 +191,12 @@ TestEncodeDecode(edit); } +TEST_F(VersionEditTest, AtomicGroupTest) { + VersionEdit edit; + edit.MarkAtomicGroup(1); + TestEncodeDecode(edit); +} + } // namespace rocksdb int main(int argc, char** argv) { diff -Nru rocksdb-5.15.10/db/version_set.cc rocksdb-5.17.2/db/version_set.cc --- rocksdb-5.15.10/db/version_set.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_set.cc 2018-11-12 19:57:32.000000000 +0000 @@ -21,6 +21,7 @@ #include #include #include +#include #include "db/compaction.h" #include "db/internal_stats.h" #include "db/log_reader.h" @@ -62,20 +63,12 @@ const Slice& key, uint32_t left, uint32_t right) { - while (left < right) { - uint32_t mid = (left + right) / 2; - const FdWithKeyRange& f = file_level.files[mid]; - if (icmp.InternalKeyComparator::Compare(f.largest_key, key) < 0) { - // Key at "mid.largest" is < "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. - right = mid; - } - } - return right; + auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { + return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0; + }; + const auto &b = file_level.files; + return static_cast(std::lower_bound(b + left, + b + right, key, cmp) - b); } Status OverlapWithIterator(const Comparator* ucmp, @@ -895,13 +888,16 @@ assert(!ioptions->cf_paths.empty()); file_path = ioptions->cf_paths.back().path; } - files.emplace_back( - MakeTableFileName("", file->fd.GetNumber()), file_path, - file->fd.GetFileSize(), file->smallest_seqno, file->largest_seqno, + files.emplace_back(SstFileMetaData{ + MakeTableFileName("", file->fd.GetNumber()), + file_path, + static_cast(file->fd.GetFileSize()), + file->fd.smallest_seqno, + file->fd.largest_seqno, file->smallest.user_key().ToString(), file->largest.user_key().ToString(), file->stats.num_reads_sampled.load(std::memory_order_relaxed), - file->being_compacted); + file->being_compacted}); level_size += file->fd.GetFileSize(); } cf_meta->levels.emplace_back( @@ -1212,12 +1208,9 @@ // report the counters before returning if (get_context.State() != GetContext::kNotFound && - get_context.State() != GetContext::kMerge) { - for (uint32_t t = 0; t < Tickers::TICKER_ENUM_MAX; t++) { - if (get_context.tickers_value[t] > 0) { - RecordTick(db_statistics_, t, get_context.tickers_value[t]); - } - } + get_context.State() != GetContext::kMerge && + db_statistics_ != nullptr) { + get_context.ReportCounters(); } switch (get_context.State()) { case GetContext::kNotFound: @@ -1251,10 +1244,8 @@ f = fp.GetNextFile(); } - for (uint32_t t = 0; t < Tickers::TICKER_ENUM_MAX; t++) { - if (get_context.tickers_value[t] > 0) { - RecordTick(db_statistics_, t, get_context.tickers_value[t]); - } + if (db_statistics_ != nullptr) { + get_context.ReportCounters(); } if (GetContext::kMerge == get_context.State()) { if 
(!merge_operator_) { @@ -1896,13 +1887,15 @@ case kOldestLargestSeqFirst: std::sort(temp.begin(), temp.end(), [](const Fsize& f1, const Fsize& f2) -> bool { - return f1.file->largest_seqno < f2.file->largest_seqno; + return f1.file->fd.largest_seqno < + f2.file->fd.largest_seqno; }); break; case kOldestSmallestSeqFirst: std::sort(temp.begin(), temp.end(), [](const Fsize& f1, const Fsize& f2) -> bool { - return f1.file->smallest_seqno < f2.file->smallest_seqno; + return f1.file->fd.smallest_seqno < + f2.file->fd.smallest_seqno; }); break; case kMinOverlappingRatio: @@ -1986,17 +1979,17 @@ bottommost_files_mark_threshold_ = kMaxSequenceNumber; for (auto& level_and_file : bottommost_files_) { if (!level_and_file.second->being_compacted && - level_and_file.second->largest_seqno != 0 && + level_and_file.second->fd.largest_seqno != 0 && level_and_file.second->num_deletions > 1) { // largest_seqno might be nonzero due to containing the final key in an // earlier compaction, whose seqnum we didn't zero out. Multiple deletions // ensures the file really contains deleted or overwritten keys. - if (level_and_file.second->largest_seqno < oldest_snapshot_seqnum_) { + if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) { bottommost_files_marked_for_compaction_.push_back(level_and_file); } else { bottommost_files_mark_threshold_ = std::min(bottommost_files_mark_threshold_, - level_and_file.second->largest_seqno); + level_and_file.second->fd.largest_seqno); } } } @@ -2035,57 +2028,82 @@ void VersionStorageInfo::GetOverlappingInputs( int level, const InternalKey* begin, const InternalKey* end, std::vector* inputs, int hint_index, int* file_index, - bool expand_range) const { + bool expand_range, InternalKey** next_smallest) const { if (level >= num_non_empty_levels_) { // this level is empty, no overlapping inputs return; } inputs->clear(); - Slice user_begin, user_end; - if (begin != nullptr) { - user_begin = begin->user_key(); - } - if (end != nullptr) { - user_end = end->user_key(); - } if (file_index) { *file_index = -1; } const Comparator* user_cmp = user_comparator_; if (level > 0) { - GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, - hint_index, file_index); + GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index, + file_index, false, next_smallest); return; } - for (size_t i = 0; i < level_files_brief_[level].num_files; ) { - FdWithKeyRange* f = &(level_files_brief_[level].files[i++]); - const Slice file_start = ExtractUserKey(f->smallest_key); - const Slice file_limit = ExtractUserKey(f->largest_key); - if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { - // "f" is completely before specified range; skip it - } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) { - // "f" is completely after specified range; skip it - } else { - inputs->push_back(files_[level][i-1]); - if (level == 0 && expand_range) { - // Level-0 files may overlap each other. So check if the newly - // added file has expanded the range. If so, restart search. 
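For level 0 the files may overlap one another, so the rewritten loop that follows keeps a list of not-yet-matched file indices and repeatedly sweeps it, growing [user_begin, user_end] whenever a newly matched file extends the range and stopping once a full sweep finds nothing new; matched indices are erased so they are never rechecked. A small standalone model of that fixed-point expansion over integer ranges (assumed types, not the RocksDB code):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <list>
#include <utility>
#include <vector>

using Range = std::pair<int, int>;  // [start, limit], both inclusive

// Collect every range that overlaps [begin, end] directly or through a chain
// of other collected ranges, expanding the query as matches are found.
std::vector<size_t> ExpandingOverlap(const std::vector<Range>& files,
                                     int begin, int end) {
  std::vector<size_t> inputs;
  std::list<size_t> index;
  for (size_t i = 0; i < files.size(); ++i) index.push_back(i);

  bool found = true;
  while (found && !index.empty()) {
    found = false;
    for (auto it = index.begin(); it != index.end();) {
      const Range& f = files[*it];
      if (f.second < begin || f.first > end) {
        ++it;                       // completely outside the current range
      } else {
        inputs.push_back(*it);      // overlaps: take it and grow the range
        begin = std::min(begin, f.first);
        end = std::max(end, f.second);
        it = index.erase(it);       // never re-examine a matched file
        found = true;
      }
    }
  }
  return inputs;
}

int main() {
  // File 2 only overlaps once file 0 has widened the range to include 25.
  std::vector<Range> files = {{10, 30}, {50, 60}, {25, 40}};
  auto picked = ExpandingOverlap(files, 5, 12);
  assert(picked.size() == 2);       // files 0 and 2; file 1 stays out
}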
- if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) { - user_begin = file_start; - inputs->clear(); - i = 0; - } else if (end != nullptr - && user_cmp->Compare(file_limit, user_end) > 0) { - user_end = file_limit; - inputs->clear(); - i = 0; + if (next_smallest) { + // next_smallest key only makes sense for non-level 0, where files are + // non-overlapping + *next_smallest = nullptr; + } + + Slice user_begin, user_end; + if (begin != nullptr) { + user_begin = begin->user_key(); + } + if (end != nullptr) { + user_end = end->user_key(); + } + + // index stores the file index need to check. + std::list index; + for (size_t i = 0; i < level_files_brief_[level].num_files; i++) { + index.emplace_back(i); + } + + while (!index.empty()) { + bool found_overlapping_file = false; + auto iter = index.begin(); + while (iter != index.end()) { + FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]); + const Slice file_start = ExtractUserKey(f->smallest_key); + const Slice file_limit = ExtractUserKey(f->largest_key); + if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { + // "f" is completely before specified range; skip it + iter++; + } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) { + // "f" is completely after specified range; skip it + iter++; + } else { + // if overlap + inputs->emplace_back(files_[level][*iter]); + found_overlapping_file = true; + // record the first file index. + if (file_index && *file_index == -1) { + *file_index = static_cast(*iter); + } + // the related file is overlap, erase to avoid checking again. + iter = index.erase(iter); + if (expand_range) { + if (begin != nullptr && + user_cmp->Compare(file_start, user_begin) < 0) { + user_begin = file_start; + } + if (end != nullptr && + user_cmp->Compare(file_limit, user_end) > 0) { + user_end = file_limit; + } } - } else if (file_index) { - *file_index = static_cast(i) - 1; } } + // if all the files left are not overlap, break + if (!found_overlapping_file) { + break; + } } } @@ -2186,7 +2204,7 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( int level, const InternalKey* begin, const InternalKey* end, std::vector* inputs, int hint_index, int* file_index, - bool within_interval) const { + bool within_interval, InternalKey** next_smallest) const { assert(level > 0); int min = 0; int mid = 0; @@ -2222,6 +2240,9 @@ // If there were no overlapping files, return immediately. 
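A related simplification appears earlier in this file's diff: FindFile's hand-written binary search over files ordered by largest key becomes a std::lower_bound with a comparator. A minimal analogue of that pattern over plain data, with hypothetical names; the real version compares internal keys through the InternalKeyComparator:

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

struct FileEntry {
  std::string largest_key;  // files are sorted by this field
};

// Index of the first file whose largest key is >= key, i.e. the first file
// that could contain it; returns files.size() if there is none.
size_t FindFile(const std::vector<FileEntry>& files, const std::string& key) {
  auto cmp = [](const FileEntry& f, const std::string& k) {
    return f.largest_key < k;
  };
  return static_cast<size_t>(
      std::lower_bound(files.begin(), files.end(), key, cmp) - files.begin());
}

int main() {
  std::vector<FileEntry> files = {{"c"}, {"g"}, {"m"}};
  assert(FindFile(files, "a") == 0);
  assert(FindFile(files, "h") == 2);
  assert(FindFile(files, "z") == 3);
}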
if (!foundOverlap) { + if (next_smallest) { + next_smallest = nullptr; + } return; } // returns the index where an overlap is found @@ -2242,6 +2263,15 @@ for (int i = start_index; i <= end_index; i++) { inputs->push_back(files_[level][i]); } + + if (next_smallest != nullptr) { + // Provide the next key outside the range covered by inputs + if (++end_index < static_cast(files_[level].size())) { + **next_smallest = files_[level][end_index]->smallest; + } else { + *next_smallest = nullptr; + } + } } // Store in *start_index and *end_index the range of all files in @@ -2422,7 +2452,7 @@ AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt)); int ret = snprintf(scratch->buffer + len, sz, "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ", - f->fd.GetNumber(), f->smallest_seqno, sztxt, + f->fd.GetNumber(), f->fd.smallest_seqno, sztxt, static_cast(f->being_compacted)); if (ret < 0 || ret >= sz) break; @@ -2904,16 +2934,17 @@ // create new manifest file ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_); + std::string descriptor_fname = + DescriptorFileName(dbname_, pending_manifest_file_number_); unique_ptr descriptor_file; - s = NewWritableFile( - env_, DescriptorFileName(dbname_, pending_manifest_file_number_), - &descriptor_file, opt_env_opts); + s = NewWritableFile(env_, descriptor_fname, &descriptor_file, + opt_env_opts); if (s.ok()) { descriptor_file->SetPreallocationBlockSize( db_options_->manifest_preallocation_size); - unique_ptr file_writer( - new WritableFileWriter(std::move(descriptor_file), opt_env_opts)); + unique_ptr file_writer(new WritableFileWriter( + std::move(descriptor_file), descriptor_fname, opt_env_opts)); descriptor_log_.reset( new log::Writer(std::move(file_writer), 0, false)); s = WriteSnapshot(descriptor_log_.get()); @@ -3211,6 +3242,133 @@ builder->Apply(edit); } +Status VersionSet::ApplyOneVersionEdit( + VersionEdit& edit, + const std::unordered_map& name_to_options, + std::unordered_map& column_families_not_found, + std::unordered_map& builders, + bool* have_log_number, uint64_t* /* log_number */, + bool* have_prev_log_number, uint64_t* previous_log_number, + bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, + SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, + uint32_t* max_column_family) { + // Not found means that user didn't supply that column + // family option AND we encountered column family add + // record. Once we encounter column family drop record, + // we will delete the column family from + // column_families_not_found. 
+ bool cf_in_not_found = (column_families_not_found.find(edit.column_family_) != + column_families_not_found.end()); + // in builders means that user supplied that column family + // option AND that we encountered column family add record + bool cf_in_builders = builders.find(edit.column_family_) != builders.end(); + + // they can't both be true + assert(!(cf_in_not_found && cf_in_builders)); + + ColumnFamilyData* cfd = nullptr; + + if (edit.is_column_family_add_) { + if (cf_in_builders || cf_in_not_found) { + return Status::Corruption( + "Manifest adding the same column family twice: " + + edit.column_family_name_); + } + auto cf_options = name_to_options.find(edit.column_family_name_); + if (cf_options == name_to_options.end()) { + column_families_not_found.insert( + {edit.column_family_, edit.column_family_name_}); + } else { + cfd = CreateColumnFamily(cf_options->second, &edit); + cfd->set_initialized(); + builders.insert( + {edit.column_family_, new BaseReferencedVersionBuilder(cfd)}); + } + } else if (edit.is_column_family_drop_) { + if (cf_in_builders) { + auto builder = builders.find(edit.column_family_); + assert(builder != builders.end()); + delete builder->second; + builders.erase(builder); + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + assert(cfd != nullptr); + if (cfd->Unref()) { + delete cfd; + cfd = nullptr; + } else { + // who else can have reference to cfd!? + assert(false); + } + } else if (cf_in_not_found) { + column_families_not_found.erase(edit.column_family_); + } else { + return Status::Corruption( + "Manifest - dropping non-existing column family"); + } + } else if (!cf_in_not_found) { + if (!cf_in_builders) { + return Status::Corruption( + "Manifest record referencing unknown column family"); + } + + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // this should never happen since cf_in_builders is true + assert(cfd != nullptr); + + // if it is not column family add or column family drop, + // then it's a file add/delete, which should be forwarded + // to builder + auto builder = builders.find(edit.column_family_); + assert(builder != builders.end()); + builder->second->version_builder()->Apply(&edit); + } + + if (cfd != nullptr) { + if (edit.has_log_number_) { + if (cfd->GetLogNumber() > edit.log_number_) { + ROCKS_LOG_WARN( + db_options_->info_log, + "MANIFEST corruption detected, but ignored - Log numbers in " + "records NOT monotonically increasing"); + } else { + cfd->SetLogNumber(edit.log_number_); + *have_log_number = true; + } + } + if (edit.has_comparator_ && + edit.comparator_ != cfd->user_comparator()->Name()) { + return Status::InvalidArgument( + cfd->user_comparator()->Name(), + "does not match existing comparator " + edit.comparator_); + } + } + + if (edit.has_prev_log_number_) { + *previous_log_number = edit.prev_log_number_; + *have_prev_log_number = true; + } + + if (edit.has_next_file_number_) { + *next_file = edit.next_file_number_; + *have_next_file = true; + } + + if (edit.has_max_column_family_) { + *max_column_family = edit.max_column_family_; + } + + if (edit.has_min_log_number_to_keep_) { + *min_log_number_to_keep = + std::max(*min_log_number_to_keep, edit.min_log_number_to_keep_); + } + + if (edit.has_last_sequence_) { + *last_sequence = edit.last_sequence_; + *have_last_sequence = true; + } + return Status::OK(); +} + Status VersionSet::Recover( const std::vector& column_families, bool read_only) { @@ -3296,9 +3454,11 @@ VersionSet::LogReporter reporter; reporter.status = &s; log::Reader 
reader(nullptr, std::move(manifest_file_reader), &reporter, - true /*checksum*/, 0 /*initial_offset*/, 0); + true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; + std::vector replay_buffer; + size_t num_entries_decoded = 0; while (reader.ReadRecord(&record, &scratch) && s.ok()) { VersionEdit edit; s = edit.DecodeFrom(record); @@ -3306,123 +3466,44 @@ break; } - // Not found means that user didn't supply that column - // family option AND we encountered column family add - // record. Once we encounter column family drop record, - // we will delete the column family from - // column_families_not_found. - bool cf_in_not_found = - column_families_not_found.find(edit.column_family_) != - column_families_not_found.end(); - // in builders means that user supplied that column family - // option AND that we encountered column family add record - bool cf_in_builders = - builders.find(edit.column_family_) != builders.end(); - - // they can't both be true - assert(!(cf_in_not_found && cf_in_builders)); - - ColumnFamilyData* cfd = nullptr; - - if (edit.is_column_family_add_) { - if (cf_in_builders || cf_in_not_found) { - s = Status::Corruption( - "Manifest adding the same column family twice"); - break; - } - auto cf_options = cf_name_to_options.find(edit.column_family_name_); - if (cf_options == cf_name_to_options.end()) { - column_families_not_found.insert( - {edit.column_family_, edit.column_family_name_}); - } else { - cfd = CreateColumnFamily(cf_options->second, &edit); - cfd->set_initialized(); - builders.insert( - {edit.column_family_, new BaseReferencedVersionBuilder(cfd)}); - } - } else if (edit.is_column_family_drop_) { - if (cf_in_builders) { - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - delete builder->second; - builders.erase(builder); - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - if (cfd->Unref()) { - delete cfd; - cfd = nullptr; - } else { - // who else can have reference to cfd!? 
- assert(false); - } - } else if (cf_in_not_found) { - column_families_not_found.erase(edit.column_family_); - } else { - s = Status::Corruption( - "Manifest - dropping non-existing column family"); - break; - } - } else if (!cf_in_not_found) { - if (!cf_in_builders) { - s = Status::Corruption( - "Manifest record referencing unknown column family"); - break; - } - - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // this should never happen since cf_in_builders is true - assert(cfd != nullptr); - - // if it is not column family add or column family drop, - // then it's a file add/delete, which should be forwarded - // to builder - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - builder->second->version_builder()->Apply(&edit); - } - - if (cfd != nullptr) { - if (edit.has_log_number_) { - if (cfd->GetLogNumber() > edit.log_number_) { - ROCKS_LOG_WARN( - db_options_->info_log, - "MANIFEST corruption detected, but ignored - Log numbers in " - "records NOT monotonically increasing"); - } else { - cfd->SetLogNumber(edit.log_number_); - have_log_number = true; + if (edit.is_in_atomic_group_) { + if (replay_buffer.empty()) { + replay_buffer.resize(edit.remaining_entries_ + 1); + } + ++num_entries_decoded; + if (num_entries_decoded + edit.remaining_entries_ != + static_cast(replay_buffer.size())) { + return Status::Corruption("corrupted atomic group"); + } + replay_buffer[num_entries_decoded - 1] = std::move(edit); + if (num_entries_decoded == replay_buffer.size()) { + for (auto& e : replay_buffer) { + s = ApplyOneVersionEdit( + e, cf_name_to_options, column_families_not_found, builders, + &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, + &have_last_sequence, &last_sequence, &min_log_number_to_keep, + &max_column_family); + if (!s.ok()) { + break; + } } + replay_buffer.clear(); + num_entries_decoded = 0; } - if (edit.has_comparator_ && - edit.comparator_ != cfd->user_comparator()->Name()) { - s = Status::InvalidArgument( - cfd->user_comparator()->Name(), - "does not match existing comparator " + edit.comparator_); - break; + } else { + if (!replay_buffer.empty()) { + return Status::Corruption("corrupted atomic group"); } + s = ApplyOneVersionEdit( + edit, cf_name_to_options, column_families_not_found, builders, + &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, + &have_last_sequence, &last_sequence, &min_log_number_to_keep, + &max_column_family); } - - if (edit.has_prev_log_number_) { - previous_log_number = edit.prev_log_number_; - have_prev_log_number = true; - } - - if (edit.has_next_file_number_) { - next_file = edit.next_file_number_; - have_next_file = true; - } - - if (edit.has_max_column_family_) { - max_column_family = edit.max_column_family_; - } - - if (edit.has_min_log_number_to_keep_) { - min_log_number_to_keep = - std::max(min_log_number_to_keep, edit.min_log_number_to_keep_); - } - - if (edit.has_last_sequence_) { - last_sequence = edit.last_sequence_; - have_last_sequence = true; + if (!s.ok()) { + break; } } } @@ -3578,8 +3659,8 @@ column_family_names.insert({0, kDefaultColumnFamilyName}); VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(nullptr, std::move(file_reader), &reporter, true /*checksum*/, - 0 /*initial_offset*/, 0); + log::Reader reader(nullptr, std::move(file_reader), &reporter, + true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; while 
(reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -3739,7 +3820,7 @@ VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, - true /*checksum*/, 0 /*initial_offset*/, 0); + true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -3968,7 +4049,7 @@ cfd->current()->storage_info()->LevelFiles(level)) { edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno, + f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction); } } @@ -4290,11 +4371,11 @@ } filemetadata.name = MakeTableFileName("", file->fd.GetNumber()); filemetadata.level = level; - filemetadata.size = file->fd.GetFileSize(); + filemetadata.size = static_cast(file->fd.GetFileSize()); filemetadata.smallestkey = file->smallest.user_key().ToString(); filemetadata.largestkey = file->largest.user_key().ToString(); - filemetadata.smallest_seqno = file->smallest_seqno; - filemetadata.largest_seqno = file->largest_seqno; + filemetadata.smallest_seqno = file->fd.smallest_seqno; + filemetadata.largest_seqno = file->fd.largest_seqno; metadata->push_back(filemetadata); } } diff -Nru rocksdb-5.15.10/db/version_set.h rocksdb-5.17.2/db/version_set.h --- rocksdb-5.15.10/db/version_set.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_set.h 2018-11-12 19:57:32.000000000 +0000 @@ -52,7 +52,6 @@ } class Compaction; -class InternalIterator; class LogBuffer; class LookupKey; class MemTable; @@ -189,9 +188,11 @@ std::vector* inputs, int hint_index = -1, // index of overlap file int* file_index = nullptr, // return index of overlap file - bool expand_range = true) // if set, returns files which overlap the - const; // range and overlap each other. If false, + bool expand_range = true, // if set, returns files which overlap the + // range and overlap each other. If false, // then just files intersecting the range + InternalKey** next_smallest = nullptr) // if non-null, returns the + const; // smallest key of next file not included void GetCleanInputsWithinInterval( int level, const InternalKey* begin, // nullptr means before all keys const InternalKey* end, // nullptr means after all keys @@ -201,14 +202,15 @@ const; void GetOverlappingInputsRangeBinarySearch( - int level, // level > 0 + int level, // level > 0 const InternalKey* begin, // nullptr means before all keys const InternalKey* end, // nullptr means after all keys std::vector* inputs, int hint_index, // index of overlap file int* file_index, // return index of overlap file - bool within_interval = false) // if set, force the inputs within interval - const; + bool within_interval = false, // if set, force the inputs within interval + InternalKey** next_smallest = nullptr) // if non-null, returns the + const; // smallest key of next file not included void ExtendFileRangeOverlappingInterval( int level, @@ -729,6 +731,10 @@ } }; +namespace { +class BaseReferencedVersionBuilder; +} + class VersionSet { public: VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options, @@ -832,6 +838,11 @@ // Allocate and return a new file number uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); } + // Fetch And Add n new file number + uint64_t FetchAddFileNumber(uint64_t n) { + return next_file_number_.fetch_add(n); + } + // Return the last sequence number. 
uint64_t LastSequence() const { return last_sequence_.load(std::memory_order_acquire); @@ -985,6 +996,16 @@ ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, VersionEdit* edit); + Status ApplyOneVersionEdit( + VersionEdit& edit, + const std::unordered_map& name_to_opts, + std::unordered_map& column_families_not_found, + std::unordered_map& builders, + bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family); + Status ProcessManifestWrites(std::deque& writers, InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log, diff -Nru rocksdb-5.15.10/db/version_set_test.cc rocksdb-5.17.2/db/version_set_test.cc --- rocksdb-5.15.10/db/version_set_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/version_set_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -566,7 +566,7 @@ manifest, &file, env_->OptimizeForManifestWrite(env_options_)); ASSERT_OK(s); unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options_)); + new WritableFileWriter(std::move(file), manifest, env_options_)); { log::Writer log(std::move(file_writer), 0, false); std::string record; diff -Nru rocksdb-5.15.10/db/wal_manager.cc rocksdb-5.17.2/db/wal_manager.cc --- rocksdb-5.15.10/db/wal_manager.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/wal_manager.cc 2018-11-12 19:57:32.000000000 +0000 @@ -237,7 +237,7 @@ } size_t const files_keep_num = - db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size; + static_cast(db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size); if (log_files_num <= files_keep_num) { return; } @@ -352,7 +352,7 @@ // Binary Search. avoid opening all files. while (end >= start) { int64_t mid = start + (end - start) / 2; // Avoid overflow. - SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence(); + SequenceNumber current_seq_num = all_logs.at(static_cast(mid))->StartSequence(); if (current_seq_num == target) { end = mid; break; @@ -363,7 +363,7 @@ } } // end could be -ve. 
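The casts added in wal_manager.cc above also guard the subtle point noted in the "end could be -ve" comment: end comes out of a signed binary search and can be negative, so it must be clamped at zero before conversion to size_t, because an unguarded cast would wrap to a huge index. A tiny illustration with assumed values:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
  int64_t end = -1;  // e.g. the target sequence precedes every WAL file

  // Wrong: converting a negative value straight to size_t wraps around.
  size_t wrapped = static_cast<size_t>(end);
  assert(wrapped == static_cast<size_t>(-1));  // enormous index

  // What the patch does: clamp first, then convert.
  size_t start_index =
      static_cast<size_t>(std::max(static_cast<int64_t>(0), end));
  assert(start_index == 0);
  return 0;
}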
- size_t start_index = std::max(static_cast(0), end); + size_t start_index = static_cast(std::max(static_cast(0), end)); // The last wal file is always included all_logs.erase(all_logs.begin(), all_logs.begin() + start_index); return Status::OK(); @@ -457,7 +457,7 @@ reporter.status = &status; reporter.ignore_error = !db_options_.paranoid_checks; log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter, - true /*checksum*/, 0 /*initial_offset*/, number); + true /*checksum*/, number); std::string scratch; Slice record; diff -Nru rocksdb-5.15.10/db/wal_manager_test.cc rocksdb-5.17.2/db/wal_manager_test.cc --- rocksdb-5.15.10/db/wal_manager_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/wal_manager_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -79,7 +79,7 @@ unique_ptr file; ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options_)); + new WritableFileWriter(std::move(file), fname, env_options_)); current_log_writer_.reset(new log::Writer(std::move(file_writer), 0, false)); } @@ -130,7 +130,7 @@ ASSERT_EQ(s, 0U); unique_ptr file_writer( - new WritableFileWriter(std::move(file), EnvOptions())); + new WritableFileWriter(std::move(file), path, EnvOptions())); log::Writer writer(std::move(file_writer), 1, db_options_.recycle_log_file_num > 0); WriteBatch batch; diff -Nru rocksdb-5.15.10/db/write_batch.cc rocksdb-5.17.2/db/write_batch.cc --- rocksdb-5.15.10/db/write_batch.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/write_batch.cc 2018-11-12 19:57:32.000000000 +0000 @@ -727,6 +727,11 @@ ContentFlags::HAS_END_PREPARE | ContentFlags::HAS_BEGIN_PREPARE, std::memory_order_relaxed); + if (unprepared_batch) { + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_BEGIN_UNPREPARE, + std::memory_order_relaxed); + } return Status::OK(); } diff -Nru rocksdb-5.15.10/db/write_thread.cc rocksdb-5.17.2/db/write_thread.cc --- rocksdb-5.15.10/db/write_thread.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/write_thread.cc 2018-11-12 19:57:32.000000000 +0000 @@ -24,7 +24,10 @@ enable_pipelined_write_(db_options.enable_pipelined_write), newest_writer_(nullptr), newest_memtable_writer_(nullptr), - last_sequence_(0) {} + last_sequence_(0), + write_stall_dummy_(), + stall_mu_(), + stall_cv_(&stall_mu_) {} uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) { // We're going to block. Lazily create the mutex. We guarantee @@ -219,6 +222,28 @@ assert(w->state == STATE_INIT); Writer* writers = newest_writer->load(std::memory_order_relaxed); while (true) { + // If write stall in effect, and w->no_slowdown is not true, + // block here until stall is cleared. 
If its true, then return + // immediately + if (writers == &write_stall_dummy_) { + if (w->no_slowdown) { + w->status = Status::Incomplete("Write stall"); + SetState(w, STATE_COMPLETED); + return false; + } + // Since no_slowdown is false, wait here to be notified of the write + // stall clearing + { + MutexLock lock(&stall_mu_); + writers = newest_writer->load(std::memory_order_relaxed); + if (writers == &write_stall_dummy_) { + stall_cv_.Wait(); + // Load newest_writers_ again since it may have changed + writers = newest_writer->load(std::memory_order_relaxed); + continue; + } + } + } w->link_older = writers; if (newest_writer->compare_exchange_weak(writers, w)) { return (writers == nullptr); @@ -303,12 +328,44 @@ SetState(w, STATE_COMPLETED); } +void WriteThread::BeginWriteStall() { + LinkOne(&write_stall_dummy_, &newest_writer_); + + // Walk writer list until w->write_group != nullptr. The current write group + // will not have a mix of slowdown/no_slowdown, so its ok to stop at that + // point + Writer* w = write_stall_dummy_.link_older; + Writer* prev = &write_stall_dummy_; + while (w != nullptr && w->write_group == nullptr) { + if (w->no_slowdown) { + prev->link_older = w->link_older; + w->status = Status::Incomplete("Write stall"); + SetState(w, STATE_COMPLETED); + w = prev->link_older; + } else { + prev = w; + w = w->link_older; + } + } +} + +void WriteThread::EndWriteStall() { + MutexLock lock(&stall_mu_); + + assert(newest_writer_.load(std::memory_order_relaxed) == &write_stall_dummy_); + newest_writer_.exchange(write_stall_dummy_.link_older); + + // Wake up writers + stall_cv_.SignalAll(); +} + static WriteThread::AdaptationContext jbg_ctx("JoinBatchGroup"); void WriteThread::JoinBatchGroup(Writer* w) { TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Start", w); assert(w->batch != nullptr); bool linked_as_leader = LinkOne(w, &newest_writer_); + if (linked_as_leader) { SetState(w, STATE_GROUP_LEADER); } diff -Nru rocksdb-5.15.10/db/write_thread.h rocksdb-5.17.2/db/write_thread.h --- rocksdb-5.15.10/db/write_thread.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/db/write_thread.h 2018-11-12 19:57:32.000000000 +0000 @@ -342,6 +342,13 @@ return last_sequence_; } + // Insert a dummy writer at the tail of the write queue to indicate a write + // stall, and fail any writers in the queue with no_slowdown set to true + void BeginWriteStall(); + + // Remove the dummy writer and wake up waiting writers + void EndWriteStall(); + private: // See AwaitState. const uint64_t max_yield_usec_; @@ -365,6 +372,17 @@ // is not necessary visible to reads because the writer can be ongoing. SequenceNumber last_sequence_; + // A dummy writer to indicate a write stall condition. This will be inserted + // at the tail of the writer queue by the leader, so newer writers can just + // check for this and bail + Writer write_stall_dummy_; + + // Mutex and condvar for writers to block on a write stall. During a write + // stall, writers with no_slowdown set to false will wait on this rather + // on the writer queue + port::Mutex stall_mu_; + port::CondVar stall_cv_; + // Waits for w->state & goal_mask using w->StateMutex(). Returns // the state that satisfies goal_mask. 
uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask); diff -Nru rocksdb-5.15.10/debian/changelog rocksdb-5.17.2/debian/changelog --- rocksdb-5.15.10/debian/changelog 2018-11-21 21:07:17.000000000 +0000 +++ rocksdb-5.17.2/debian/changelog 2018-12-19 17:01:38.000000000 +0000 @@ -1,3 +1,23 @@ +rocksdb (5.17.2-3) unstable; urgency=medium + + * Backport fix for snprintf() buffer overflow. + * Upload to Sid. + + -- Laszlo Boszormenyi (GCS) Wed, 19 Dec 2018 17:01:38 +0000 + +rocksdb (5.17.2-2) experimental; urgency=medium + + * Backport upstream fix for db_bench_tool.cc FTBFS. + + -- Laszlo Boszormenyi (GCS) Sun, 09 Dec 2018 06:00:39 +0000 + +rocksdb (5.17.2-1) experimental; urgency=medium + + * New upstream release. + * Library transition from librocksdb5.15 to librocksdb5.17 . + + -- Laszlo Boszormenyi (GCS) Thu, 22 Nov 2018 16:25:48 +0100 + rocksdb (5.15.10-2) unstable; urgency=medium * Remove ppc64 from build architectures, upstream doesn't want to diff -Nru rocksdb-5.15.10/debian/control rocksdb-5.17.2/debian/control --- rocksdb-5.15.10/debian/control 2018-11-21 21:07:17.000000000 +0000 +++ rocksdb-5.17.2/debian/control 2018-11-22 15:25:48.000000000 +0000 @@ -4,12 +4,12 @@ Maintainer: Laszlo Boszormenyi (GCS) Build-Depends: debhelper (>= 11), libgflags-dev, libsnappy-dev, libbz2-dev, zlib1g-dev, liblz4-dev, libzstd-dev Standards-Version: 4.2.1 -Homepage: http://rocksdb.org/ +Homepage: https://rocksdb.org/ Package: librocksdb-dev Section: libdevel Architecture: amd64 arm64 ppc64el mips mipsel mips64el sparc64 s390x i386 -Depends: ${misc:Depends}, librocksdb5.15 (= ${binary:Version}) +Depends: ${misc:Depends}, librocksdb5.17 (= ${binary:Version}) Conflicts: librocksdb5.7 Replaces: librocksdb5.7 Description: persistent Key-Value Store for Flash and RAM Storage (development) @@ -36,7 +36,7 @@ This package contains libraries and header files for developing applications that use librocksdb . -Package: librocksdb5.15 +Package: librocksdb5.17 Section: libs Architecture: amd64 arm64 ppc64el mips mipsel mips64el sparc64 s390x i386 Depends: ${misc:Depends}, ${shlibs:Depends} diff -Nru rocksdb-5.15.10/debian/copyright rocksdb-5.17.2/debian/copyright --- rocksdb-5.15.10/debian/copyright 2017-10-01 07:44:53.000000000 +0000 +++ rocksdb-5.17.2/debian/copyright 2018-11-22 15:25:48.000000000 +0000 @@ -1,6 +1,6 @@ -Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: RocksDB -Upstream-Contact: https://code.facebook.com/projects/ +Upstream-Contact: https://opensource.fb.com/ Source: https://github.com/facebook/rocksdb Copyright: Copyright (C) 2013- Facebook Database Engineering Team @@ -18,7 +18,7 @@ Files: java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java Copyright: Copyright (C) 2011 Dain Sundstrom , - Copyright (C) 2011 FuseSource Corp. http://fusesource.com + Copyright (C) 2011 FuseSource Corp. https://fusesource.com License: Apache-2.0 Files: java/rocksjni.pom @@ -35,7 +35,7 @@ Version 2.0 (the "License"); you may not use this work except in compliance with the License. You may obtain a copy of the License at . - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 . On Debian systems, the complete text of the Apache License Version 2.0 can be found in the file '/usr/share/common-licenses/Apache-2.0'. @@ -65,7 +65,7 @@ GNU General Public License for more details. . 
You should have received a copy of the GNU General Public License - along with this program. If not, see . + along with this program. If not, see . . On Debian systems, the complete text of the GNU General Public License 3 can be found in the file `/usr/share/common-licenses/GPL-3'. diff -Nru rocksdb-5.15.10/debian/librocksdb5.15.install rocksdb-5.17.2/debian/librocksdb5.15.install --- rocksdb-5.15.10/debian/librocksdb5.15.install 2017-10-01 07:44:53.000000000 +0000 +++ rocksdb-5.17.2/debian/librocksdb5.15.install 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -usr/lib/lib*.so.*.* diff -Nru rocksdb-5.15.10/debian/librocksdb5.17.install rocksdb-5.17.2/debian/librocksdb5.17.install --- rocksdb-5.15.10/debian/librocksdb5.17.install 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/debian/librocksdb5.17.install 2017-10-01 07:44:53.000000000 +0000 @@ -0,0 +1 @@ +usr/lib/lib*.so.*.* diff -Nru rocksdb-5.15.10/debian/patches/fix_db_bench_tool_FTBFS.patch rocksdb-5.17.2/debian/patches/fix_db_bench_tool_FTBFS.patch --- rocksdb-5.15.10/debian/patches/fix_db_bench_tool_FTBFS.patch 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/debian/patches/fix_db_bench_tool_FTBFS.patch 2018-11-22 15:25:48.000000000 +0000 @@ -0,0 +1,48 @@ +From f959e88048642f3548065d07306ad7a0ffdeaa7e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= +Date: Fri, 19 Oct 2018 14:43:55 -0700 +Subject: [PATCH] Fix printf formatting on MacOS (#4533) + +Summary: +On MacOS with clang the compilation of _tools/db_bench_tool.cc_ always fails because the format used in a `fprintf` call has the wrong type. This PR should hopefully fix this issue +``` +tools/db_bench_tool.cc:4233:61: error: format specifies type 'unsigned long long' but the argument has type 'size_t' (aka 'unsigned long') +``` +Pull Request resolved: https://github.com/facebook/rocksdb/pull/4533 + +Differential Revision: D10471657 + +Pulled By: maysamyabandeh + +fbshipit-source-id: f20f5f3756d3571b586c895c845d0d4d1e34a398 +--- + .travis.yml | 2 +- + tools/db_bench_tool.cc | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/.travis.yml b/.travis.yml +index b366da2517..e759a642a0 100644 +--- a/.travis.yml ++++ b/.travis.yml +@@ -90,7 +90,7 @@ script: + OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=full_filter_block_test make -j4 check_some + ;; + 2) +- OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=full_filter_block_test ROCKSDBTESTS_END=write_batch_with_index_test make -j4 check_some ++ OPT=-DTRAVIS V=1 make -j4 tools && OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=full_filter_block_test ROCKSDBTESTS_END=write_batch_with_index_test make -j4 check_some + ;; + 3) + OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=write_batch_with_index_test ROCKSDBTESTS_END=write_prepared_transaction_test make -j4 check_some +diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc +index 1a68ad6548..a416b91abe 100644 +--- a/tools/db_bench_tool.cc ++++ b/tools/db_bench_tool.cc +@@ -4237,7 +4237,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { + } + if (levelMeta.level == 0) { + for (auto& fileMeta : levelMeta.files) { +- fprintf(stdout, "Level[%d]: %s(size: %" PRIu64 " bytes)\n", ++ fprintf(stdout, "Level[%d]: %s(size: %" ROCKSDB_PRIszt " bytes)\n", + levelMeta.level, fileMeta.name.c_str(), fileMeta.size); + } + } else { diff -Nru rocksdb-5.15.10/debian/patches/fix_snprintf_buffer_overflow_bug.patch rocksdb-5.17.2/debian/patches/fix_snprintf_buffer_overflow_bug.patch --- rocksdb-5.15.10/debian/patches/fix_snprintf_buffer_overflow_bug.patch 
1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/debian/patches/fix_snprintf_buffer_overflow_bug.patch 2018-12-19 17:01:38.000000000 +0000 @@ -0,0 +1,38 @@ +From 1fb68055271bc4cf879325db49f8c4266bbcb5e6 Mon Sep 17 00:00:00 2001 +From: Maysam Yabandeh +Date: Fri, 5 Oct 2018 14:49:01 -0700 +Subject: [PATCH] Fix snprintf buffer overflow bug (#4465) + +Summary: +The contract of snprintf says that it returns "The number of characters that would have been written if n had been sufficiently large" http://www.cplusplus.com/reference/cstdio/snprintf/ +The existing code however was assuming that the return value is the actual number of written bytes and uses that to reposition the starting point on the next call to snprintf. This leads to buffer overflow when the last call to snprintf has filled up the buffer. +Pull Request resolved: https://github.com/facebook/rocksdb/pull/4465 + +Differential Revision: D10224080 + +Pulled By: maysamyabandeh + +fbshipit-source-id: 40f44e122d15b0db439812a0a361167cf012de3e +--- + db/compaction.cc | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/db/compaction.cc b/db/compaction.cc +index 4ea92d5cc7..b3921eb4bc 100644 +--- a/db/compaction.cc ++++ b/db/compaction.cc +@@ -331,12 +331,14 @@ const char* Compaction::InputLevelSummary( + if (!is_first) { + len += + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + "); ++ len = std::min(len, static_cast(sizeof(scratch->buffer))); + } else { + is_first = false; + } + len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + "%" ROCKSDB_PRIszt "@%d", input_level.size(), + input_level.level); ++ len = std::min(len, static_cast(sizeof(scratch->buffer))); + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + " files to L%d", output_level()); diff -Nru rocksdb-5.15.10/debian/patches/series rocksdb-5.17.2/debian/patches/series --- rocksdb-5.15.10/debian/patches/series 2018-09-18 20:52:12.000000000 +0000 +++ rocksdb-5.17.2/debian/patches/series 2018-12-19 17:01:38.000000000 +0000 @@ -1,3 +1,5 @@ install_dir-is-destdir.patch build_reproducible.patch verbose_build.patch +fix_db_bench_tool_FTBFS.patch +fix_snprintf_buffer_overflow_bug.patch diff -Nru rocksdb-5.15.10/debian/rules rocksdb-5.17.2/debian/rules --- rocksdb-5.15.10/debian/rules 2018-07-17 16:29:50.000000000 +0000 +++ rocksdb-5.17.2/debian/rules 2018-11-22 15:25:48.000000000 +0000 @@ -17,7 +17,11 @@ override_dh_auto_install: dh_auto_install --destdir=$(CURDIR)/debian/tmp/usr/ +override_dh_missing: + dh_missing --list-missing + %: - dh $@ --fail-missing + dh $@ -.PHONY: override_dh_auto_build override_dh_auto_test override_dh_auto_install +.PHONY: override_dh_auto_build override_dh_auto_test \ + override_dh_auto_install override_dh_missing diff -Nru rocksdb-5.15.10/docs/_data/authors.yml rocksdb-5.17.2/docs/_data/authors.yml --- rocksdb-5.15.10/docs/_data/authors.yml 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/docs/_data/authors.yml 2018-11-12 19:57:32.000000000 +0000 @@ -60,3 +60,7 @@ lightmark: full_name: Aaron Gao fbid: 1351549072 + +fgwu: + full_name: Fenggang Wu + fbid: 100002297362180 diff -Nru rocksdb-5.15.10/docs/feed.xml rocksdb-5.17.2/docs/feed.xml --- rocksdb-5.15.10/docs/feed.xml 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/docs/feed.xml 2018-11-12 19:57:32.000000000 +0000 @@ -6,7 +6,7 @@ {{ site.title | xml_escape }} {{ site.description | xml_escape }} - {{ absolute_url }}/ + https://rocksdb.org/feed.xml {{ site.time | date_to_rfc822 }} {{ site.time | date_to_rfc822 }} diff -Nru 
rocksdb-5.15.10/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown rocksdb-5.17.2/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown --- rocksdb-5.15.10/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,58 @@ +--- +title: Rocksdb Tuning Advisor +layout: post +author: poojam23 +category: blog +--- + +The performance of Rocksdb is contingent on its tuning. However, because +of the complexity of its underlying technology and a large number of +configurable parameters, a good configuration is sometimes hard to obtain. The aim of +the Python command-line tool, Rocksdb Advisor, is to automate the process of +suggesting improvements in the configuration based on advice from Rocksdb +experts. + +### Overview + +Experts share their wisdom as rules comprising conditions and suggestions in the INI format (refer +[rules.ini](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rules.ini)). +Users provide the Rocksdb configuration that they want to improve upon (as the +familiar Rocksdb OPTIONS file — +[example](https://github.com/facebook/rocksdb/blob/master/examples/rocksdb_option_file_example.ini)) +and the path of the file which contains Rocksdb logs and statistics. +The [Advisor](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser_example.py) +creates appropriate DataSource objects (for Rocksdb +[logs](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_log_parser.py), +[options](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_options_parser.py), +[statistics](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_stats_fetcher.py) etc.) +and provides them to the [Rules Engine](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser.py). +The Rules Engine uses these expert-provided rules to parse the data-sources and trigger the appropriate rules. +The Advisor's output gives information about which rules were triggered, +why they were triggered and what each of them suggests. Each suggestion +provided by a triggered rule advises some action on a Rocksdb +configuration option, for example, increase CFOptions.write_buffer_size, +set bloom_bits to 2, etc. + +### Usage + +An example command to run the tool: + +```shell +cd rocksdb/tools/advisor +python3 -m advisor.rule_parser_example --rules_spec=advisor/rules.ini --rocksdb_options=test/input_files/OPTIONS-000005 --log_files_path_prefix=test/input_files/LOG-0 --stats_dump_period_sec=20 +``` + +Sample output where a Rocksdb log-based rule has been triggered: + +```shell +Rule: stall-too-many-memtables +LogCondition: stall-too-many-memtables regex: Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+ +Suggestion: inc-bg-flush option : DBOptions.max_background_flushes action : increase suggested_values : ['2'] +Suggestion: inc-write-buffer option : CFOptions.max_write_buffer_number action : increase +scope: col_fam: +{'default'} +``` + +### Read more + +For more information, refer to [advisor](https://github.com/facebook/rocksdb/tree/master/tools/advisor/README.md).
diff -Nru rocksdb-5.15.10/docs/_posts/2018-08-23-data-block-hash-index.markdown rocksdb-5.17.2/docs/_posts/2018-08-23-data-block-hash-index.markdown --- rocksdb-5.15.10/docs/_posts/2018-08-23-data-block-hash-index.markdown 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/docs/_posts/2018-08-23-data-block-hash-index.markdown 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,118 @@ +--- +title: Improving Point-Lookup Using Data Block Hash Index +layout: post +author: fgwu +category: blog +--- +We've designed and implemented a _data block hash index_ in RocksDB that has the benefit of both reducing CPU utilization and increasing throughput for point lookup queries, with a reasonable and tunable space overhead. + +Specifically, we append a compact hash table to the end of the data block for efficient indexing. It is backward compatible with databases created without this feature. After the hash index feature is turned on, existing data will gradually be converted to the hash index format. + +Benchmarks with `db_bench` show the CPU utilization of one of the main functions in the point lookup code path, `DataBlockIter::Seek()`, is reduced by 21.8%, and the overall RocksDB throughput is increased by 10% under purely cached workloads, at an overhead of 4.6% more space. Shadow testing with Facebook production traffic shows good CPU improvements too. + + +### How to use it +Two new options are added as part of this feature: `BlockBasedTableOptions::data_block_index_type` and `BlockBasedTableOptions::data_block_hash_table_util_ratio`. + +The hash index is disabled by default unless `BlockBasedTableOptions::data_block_index_type` is set to `data_block_index_type = kDataBlockBinaryAndHash`. The hash table utilization ratio is adjustable using `BlockBasedTableOptions::data_block_hash_table_util_ratio`, which is valid only if `data_block_index_type = kDataBlockBinaryAndHash`. + + +``` +// the definitions can be found in include/rocksdb/table.h + +// The index type that will be used for the data block. +enum DataBlockIndexType : char { + kDataBlockBinarySearch = 0, // traditional block type + kDataBlockBinaryAndHash = 1, // additional hash index +}; + +// Set to kDataBlockBinaryAndHash to enable hash index +DataBlockIndexType data_block_index_type = kDataBlockBinarySearch; + +// #entries/#buckets. It is valid only when data_block_hash_index_type is +// kDataBlockBinaryAndHash. +double data_block_hash_table_util_ratio = 0.75; + +``` + + +### Data Block Hash Index Design + +The current data block format groups adjacent keys together as a restart interval. One block consists of multiple restart intervals. The byte offset of the beginning of each restart interval, i.e. a restart point, is stored in an array called the restart interval index or binary seek index. RocksDB does a binary search when performing point lookup for keys in data blocks to find the restart interval where the key may reside. We will use binary seek and binary search interchangeably in this post. + +In order to find the right location where the key may reside using binary search, multiple key parsings and comparisons are needed. Each binary search branch can trigger a CPU cache miss, consuming significant CPU. We have seen that this binary search takes up considerable CPU in production use-cases. + +![](/static/images/data-block-hash-index/block-format-binary-seek.png) + +We implemented a hash map at the end of the block to index the keys and reduce the CPU overhead of the binary search.
The hash index is just an array of pointers pointing into the binary seek index. + +![](/static/images/data-block-hash-index/block-format-hash-index.png) + + +Each array element is considered a hash bucket when storing the location of a key (or more precisely, the restart index of the restart interval where the key resides). When multiple keys happen to hash into the same bucket (hash collision), we just mark the bucket as “collision”, so that a later query on that key knows a hash collision happened and can fall back to the traditional binary search to find the location of the key. + +We define the hash table utilization ratio as #keys/#buckets. If the utilization ratio is 0.5 and there are 100 buckets, 50 keys are stored in those buckets. The lower the util ratio, the fewer hash collisions, and the lower the chance that a point lookup falls back to binary seek (the fall-back ratio) due to a collision. So a small util ratio does more to reduce CPU time but introduces more space overhead. + +Space overhead depends on the util ratio. Each bucket is a `uint8_t` (i.e. one byte). For a util ratio of 1, the space overhead is 1 byte per key, and the observed fall-back ratio is ~52%. + +![](/static/images/data-block-hash-index/hash-index-data-structure.png) + +### Things that Need Attention + +**Customized Comparator** + +The hash index hashes different keys (keys with different content, or byte sequences) into different hash values. This assumes the comparator will not treat different keys as equal if they have different content. + +The default bytewise comparator orders the keys in alphabetical order and works well with the hash index, as different keys will never be regarded as equal. However, some specially crafted comparators will. For example, a `StringToIntComparator` might convert a string into an integer and use the integer to perform the comparison. The key strings “16” and “0x10” are equal as seen by this `StringToIntComparator`, but they probably hash to different values. Later queries for one form of the key will not be able to find the existing key stored in the other form. + +We add a new function member to the comparator interface: + +``` +virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; } +``` + + +Every comparator implementation should override this function and specify the behavior of the comparator. If a comparator can regard different keys as equal, the function returns true, and as a result the hash index feature will not be enabled, and vice versa. + +NOTE: to use the hash index feature, one should 1) have a comparator that can never treat different keys as equal; and 2) override the `CanKeysWithDifferentByteContentsBeEqual()` function to return `false`, so the hash index can be enabled. + + +**Util Ratio's Impact on Data Block Cache** + +Adding the hash index to the end of the data block essentially takes up data block cache space, making the effective data block cache size smaller and increasing the data block cache miss ratio. Therefore, a very small util ratio will result in a large data block cache miss ratio, and the extra I/O may drag down the throughput gain achieved by the hash index lookup. Besides, when compression is enabled, a cache miss also incurs data block decompression, which is CPU-consuming. Therefore CPU usage may even increase if the util ratio is too small. The best util ratio depends on workloads, cache to data ratio, disk bandwidth/latency etc.
In our experiment, we found that util ratio = 0.5 ~ 1 is a good range to explore, bringing both CPU and throughput gains. + + +### Limitations + +As we use a `uint8_t` to store the binary seek index, i.e. the restart interval index, the total number of restart intervals cannot be more than 253 (we reserved 255 and 254 as special flags). For blocks having a larger number of restart intervals, the hash index will not be created and the point lookup will be done by traditional binary seek. + +The data block hash index only supports point lookup. We do not support range lookup. Range lookup requests will fall back to BinarySeek. + +RocksDB supports many types of records, such as `Put`, `Delete`, `Merge`, etc. (visit [here](https://github.com/facebook/rocksdb/wiki/rocksdb-basics) for more information). Currently we only support `Put` and `Delete`, but not `Merge`. Internally we have a limited set of supported record types: + + +``` +kPutRecord, <=== supported +kDeleteRecord, <=== supported +kSingleDeleteRecord, <=== supported +kTypeBlobIndex, <=== supported +``` + +For records not supported, the searching process will fall back to the traditional binary seek. + + + +### Evaluation +To evaluate the CPU utilization reduction and isolate it from other factors such as disk I/O and block decompression, we first evaluate the hash index in a purely cached workload. We observe that the CPU utilization of one of the main functions in the point lookup code path, DataBlockIter::Seek(), is reduced by 21.8% and the overall throughput is increased by 10% at an overhead of 4.6% more space. + +However, a general workload is not always purely cached, so we also evaluate the performance under different levels of cache space pressure. In the following test, we use `db_bench` with RocksDB deployed on SSDs. The total DB size is 5~6GB, and it is about 14GB if decompressed. Different block cache sizes are used, ranging from 14GB down to 2GB, with an increasing cache miss ratio. + +Orange bars represent our hash index performance. We use a hash util ratio of 1.0 in this test. Block size is set to 16KiB with the restart interval set to 16. + +![](/static/images/data-block-hash-index/perf-throughput.png) +![](/static/images/data-block-hash-index/perf-cache-miss.png) + +We can see that if the cache size is greater than 8GB, the hash index can bring a throughput gain. A cache size greater than 8GB translates to a cache miss ratio smaller than 40%. So if the workload has a cache miss ratio smaller than 40%, the hash index is able to increase the throughput. + +Besides, shadow testing with Facebook production traffic shows good CPU improvements too.
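To make the two options above concrete, here is a minimal illustrative sketch (not taken from the post or the patch) of how an application might enable the data block hash index through the standard block-based table factory; the helper function name is invented for illustration, while the option and enum names are the ones described above (the enum is nested in `BlockBasedTableOptions` in `include/rocksdb/table.h`):

```
// Minimal sketch: enable the data block hash index on a column family.
// Assumes the usual BlockBasedTable setup; MakeHashIndexOptions is a
// hypothetical helper, not part of RocksDB.
#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakeHashIndexOptions() {
  rocksdb::BlockBasedTableOptions table_options;
  // Opt in to the additional hash index appended to each data block.
  table_options.data_block_index_type =
      rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;
  // #entries/#buckets; a smaller ratio means fewer collisions (and fewer
  // fall-backs to binary seek) at the cost of more space per block.
  table_options.data_block_hash_table_util_ratio = 0.75;

  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}
```

Keep in mind that, as discussed above, the hash index only takes effect when the comparator in use reports `CanKeysWithDifferentByteContentsBeEqual()` as `false`.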
+ Binary files /tmp/tmpCpQiov/sdWDYeQVrI/rocksdb-5.15.10/docs/static/images/binaryseek.png and /tmp/tmpCpQiov/KYFbIzS3v6/rocksdb-5.17.2/docs/static/images/binaryseek.png differ Binary files /tmp/tmpCpQiov/sdWDYeQVrI/rocksdb-5.15.10/docs/static/images/data-block-hash-index/block-format-binary-seek.png and /tmp/tmpCpQiov/KYFbIzS3v6/rocksdb-5.17.2/docs/static/images/data-block-hash-index/block-format-binary-seek.png differ Binary files /tmp/tmpCpQiov/sdWDYeQVrI/rocksdb-5.15.10/docs/static/images/data-block-hash-index/block-format-hash-index.png and /tmp/tmpCpQiov/KYFbIzS3v6/rocksdb-5.17.2/docs/static/images/data-block-hash-index/block-format-hash-index.png differ Binary files /tmp/tmpCpQiov/sdWDYeQVrI/rocksdb-5.15.10/docs/static/images/data-block-hash-index/hash-index-data-structure.png and /tmp/tmpCpQiov/KYFbIzS3v6/rocksdb-5.17.2/docs/static/images/data-block-hash-index/hash-index-data-structure.png differ Binary files /tmp/tmpCpQiov/sdWDYeQVrI/rocksdb-5.15.10/docs/static/images/data-block-hash-index/perf-cache-miss.png and /tmp/tmpCpQiov/KYFbIzS3v6/rocksdb-5.17.2/docs/static/images/data-block-hash-index/perf-cache-miss.png differ Binary files /tmp/tmpCpQiov/sdWDYeQVrI/rocksdb-5.15.10/docs/static/images/data-block-hash-index/perf-throughput.png and /tmp/tmpCpQiov/KYFbIzS3v6/rocksdb-5.17.2/docs/static/images/data-block-hash-index/perf-throughput.png differ diff -Nru rocksdb-5.15.10/env/env_posix.cc rocksdb-5.17.2/env/env_posix.cc --- rocksdb-5.15.10/env/env_posix.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/env/env_posix.cc 2018-11-12 19:57:32.000000000 +0000 @@ -20,11 +20,12 @@ #include #include #include -#if defined(OS_LINUX) || defined(OS_SOLARIS) +#if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) #include #include #include #endif +#include #include #include #include @@ -102,6 +103,18 @@ std::string filename; }; +int cloexec_flags(int flags, const EnvOptions* options) { + // If the system supports opening the file with cloexec enabled, + // do so, as this avoids a race condition if a db is opened around + // the same time that a child process is forked +#ifdef O_CLOEXEC + if (options == nullptr || options->set_fd_cloexec) { + flags |= O_CLOEXEC; + } +#endif + return flags; +} + class PosixEnv : public Env { public: PosixEnv(); @@ -133,7 +146,7 @@ const EnvOptions& options) override { result->reset(); int fd = -1; - int flags = O_RDONLY; + int flags = cloexec_flags(O_RDONLY, &options); FILE* file = nullptr; if (options.use_direct_reads && !options.use_mmap_reads) { @@ -184,7 +197,8 @@ result->reset(); Status s; int fd; - int flags = O_RDONLY; + int flags = cloexec_flags(O_RDONLY, &options); + if (options.use_direct_reads && !options.use_mmap_reads) { #ifdef ROCKSDB_LITE return Status::IOError(fname, "Direct I/O not supported in RocksDB lite"); @@ -266,6 +280,8 @@ flags |= O_WRONLY; } + flags = cloexec_flags(flags, &options); + do { IOSTATS_TIMER_GUARD(open_nanos); fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); @@ -354,6 +370,8 @@ flags |= O_WRONLY; } + flags = cloexec_flags(flags, &options); + do { IOSTATS_TIMER_GUARD(open_nanos); fd = open(old_fname.c_str(), flags, @@ -415,9 +433,12 @@ unique_ptr* result, const EnvOptions& options) override { int fd = -1; + int flags = cloexec_flags(O_RDWR, &options); + while (fd < 0) { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), O_RDWR, GetDBFileMode(allow_non_owner_access_)); + + fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); if (fd < 0) { // Error while 
opening the file if (errno == EINTR) { @@ -437,9 +458,11 @@ unique_ptr* result) override { int fd = -1; Status status; + int flags = cloexec_flags(O_RDWR, nullptr); + while (fd < 0) { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), O_RDWR, 0644); + fd = open(fname.c_str(), flags, 0644); if (fd < 0) { // Error while opening the file if (errno == EINTR) { @@ -477,9 +500,10 @@ unique_ptr* result) override { result->reset(); int fd; + int flags = cloexec_flags(0, nullptr); { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(name.c_str(), 0); + fd = open(name.c_str(), flags); } if (fd < 0) { return IOError("While open directory", name, errno); @@ -496,7 +520,8 @@ return Status::OK(); } - switch (errno) { + int err = errno; + switch (err) { case EACCES: case ELOOP: case ENAMETOOLONG: @@ -504,8 +529,8 @@ case ENOTDIR: return Status::NotFound(); default: - assert(result == EIO || result == ENOMEM); - return Status::IOError("Unexpected error(" + ToString(result) + + assert(err == EIO || err == ENOMEM); + return Status::IOError("Unexpected error(" + ToString(err) + ") accessing file `" + fname + "' "); } } @@ -663,9 +688,11 @@ } int fd; + int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr); + { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + fd = open(fname.c_str(), flags, 0644); } if (fd < 0) { result = IOError("while open a file for lock", fname, errno); @@ -751,12 +778,30 @@ return gettid(pthread_self()); } + virtual Status GetFreeSpace(const std::string& fname, + uint64_t* free_space) override { + struct statvfs sbuf; + + if (statvfs(fname.c_str(), &sbuf) < 0) { + return IOError("While doing statvfs", fname, errno); + } + + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + return Status::OK(); + } + virtual Status NewLogger(const std::string& fname, shared_ptr* result) override { FILE* f; { IOSTATS_TIMER_GUARD(open_nanos); - f = fopen(fname.c_str(), "w"); + f = fopen(fname.c_str(), "w" +#ifdef __GLIBC_PREREQ +#if __GLIBC_PREREQ(2, 7) + "e" // glibc extension to enable O_CLOEXEC +#endif +#endif + ); } if (f == nullptr) { result->reset(); diff -Nru rocksdb-5.15.10/env/mock_env.cc rocksdb-5.17.2/env/mock_env.cc --- rocksdb-5.15.10/env/mock_env.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/env/mock_env.cc 2018-11-12 19:57:32.000000000 +0000 @@ -201,7 +201,7 @@ if (n > available) { n = available; } - pos_ += n; + pos_ += static_cast(n); return Status::OK(); } diff -Nru rocksdb-5.15.10/env/posix_logger.h rocksdb-5.17.2/env/posix_logger.h --- rocksdb-5.15.10/env/posix_logger.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/env/posix_logger.h 2018-11-12 19:57:32.000000000 +0000 @@ -165,7 +165,6 @@ size_t sz = fwrite(base, 1, write_size, file_); flush_pending_ = true; - assert(sz == write_size); if (sz > 0) { log_size_ += write_size; } diff -Nru rocksdb-5.15.10/.gitignore rocksdb-5.17.2/.gitignore --- rocksdb-5.15.10/.gitignore 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/.gitignore 2018-11-12 19:57:32.000000000 +0000 @@ -45,6 +45,8 @@ rocksdb_dump rocksdb_undump db_test2 +trace_analyzer +trace_analyzer_test java/out java/target diff -Nru rocksdb-5.15.10/HISTORY.md rocksdb-5.17.2/HISTORY.md --- rocksdb-5.15.10/HISTORY.md 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/HISTORY.md 2018-11-12 19:57:32.000000000 +0000 @@ -1,43 +1,58 @@ # Rocksdb Change Log -### 5.15.10 (9/13/2018) -### Bug Fixes -* Fix RocksDB Java build and tests. 
-### 5.15.9 (9/4/2018) +# 5.17.2 (10/24/2018) ### Bug Fixes -* Fix compilation errors on OS X clang due to '-Wsuggest-override'. +* Fix the bug that WriteBatchWithIndex's SeekForPrev() doesn't see the entries with the same key. -## 5.15.8 (8/31/2018) +# 5.17.1 (10/16/2018) ### Bug Fixes -* Further avoid creating empty SSTs and subsequently deleting them during compaction. +* Fix slow flush/compaction when DB contains many snapshots. The problem became noticeable to us in DBs with 100,000+ snapshots, though it will affect others at different thresholds. +* Properly set the stop key for a truncated manual CompactRange +* Fix corner case where a write group leader blocked due to write stall blocks other writers in queue with WriteOptions::no_slowdown set. -## 5.15.7 (8/24/2018) -### Bug Fixes -* Avoid creating empty SSTs and subsequently deleting them in certain cases during compaction. +### New Features +* Introduced CacheAllocator, which lets the user specify custom allocator for memory in block cache. -## 5.15.6 (8/21/2018) +## 5.17.0 (10/05/2018) ### Public API Change -* The merge operands are passed to `MergeOperator::ShouldMerge` in the reversed order relative to how they were merged (passed to FullMerge or FullMergeV2) for performance reasons +* `OnTableFileCreated` will now be called for empty files generated during compaction. In that case, `TableFileCreationInfo::file_path` will be "(nil)" and `TableFileCreationInfo::file_size` will be zero. +* Add `FlushOptions::allow_write_stall`, which controls whether Flush calls start working immediately, even if it causes user writes to stall, or will wait until flush can be performed without causing write stall (similar to `CompactRangeOptions::allow_write_stall`). Note that the default value is false, meaning we add delay to Flush calls until stalling can be avoided when possible. This is behavior change compared to previous RocksDB versions, where Flush calls didn't check if they might cause stall or not. +* Application using PessimisticTransactionDB is expected to rollback/commit recovered transactions before starting new ones. This assumption is used to skip concurrency control during recovery. + +### New Features +* TransactionOptions::skip_concurrency_control allows pessimistic transactions to skip the overhead of concurrency control. Could be used for optimizing certain transactions or during recovery. -## 5.15.5 (8/16/2018) ### Bug Fixes -* Fix VerifyChecksum() API not preserving options +* Avoid creating empty SSTs and subsequently deleting them in certain cases during compaction. +* Sync CURRENT file contents during checkpoint. -## 5.15.4 (8/11/2018) +## 5.16.3 (10/1/2018) ### Bug Fixes -* Fix a bug caused by not generating OnTableFileCreated() notification for a 0-byte SST. +* Fix crash caused when `CompactFiles` run with `CompactionOptions::compression == CompressionType::kDisableCompressionOption`. Now that setting causes the compression type to be chosen according to the column family-wide compression options. -## 5.15.3 (8/10/2018) +## 5.16.2 (9/21/2018) ### Bug Fixes -* Fix a bug in misreporting the estimated partition index size in properties block. +* Fix bug in partition filters with format_version=4. -## 5.15.2 (8/9/2018) +## 5.16.1 (9/17/2018) ### Bug Fixes -* Return correct usable_size for BlockContents. +* Remove trace_analyzer_tool from rocksdb_lib target in TARGETS file. +* Fix RocksDB Java build and tests. +* Remove sync point in Block destructor. 
+ +## 5.16.0 (8/21/2018) +### Public API Change +* The merge operands are passed to `MergeOperator::ShouldMerge` in the reversed order relative to how they were merged (passed to FullMerge or FullMergeV2) for performance reasons +* GetAllKeyVersions() to take an extra argument of `max_num_ikeys`. +* Using ZSTD dictionary trainer (i.e., setting `CompressionOptions::zstd_max_train_bytes` to a nonzero value) now requires ZSTD version 1.1.3 or later. + +### New Features +* Changes the format of index blocks by delta encoding the index values, which are the block handles. This saves the encoding of BlockHandle::offset of the non-head index entries in each restart interval. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 4 or above is used. +* Add a new tool: trace_analyzer. Trace_analyzer analyzes the trace file generated by using trace_replay API. It can convert the binary format trace file to a human readable txt file, output the statistics of the analyzed query types such as access statistics and size statistics, combining the dumped whole key space file to analyze, support query correlation analyzing, and etc. Current supported query types are: Get, Put, Delete, SingleDelete, DeleteRange, Merge, Iterator (Seek, SeekForPrev only). +* Add hash index support to data blocks, which helps reducing the cpu utilization of point-lookup operations. This feature is backward compatible with the data block created without the hash index. It is disabled by default unless BlockBasedTableOptions::data_block_index_type is set to data_block_index_type = kDataBlockBinaryAndHash. -## 5.15.1 (8/1/2018) ### Bug Fixes -* Prevent dereferencing invalid STL iterators when there are range tombstones in ingested files. +* Fix a bug in misreporting the estimated partition index size in properties block. ## 5.15.0 (7/17/2018) ### Public API Change @@ -48,12 +63,13 @@ * The "rocksdb.num.entries" table property no longer counts range deletion tombstones as entries. ### New Features -* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatbile but not forward compatible. It is disabled by default unless format_version 3 or above is used. +* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 3 or above is used. * Avoid memcpy when reading mmap files with OpenReadOnly and max_open_files==-1. * Support dynamically changing `ColumnFamilyOptions::ttl` via `SetOptions()`. * Add a new table property, "rocksdb.num.range-deletions", which counts the number of range deletion tombstones in the table. * Improve the performance of iterators doing long range scans by using readahead, when using direct IO. * pin_top_level_index_and_filter (default true) in BlockBasedTableOptions can be used in combination with cache_index_and_filter_blocks to prefetch and pin the top-level index of partitioned index and filter blocks in cache. It has no impact when cache_index_and_filter_blocks is false. +* Write properties meta-block at the end of block-based table to save read-ahead IO. 
### Bug Fixes * Fix deadlock with enable_pipelined_write=true and max_successive_merges > 0 @@ -172,7 +188,8 @@ * `BackupableDBOptions::max_valid_backups_to_open == 0` now means no backups will be opened during BackupEngine initialization. Previously this condition disabled limiting backups opened. * `DBOptions::preserve_deletes` is a new option that allows one to specify that DB should not drop tombstones for regular deletes if they have sequence number larger than what was set by the new API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)`. Disabled by default. * API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)` was added, users who wish to preserve deletes are expected to periodically call this function to advance the cutoff seqnum (all deletes made before this seqnum can be dropped by DB). It's user responsibility to figure out how to advance the seqnum in the way so the tombstones are kept for the desired period of time, yet are eventually processed in time and don't eat up too much space. -* `ReadOptions::iter_start_seqnum` was added; if set to something > 0 user will see 2 changes in iterators behavior 1) only keys written with sequence larger than this parameter would be returned and 2) the `Slice` returned by iter->key() now points to the memory that keep User-oriented representation of the internal key, rather than user key. New struct `FullKey` was added to represent internal keys, along with a new helper function `ParseFullKey(const Slice& internal_key, FullKey* result);`. +* `ReadOptions::iter_start_seqnum` was added; +if set to something > 0 user will see 2 changes in iterators behavior 1) only keys written with sequence larger than this parameter would be returned and 2) the `Slice` returned by iter->key() now points to the memory that keep User-oriented representation of the internal key, rather than user key. New struct `FullKey` was added to represent internal keys, along with a new helper function `ParseFullKey(const Slice& internal_key, FullKey* result);`. * Deprecate trash_dir param in NewSstFileManager, right now we will rename deleted files to .trash instead of moving them to trash directory * Allow setting a custom trash/DB size ratio limit in the SstFileManager, after which files that are to be scheduled for deletion are deleted immediately, regardless of any delete ratelimit. * Return an error on write if write_options.sync = true and write_options.disableWAL = true to warn user of inconsistent options. Previously we will not write to WAL and not respecting the sync options in this case. diff -Nru rocksdb-5.15.10/include/rocksdb/c.h rocksdb-5.17.2/include/rocksdb/c.h --- rocksdb-5.15.10/include/rocksdb/c.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/c.h 2018-11-12 19:57:32.000000000 +0000 @@ -42,9 +42,6 @@ (5) All of the pointer arguments must be non-NULL. 
*/ -#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_ -#define STORAGE_ROCKSDB_INCLUDE_C_H_ - #pragma once #ifdef _WIN32 @@ -126,6 +123,8 @@ typedef struct rocksdb_checkpoint_t rocksdb_checkpoint_t; typedef struct rocksdb_wal_iterator_t rocksdb_wal_iterator_t; typedef struct rocksdb_wal_readoptions_t rocksdb_wal_readoptions_t; +typedef struct rocksdb_memory_consumers_t rocksdb_memory_consumers_t; +typedef struct rocksdb_memory_usage_t rocksdb_memory_usage_t; /* DB operations */ @@ -831,6 +830,12 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions( + rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs( + rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_compactions( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_base_background_compactions( @@ -1669,8 +1674,33 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnableslice_value( const rocksdb_pinnableslice_t* t, size_t* vlen); +extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t* + rocksdb_memory_consumers_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db( + rocksdb_memory_consumers_t* consumers, rocksdb_t* db); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_cache( + rocksdb_memory_consumers_t* consumers, rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_destroy( + rocksdb_memory_consumers_t* consumers); +extern ROCKSDB_LIBRARY_API rocksdb_memory_usage_t* +rocksdb_approximate_memory_usage_create(rocksdb_memory_consumers_t* consumers, + char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_approximate_memory_usage_destroy( + rocksdb_memory_usage_t* usage); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_total( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_unflushed( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_mem_table_readers_total( + rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_approximate_memory_usage_get_cache_total( + rocksdb_memory_usage_t* memory_usage); + #ifdef __cplusplus } /* end extern "C" */ #endif - -#endif /* STORAGE_ROCKSDB_INCLUDE_C_H_ */ diff -Nru rocksdb-5.15.10/include/rocksdb/cleanable.h rocksdb-5.17.2/include/rocksdb/cleanable.h --- rocksdb-5.15.10/include/rocksdb/cleanable.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/cleanable.h 2018-11-12 19:57:32.000000000 +0000 @@ -16,8 +16,7 @@ // non-const method, all threads accessing the same Iterator must use // external synchronization. 
-#ifndef INCLUDE_ROCKSDB_CLEANABLE_H_ -#define INCLUDE_ROCKSDB_CLEANABLE_H_ +#pragma once namespace rocksdb { @@ -78,5 +77,3 @@ }; } // namespace rocksdb - -#endif // INCLUDE_ROCKSDB_CLEANABLE_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/compaction_filter.h rocksdb-5.17.2/include/rocksdb/compaction_filter.h --- rocksdb-5.15.10/include/rocksdb/compaction_filter.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/compaction_filter.h 2018-11-12 19:57:32.000000000 +0000 @@ -6,8 +6,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ -#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ +#pragma once #include #include @@ -206,5 +205,3 @@ }; } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/comparator.h rocksdb-5.17.2/include/rocksdb/comparator.h --- rocksdb-5.15.10/include/rocksdb/comparator.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/comparator.h 2018-11-12 19:57:32.000000000 +0000 @@ -6,8 +6,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ -#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ +#pragma once #include @@ -74,6 +73,12 @@ const Slice& /*t*/) const { return false; } + + // return true if two keys with different byte sequences can be regarded + // as equal by this comparator. + // The major use case is to determine if DataBlockHashIndex is compatible + // with the customized comparator. + virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; } }; // Return a builtin comparator that uses lexicographic byte-wise @@ -86,5 +91,3 @@ extern const Comparator* ReverseBytewiseComparator(); } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/db.h rocksdb-5.17.2/include/rocksdb/db.h --- rocksdb-5.15.10/include/rocksdb/db.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/db.h 2018-11-12 19:57:32.000000000 +0000 @@ -6,8 +6,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_ -#define STORAGE_ROCKSDB_INCLUDE_DB_H_ +#pragma once #include #include @@ -53,6 +52,7 @@ class WriteBatch; class Env; class EventListener; +class TraceWriter; using std::unique_ptr; @@ -949,14 +949,14 @@ // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup // Retrieve the list of all files in the database. The files are - // relative to the dbname and are not absolute paths. The valid size of the - // manifest file is returned in manifest_file_size. The manifest file is an - // ever growing file, but only the portion specified by manifest_file_size is - // valid for this snapshot. - // Setting flush_memtable to true does Flush before recording the live files. - // Setting flush_memtable to false is useful when we don't want to wait for - // flush which may have to wait for compaction to complete taking an - // indeterminate time. + // relative to the dbname and are not absolute paths. Despite being relative + // paths, the file names begin with "/". The valid size of the manifest file + // is returned in manifest_file_size. 
The manifest file is an ever growing + // file, but only the portion specified by manifest_file_size is valid for + // this snapshot. Setting flush_memtable to true does Flush before recording + // the live files. Setting flush_memtable to false is useful when we don't + // want to wait for flush which may have to wait for compaction to complete + // taking an indeterminate time. // // In case you have multiple column families, even if flush_memtable is true, // you still need to call GetSortedWalFiles after GetLiveFiles to compensate @@ -996,11 +996,6 @@ std::vector* /*metadata*/) {} // Obtains the meta data of the specified column family of the DB. - // Status::NotFound() will be returned if the current DB does not have - // any column family match the specified name. - // - // If cf_name is not specified, then the metadata of the default - // column family will be returned. virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, ColumnFamilyMetaData* /*metadata*/) {} @@ -1173,6 +1168,15 @@ return Status::NotSupported("PromoteL0() is not implemented."); } + // Trace DB operations. Use EndTrace() to stop tracing. + virtual Status StartTrace(const TraceOptions& /*options*/, + std::unique_ptr&& /*trace_writer*/) { + return Status::NotSupported("StartTrace() is not implemented."); + } + + virtual Status EndTrace() { + return Status::NotSupported("EndTrace() is not implemented."); + } #endif // ROCKSDB_LITE // Needed for StackableDB @@ -1216,5 +1220,3 @@ #endif } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_DB_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/env.h rocksdb-5.17.2/include/rocksdb/env.h --- rocksdb-5.15.10/include/rocksdb/env.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/env.h 2018-11-12 19:57:32.000000000 +0000 @@ -14,8 +14,7 @@ // All Env implementations are safe for concurrent access from // multiple threads without any external synchronization. -#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_ -#define STORAGE_ROCKSDB_INCLUDE_ENV_H_ +#pragma once #include #include @@ -478,6 +477,15 @@ // Returns the ID of the current thread. virtual uint64_t GetThreadID() const; +// This seems to clash with a macro on Windows, so #undef it here +#undef GetFreeSpace + + // Get the amount of free disk space + virtual Status GetFreeSpace(const std::string& /*path*/, + uint64_t* /*diskfree*/) { + return Status::NotSupported(); + } + protected: // The pointer to an internal structure that will update the // status of each thread. @@ -1267,5 +1275,3 @@ Env* NewTimedEnv(Env* base_env); } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_ENV_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/filter_policy.h rocksdb-5.17.2/include/rocksdb/filter_policy.h --- rocksdb-5.15.10/include/rocksdb/filter_policy.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/filter_policy.h 2018-11-12 19:57:32.000000000 +0000 @@ -17,8 +17,7 @@ // Most people will want to use the builtin bloom filter support (see // NewBloomFilterPolicy() below). 
-#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ -#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ +#pragma once #include #include @@ -149,5 +148,3 @@ extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, bool use_block_based_builder = true); } - -#endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/iterator.h rocksdb-5.17.2/include/rocksdb/iterator.h --- rocksdb-5.15.10/include/rocksdb/iterator.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/iterator.h 2018-11-12 19:57:32.000000000 +0000 @@ -16,8 +16,7 @@ // non-const method, all threads accessing the same Iterator must use // external synchronization. -#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ -#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ +#pragma once #include #include "rocksdb/cleanable.h" @@ -119,5 +118,3 @@ extern Iterator* NewErrorIterator(const Status& status); } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/ldb_tool.h rocksdb-5.17.2/include/rocksdb/ldb_tool.h --- rocksdb-5.15.10/include/rocksdb/ldb_tool.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/ldb_tool.h 2018-11-12 19:57:32.000000000 +0000 @@ -2,8 +2,8 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #pragma once +#ifndef ROCKSDB_LITE #include #include #include "rocksdb/db.h" diff -Nru rocksdb-5.15.10/include/rocksdb/listener.h rocksdb-5.17.2/include/rocksdb/listener.h --- rocksdb-5.15.10/include/rocksdb/listener.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/listener.h 2018-11-12 19:57:32.000000000 +0000 @@ -27,6 +27,7 @@ kFlush, kCompaction, kRecovery, + kMisc, }; struct TableFileCreationBriefInfo { @@ -103,6 +104,7 @@ kDeleteFiles = 0x08, kAutoCompaction = 0x09, kManualFlush = 0x0a, + kErrorRecovery = 0xb, }; enum class BackgroundErrorReason { @@ -393,6 +395,21 @@ // returns. Otherwise, RocksDB may be blocked. virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {} + // A callback function for RocksDB which will be called just before + // starting the automatic recovery process for recoverable background + // errors, such as NoSpace(). The callback can suppress the automatic + // recovery by setting *auto_recovery to false. The database will then + // have to be transitioned out of read-only mode by calling DB::Resume() + virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */, + Status /* bg_error */, + bool* /* auto_recovery */) {} + + // A callback function for RocksDB which will be called once the database + // is recovered from read-only mode after an error. When this is called, it + // means normal writes to the database can be issued and the user can + // initiate any further recovery actions needed + virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {} + virtual ~EventListener() {} }; diff -Nru rocksdb-5.15.10/include/rocksdb/memtablerep.h rocksdb-5.17.2/include/rocksdb/memtablerep.h --- rocksdb-5.15.10/include/rocksdb/memtablerep.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/memtablerep.h 2018-11-12 19:57:32.000000000 +0000 @@ -144,6 +144,14 @@ // or any writes done directly to entries accessed through the iterator.) virtual void MarkReadOnly() { } + // Notify this table rep that it has been flushed to stable storage. 
+ // By default, does nothing. + // + // Invariant: MarkReadOnly() is called, before MarkFlushed(). + // Note that this method if overridden, should not run for an extended period + // of time. Otherwise, RocksDB may be blocked. + virtual void MarkFlushed() { } + // Look up key from the mem table, since the first key in the mem table whose // user_key matches the one given k, call the function callback_func(), with // callback_args directly forwarded as the first parameter, and the mem table diff -Nru rocksdb-5.15.10/include/rocksdb/merge_operator.h rocksdb-5.17.2/include/rocksdb/merge_operator.h --- rocksdb-5.15.10/include/rocksdb/merge_operator.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/merge_operator.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ -#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ +#pragma once #include #include @@ -241,5 +240,3 @@ }; } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/metadata.h rocksdb-5.17.2/include/rocksdb/metadata.h --- rocksdb-5.15.10/include/rocksdb/metadata.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/metadata.h 2018-11-12 19:57:32.000000000 +0000 @@ -65,7 +65,7 @@ num_reads_sampled(0), being_compacted(false) {} SstFileMetaData(const std::string& _file_name, const std::string& _path, - uint64_t _size, SequenceNumber _smallest_seqno, + size_t _size, SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, const std::string& _smallestkey, const std::string& _largestkey, uint64_t _num_reads_sampled, @@ -81,7 +81,7 @@ being_compacted(_being_compacted) {} // File size in bytes. - uint64_t size; + size_t size; // The name of the file. std::string name; // The full path where the file locates. diff -Nru rocksdb-5.15.10/include/rocksdb/options.h rocksdb-5.17.2/include/rocksdb/options.h --- rocksdb-5.15.10/include/rocksdb/options.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/options.h 2018-11-12 19:57:32.000000000 +0000 @@ -6,8 +6,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ -#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ +#pragma once #include #include @@ -430,6 +429,8 @@ // (i.e. the ones that are causing all the space amplification). If set to 0 // (default), we will dynamically choose the WAL size limit to be // [sum of all write_buffer_size * max_write_buffer_number] * 4 + // This option takes effect only when there are more than one column family as + // otherwise the wal size is dictated by the write_buffer_size. // Default: 0 uint64_t max_total_wal_size = 0; @@ -1181,8 +1182,13 @@ // If true, the flush will wait until the flush is done. // Default: true bool wait; - - FlushOptions() : wait(true) {} + // If true, the flush would proceed immediately even it means writes will + // stall for the duration of the flush; if false the operation will wait + // until it's possible to do flush w/o causing stall or until required flush + // is performed by someone else (foreground call or background thread). 
+ // Default: false + bool allow_write_stall; + FlushOptions() : wait(true), allow_write_stall(false) {} }; // Create a Logger from provided DBOptions @@ -1194,6 +1200,9 @@ struct CompactionOptions { // Compaction output compression type // Default: snappy + // If set to `kDisableCompressionOption`, RocksDB will choose compression type + // according to the `ColumnFamilyOptions`, taking into account the output + // level if `compression_per_level` is specified. CompressionType compression; // Compaction will create files of size `output_file_size_limit`. // Default: MAX, which means that compaction will create a single file @@ -1265,8 +1274,20 @@ // with allow_ingest_behind=true since the dawn of time. // All files will be ingested at the bottommost level with seqno=0. bool ingest_behind = false; + // Set to true if you would like to write global_seqno to a given offset in + // the external SST file for backward compatibility. Older versions of + // RocksDB writes a global_seqno to a given offset within ingested SST files, + // and new versions of RocksDB do not. If you ingest an external SST using + // new version of RocksDB and would like to be able to downgrade to an + // older version of RocksDB, you should set 'write_global_seqno' to true. If + // your service is just starting to use the new RocksDB, we recommend that + // you set this option to false, which brings two benefits: + // 1. No extra random write for global_seqno during ingestion. + // 2. Without writing external SST file, it's possible to do checksum. + // We have a plan to set this option to false by default in the future. + bool write_global_seqno = true; }; -} // namespace rocksdb +struct TraceOptions {}; -#endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ +} // namespace rocksdb diff -Nru rocksdb-5.15.10/include/rocksdb/perf_context.h rocksdb-5.17.2/include/rocksdb/perf_context.h --- rocksdb-5.15.10/include/rocksdb/perf_context.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/perf_context.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H -#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H +#pragma once #include #include @@ -176,5 +175,3 @@ PerfContext* get_perf_context(); } - -#endif diff -Nru rocksdb-5.15.10/include/rocksdb/perf_level.h rocksdb-5.17.2/include/rocksdb/perf_level.h --- rocksdb-5.15.10/include/rocksdb/perf_level.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/perf_level.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef INCLUDE_ROCKSDB_PERF_LEVEL_H_ -#define INCLUDE_ROCKSDB_PERF_LEVEL_H_ +#pragma once #include #include @@ -29,5 +28,3 @@ PerfLevel GetPerfLevel(); } // namespace rocksdb - -#endif // INCLUDE_ROCKSDB_PERF_LEVEL_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/slice.h rocksdb-5.17.2/include/rocksdb/slice.h --- rocksdb-5.15.10/include/rocksdb/slice.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/slice.h 2018-11-12 19:57:32.000000000 +0000 @@ -16,8 +16,7 @@ // non-const method, all threads accessing the same Slice must use // external synchronization. 
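The options.h hunks above add FlushOptions::allow_write_stall and IngestExternalFileOptions::write_global_seqno. A minimal sketch of both flags, assuming an already-open rocksdb::DB* db, a ColumnFamilyHandle* cf, and an externally built SST file whose path is illustrative:

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    rocksdb::Status FlushAndIngest(rocksdb::DB* db,
                                   rocksdb::ColumnFamilyHandle* cf) {
      rocksdb::FlushOptions fo;
      fo.wait = true;
      fo.allow_write_stall = false;  // wait until flushing cannot stall writes
      rocksdb::Status s = db->Flush(fo, cf);
      if (!s.ok()) {
        return s;
      }
      rocksdb::IngestExternalFileOptions ifo;
      ifo.write_global_seqno = false;  // skip the legacy in-file seqno rewrite
      return db->IngestExternalFile(cf, {"/tmp/bulk_load.sst"}, ifo);
    }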
-#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_ -#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_ +#pragma once #include #include @@ -25,6 +24,10 @@ #include #include +#ifdef __cpp_lib_string_view +#include +#endif + #include "rocksdb/cleanable.h" namespace rocksdb { @@ -41,6 +44,12 @@ /* implicit */ Slice(const std::string& s) : data_(s.data()), size_(s.size()) { } +#ifdef __cpp_lib_string_view + // Create a slice that refers to the same contents as "sv" + /* implicit */ + Slice(std::string_view sv) : data_(sv.data()), size_(sv.size()) {} +#endif + // Create a slice that refers to s[0,strlen(s)-1] /* implicit */ Slice(const char* s) : data_(s) { @@ -86,6 +95,13 @@ // when hex is true, returns a string of twice the length hex encoded (0-9A-F) std::string ToString(bool hex = false) const; +#ifdef __cpp_lib_string_view + // Return a string_view that references the same data as this slice. + std::string_view ToStringView() const { + return std::string_view(data_, size_); + } +#endif + // Decodes the current slice interpreted as an hexadecimal string into result, // if successful returns true, if this isn't a valid hex string // (e.g not coming from Slice::ToString(true)) DecodeHex returns false. @@ -239,6 +255,4 @@ return off; } -} // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_H_ +} // namespace rocksdb \ No newline at end of file diff -Nru rocksdb-5.15.10/include/rocksdb/slice_transform.h rocksdb-5.17.2/include/rocksdb/slice_transform.h --- rocksdb-5.15.10/include/rocksdb/slice_transform.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/slice_transform.h 2018-11-12 19:57:32.000000000 +0000 @@ -12,8 +12,7 @@ // define InDomain and InRange to determine which slices are in either // of these sets respectively. -#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_ -#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_ +#pragma once #include @@ -100,5 +99,3 @@ extern const SliceTransform* NewNoopTransform(); } - -#endif // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/sst_file_manager.h rocksdb-5.17.2/include/rocksdb/sst_file_manager.h --- rocksdb-5.15.10/include/rocksdb/sst_file_manager.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/sst_file_manager.h 2018-11-12 19:57:32.000000000 +0000 @@ -75,6 +75,10 @@ // Update trash/DB size ratio where new files will be deleted immediately // thread-safe virtual void SetMaxTrashDBRatio(double ratio) = 0; + + // Return the total size of trash files + // thread-safe + virtual uint64_t GetTotalTrashSize() = 0; }; // Create a new SstFileManager that can be shared among multiple RocksDB diff -Nru rocksdb-5.15.10/include/rocksdb/sst_file_writer.h rocksdb-5.17.2/include/rocksdb/sst_file_writer.h --- rocksdb-5.15.10/include/rocksdb/sst_file_writer.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/sst_file_writer.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,10 +3,10 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
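The slice.h hunk above adds std::string_view interoperability guarded by __cpp_lib_string_view. A minimal round-trip sketch, meaningful only on a C++17 standard library:

    #include <string_view>
    #include "rocksdb/slice.h"

    void StringViewRoundTrip() {
      std::string_view sv = "hello";
      rocksdb::Slice slice(sv);            // implicit Slice over the same bytes
      std::string_view back = slice.ToStringView();  // zero-copy view back out
      (void)back;
    }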
-#ifndef ROCKSDB_LITE - #pragma once +#ifndef ROCKSDB_LITE + #include #include diff -Nru rocksdb-5.15.10/include/rocksdb/statistics.h rocksdb-5.17.2/include/rocksdb/statistics.h --- rocksdb-5.15.10/include/rocksdb/statistics.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/statistics.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ -#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ +#pragma once #include #include @@ -673,5 +672,3 @@ std::shared_ptr CreateDBStatistics(); } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/status.h rocksdb-5.17.2/include/rocksdb/status.h --- rocksdb-5.15.10/include/rocksdb/status.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/status.h 2018-11-12 19:57:32.000000000 +0000 @@ -14,8 +14,7 @@ // non-const method, all threads accessing the same Status must use // external synchronization. -#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_ -#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_ +#pragma once #include #include "rocksdb/slice.h" @@ -282,8 +281,6 @@ Severity sev_; const char* state_; - static const char* msgs[static_cast(kMaxSubCode)]; - explicit Status(Code _code, SubCode _subcode = kNone) : code_(_code), subcode_(_subcode), sev_(kNoError), state_(nullptr) {} @@ -350,5 +347,3 @@ } } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_STATUS_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/table.h rocksdb-5.17.2/include/rocksdb/table.h --- rocksdb-5.15.10/include/rocksdb/table.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/table.h 2018-11-12 19:57:32.000000000 +0000 @@ -16,6 +16,7 @@ // https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples #pragma once + #include #include #include @@ -100,6 +101,18 @@ IndexType index_type = kBinarySearch; + // The index type that will be used for the data block. + enum DataBlockIndexType : char { + kDataBlockBinarySearch = 0, // traditional block type + kDataBlockBinaryAndHash = 1, // additional hash index + }; + + DataBlockIndexType data_block_index_type = kDataBlockBinarySearch; + + // #entries/#buckets. It is valid only when data_block_hash_index_type is + // kDataBlockBinaryAndHash. + double data_block_hash_table_util_ratio = 0.75; + // This option is now deprecated. No matter what value it is set to, // it will behave as if hash_index_allow_collision=true. bool hash_index_allow_collision = true; @@ -226,6 +239,12 @@ // version 5.15, you should probably use this. // This option only affects newly written tables. When reading existing // tables, the information about version is read from the footer. + // 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we + // encode the values in index blocks. If you don't plan to run RocksDB before + // version 5.16 and you are using index_block_restart_interval > 1, you should + // probably use this as it would reduce the index size. + // This option only affects newly written tables. When reading existing + // tables, the information about version is read from the footer. uint32_t format_version = 2; // Store index blocks on disk in compressed format. 
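The table.h hunk above adds the data-block hash index options and format_version 4. A minimal configuration sketch, assuming the block-based table factory; the values shown simply restate the documented defaults and compatibility note, not tuning advice:

    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    rocksdb::Options MakeOptions() {
      rocksdb::BlockBasedTableOptions bbto;
      bbto.data_block_index_type =
          rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;
      bbto.data_block_hash_table_util_ratio = 0.75;
      bbto.format_version = 4;  // readable only by RocksDB 5.16 and newer
      rocksdb::Options options;
      options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
      return options;
    }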
Changing this option to diff -Nru rocksdb-5.15.10/include/rocksdb/table_properties.h rocksdb-5.17.2/include/rocksdb/table_properties.h --- rocksdb-5.15.10/include/rocksdb/table_properties.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/table_properties.h 2018-11-12 19:57:32.000000000 +0000 @@ -34,6 +34,7 @@ static const std::string kIndexPartitions; static const std::string kTopLevelIndexSize; static const std::string kIndexKeyIsUserKey; + static const std::string kIndexValueIsDeltaEncoded; static const std::string kFilterSize; static const std::string kRawKeySize; static const std::string kRawValueSize; @@ -139,6 +140,8 @@ // Whether the index key is user key. Otherwise it includes 8 byte of sequence // number added by internal key format. uint64_t index_key_is_user_key = 0; + // Whether delta encoding is used to encode the index values. + uint64_t index_value_is_delta_encoded = 0; // the size of filter block. uint64_t filter_size = 0; // total raw key size diff -Nru rocksdb-5.15.10/include/rocksdb/trace_reader_writer.h rocksdb-5.17.2/include/rocksdb/trace_reader_writer.h --- rocksdb-5.15.10/include/rocksdb/trace_reader_writer.h 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/trace_reader_writer.h 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,47 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/env.h" + +namespace rocksdb { + +// Allow custom implementations of TraceWriter and TraceReader. +// By default, RocksDB provides a way to capture the traces to a file using the +// factory NewFileTraceWriter(). But users could also choose to export traces to +// any other system by providing custom implementations of TraceWriter and +// TraceReader. + +// TraceWriter allows exporting RocksDB traces to any system, one operation at +// a time. +class TraceWriter { + public: + TraceWriter() {} + virtual ~TraceWriter() {} + + virtual Status Write(const Slice& data) = 0; + virtual Status Close() = 0; +}; + +// TraceReader allows reading RocksDB traces from any system, one operation at +// a time. A RocksDB Replayer could depend on this to replay opertions. +class TraceReader { + public: + TraceReader() {} + virtual ~TraceReader() {} + + virtual Status Read(std::string* data) = 0; + virtual Status Close() = 0; +}; + +// Factory methods to read/write traces from/to a file. +Status NewFileTraceWriter(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr* trace_writer); +Status NewFileTraceReader(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr* trace_reader); +} // namespace rocksdb diff -Nru rocksdb-5.15.10/include/rocksdb/transaction_log.h rocksdb-5.17.2/include/rocksdb/transaction_log.h --- rocksdb-5.15.10/include/rocksdb/transaction_log.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/transaction_log.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
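The new trace_reader_writer.h above defines the TraceWriter/TraceReader extension points. A sketch of a trivial in-memory TraceWriter that could be handed to DB::StartTrace() instead of the file-based one produced by NewFileTraceWriter(); the class is hypothetical:

    #include <string>
    #include <vector>
    #include "rocksdb/trace_reader_writer.h"

    class InMemoryTraceWriter : public rocksdb::TraceWriter {
     public:
      rocksdb::Status Write(const rocksdb::Slice& data) override {
        records_.emplace_back(data.ToString());  // keep each trace record
        return rocksdb::Status::OK();
      }
      rocksdb::Status Close() override { return rocksdb::Status::OK(); }

     private:
      std::vector<std::string> records_;
    };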
-#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ -#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ +#pragma once #include "rocksdb/status.h" #include "rocksdb/types.h" @@ -121,5 +120,3 @@ }; }; } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/types.h rocksdb-5.17.2/include/rocksdb/types.h --- rocksdb-5.15.10/include/rocksdb/types.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/types.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_ -#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_ +#pragma once #include #include "rocksdb/slice.h" @@ -23,6 +22,7 @@ kEntrySingleDelete, kEntryMerge, kEntryRangeDeletion, + kEntryBlobIndex, kEntryOther, }; @@ -52,5 +52,3 @@ bool ParseFullKey(const Slice& internal_key, FullKey* result); } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_TYPES_H_ diff -Nru rocksdb-5.15.10/include/rocksdb/universal_compaction.h rocksdb-5.17.2/include/rocksdb/universal_compaction.h --- rocksdb-5.15.10/include/rocksdb/universal_compaction.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/universal_compaction.h 2018-11-12 19:57:32.000000000 +0000 @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H -#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H +#pragma once #include #include @@ -86,5 +85,3 @@ }; } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H diff -Nru rocksdb-5.15.10/include/rocksdb/utilities/debug.h rocksdb-5.17.2/include/rocksdb/utilities/debug.h --- rocksdb-5.15.10/include/rocksdb/utilities/debug.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/utilities/debug.h 2018-11-12 19:57:32.000000000 +0000 @@ -31,9 +31,13 @@ }; // Returns listing of all versions of keys in the provided user key range. -// The range is inclusive-inclusive, i.e., [`begin_key`, `end_key`]. +// The range is inclusive-inclusive, i.e., [`begin_key`, `end_key`], or +// `max_num_ikeys` has been reached. Since all those keys returned will be +// copied to memory, if the range covers too many keys, the memory usage +// may be huge. `max_num_ikeys` can be used to cap the memory usage. // The result is inserted into the provided vector, `key_versions`. Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, + size_t max_num_ikeys, std::vector* key_versions); } // namespace rocksdb diff -Nru rocksdb-5.15.10/include/rocksdb/utilities/env_librados.h rocksdb-5.17.2/include/rocksdb/utilities/env_librados.h --- rocksdb-5.15.10/include/rocksdb/utilities/env_librados.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/utilities/env_librados.h 2018-11-12 19:57:32.000000000 +0000 @@ -2,8 +2,8 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
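The debug.h hunk above adds a max_num_ikeys cap to GetAllKeyVersions(). A minimal sketch, assuming a non-LITE build and an already-open rocksdb::DB* db; the key range and the cap of 1000 internal keys are illustrative:

    #include <vector>
    #include "rocksdb/utilities/debug.h"

    rocksdb::Status DumpVersions(rocksdb::DB* db) {
      std::vector<rocksdb::KeyVersion> key_versions;
      return rocksdb::GetAllKeyVersions(db, rocksdb::Slice("a"),
                                        rocksdb::Slice("z"),
                                        1000 /* max_num_ikeys */,
                                        &key_versions);
    }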
-#ifndef ROCKSDB_UTILITIES_ENV_LIBRADOS_H -#define ROCKSDB_UTILITIES_ENV_LIBRADOS_H + +#pragma once #include #include @@ -173,4 +173,3 @@ friend class LibradosWritableFile; }; } -#endif diff -Nru rocksdb-5.15.10/include/rocksdb/utilities/table_properties_collectors.h rocksdb-5.17.2/include/rocksdb/utilities/table_properties_collectors.h --- rocksdb-5.15.10/include/rocksdb/utilities/table_properties_collectors.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/utilities/table_properties_collectors.h 2018-11-12 19:57:32.000000000 +0000 @@ -5,12 +5,60 @@ #pragma once #ifndef ROCKSDB_LITE +#include #include #include "rocksdb/table_properties.h" namespace rocksdb { +// A factory of a table property collector that marks a SST +// file as need-compaction when it observe at least "D" deletion +// entries in any "N" consecutive entires. +class CompactOnDeletionCollectorFactory + : public TablePropertiesCollectorFactory { + public: + virtual ~CompactOnDeletionCollectorFactory() {} + + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override; + + // Change the value of sliding_window_size "N" + // Setting it to 0 disables the delete triggered compaction + void SetWindowSize(size_t sliding_window_size) { + sliding_window_size_.store(sliding_window_size); + } + + // Change the value of deletion_trigger "D" + void SetDeletionTrigger(size_t deletion_trigger) { + deletion_trigger_.store(deletion_trigger); + } + + virtual const char* Name() const override { + return "CompactOnDeletionCollector"; + } + + private: + friend std::shared_ptr + NewCompactOnDeletionCollectorFactory( + size_t sliding_window_size, + size_t deletion_trigger); + // A factory of a table property collector that marks a SST + // file as need-compaction when it observe at least "D" deletion + // entries in any "N" consecutive entires. + // + // @param sliding_window_size "N" + // @param deletion_trigger "D" + CompactOnDeletionCollectorFactory( + size_t sliding_window_size, + size_t deletion_trigger) : + sliding_window_size_(sliding_window_size), + deletion_trigger_(deletion_trigger) {} + + std::atomic sliding_window_size_; + std::atomic deletion_trigger_; +}; + // Creates a factory of a table property collector that marks a SST // file as need-compaction when it observe at least "D" deletion // entries in any "N" consecutive entires. @@ -20,7 +68,7 @@ // than the specified size. // @param deletion_trigger "D". Note that even when "N" is changed, // the specified number for "D" will not be changed. -extern std::shared_ptr +extern std::shared_ptr NewCompactOnDeletionCollectorFactory( size_t sliding_window_size, size_t deletion_trigger); diff -Nru rocksdb-5.15.10/include/rocksdb/utilities/transaction_db.h rocksdb-5.17.2/include/rocksdb/utilities/transaction_db.h --- rocksdb-5.15.10/include/rocksdb/utilities/transaction_db.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/utilities/transaction_db.h 2018-11-12 19:57:32.000000000 +0000 @@ -137,6 +137,15 @@ // The maximum number of bytes used for the write batch. 0 means no limit. size_t max_write_batch_size = 0; + + // Skip Concurrency Control. This could be as an optimization if the + // application knows that the transaction would not have any conflict with + // concurrent transactions. 
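The table_properties_collectors.h hunk above exposes CompactOnDeletionCollectorFactory so its thresholds can be changed after creation. A minimal sketch, assuming a non-LITE build; the window size and trigger values are illustrative:

    #include "rocksdb/options.h"
    #include "rocksdb/utilities/table_properties_collectors.h"

    rocksdb::ColumnFamilyOptions MakeCfOptions() {
      auto factory = rocksdb::NewCompactOnDeletionCollectorFactory(
          /*sliding_window_size=*/128, /*deletion_trigger=*/50);
      rocksdb::ColumnFamilyOptions cf_opts;
      cf_opts.table_properties_collector_factories.push_back(factory);
      // With the concrete return type the thresholds can be tuned later on:
      factory->SetWindowSize(256);
      factory->SetDeletionTrigger(100);
      return cf_opts;
    }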
It could also be used during recovery if (i) + // application guarantees no conflict between prepared transactions in the WAL + // (ii) application guarantees that recovered transactions will be rolled + // back/commit before new transactions start. + // Default: false + bool skip_concurrency_control = false; }; // The per-write optimizations that do not involve transactions. TransactionDB diff -Nru rocksdb-5.15.10/include/rocksdb/utilities/transaction.h rocksdb-5.17.2/include/rocksdb/utilities/transaction.h --- rocksdb-5.15.10/include/rocksdb/utilities/transaction.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/utilities/transaction.h 2018-11-12 19:57:32.000000000 +0000 @@ -152,6 +152,12 @@ // If there is no previous call to SetSavePoint(), returns Status::NotFound() virtual Status RollbackToSavePoint() = 0; + // Pop the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + virtual Status PopSavePoint() = 0; + // This function is similar to DB::Get() except it will also read pending // changes in this transaction. Currently, this function will return // Status::MergeInProgress if the most recent write to the queried key in diff -Nru rocksdb-5.15.10/include/rocksdb/utilities/write_batch_with_index.h rocksdb-5.17.2/include/rocksdb/utilities/write_batch_with_index.h --- rocksdb-5.15.10/include/rocksdb/utilities/write_batch_with_index.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/utilities/write_batch_with_index.h 2018-11-12 19:57:32.000000000 +0000 @@ -231,6 +231,7 @@ Status PopSavePoint() override; void SetMaxBytes(size_t max_bytes) override; + size_t GetDataSize() const; private: friend class PessimisticTransactionDB; diff -Nru rocksdb-5.15.10/include/rocksdb/version.h rocksdb-5.17.2/include/rocksdb/version.h --- rocksdb-5.15.10/include/rocksdb/version.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/version.h 2018-11-12 19:57:32.000000000 +0000 @@ -5,8 +5,8 @@ #pragma once #define ROCKSDB_MAJOR 5 -#define ROCKSDB_MINOR 15 -#define ROCKSDB_PATCH 10 +#define ROCKSDB_MINOR 17 +#define ROCKSDB_PATCH 2 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these diff -Nru rocksdb-5.15.10/include/rocksdb/wal_filter.h rocksdb-5.17.2/include/rocksdb/wal_filter.h --- rocksdb-5.15.10/include/rocksdb/wal_filter.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/wal_filter.h 2018-11-12 19:57:32.000000000 +0000 @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once + #include #include diff -Nru rocksdb-5.15.10/include/rocksdb/write_batch.h rocksdb-5.17.2/include/rocksdb/write_batch.h --- rocksdb-5.15.10/include/rocksdb/write_batch.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/include/rocksdb/write_batch.h 2018-11-12 19:57:32.000000000 +0000 @@ -22,8 +22,7 @@ // non-const method, all threads accessing the same WriteBatch must use // external synchronization. 
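The transaction.h hunk above adds Transaction::PopSavePoint(). A minimal sketch, assuming a transaction obtained from a TransactionDB in a non-LITE build:

    #include "rocksdb/utilities/transaction.h"

    rocksdb::Status UseSavePoint(rocksdb::Transaction* txn) {
      txn->SetSavePoint();
      rocksdb::Status s = txn->Put("key", "value");
      if (!s.ok()) {
        // Undo everything written since the savepoint and discard it.
        return txn->RollbackToSavePoint();
      }
      // Keep the writes and simply drop the most recent savepoint.
      return txn->PopSavePoint();
    }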
-#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ -#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ +#pragma once #include #include @@ -367,5 +366,3 @@ }; } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ diff -Nru rocksdb-5.15.10/java/CMakeLists.txt rocksdb-5.17.2/java/CMakeLists.txt --- rocksdb-5.15.10/java/CMakeLists.txt 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/CMakeLists.txt 2018-11-12 19:57:32.000000000 +0000 @@ -13,6 +13,7 @@ rocksjni/compaction_filter_factory_jnicallback.cc rocksjni/compaction_options_fifo.cc rocksjni/compaction_options_universal.cc + rocksjni/compact_range_options.cc rocksjni/comparator.cc rocksjni/comparatorjnicallback.cc rocksjni/compression_options.cc @@ -79,6 +80,7 @@ org.rocksdb.ColumnFamilyOptions org.rocksdb.CompactionOptionsFIFO org.rocksdb.CompactionOptionsUniversal + org.rocksdb.CompactRangeOptions org.rocksdb.Comparator org.rocksdb.ComparatorOptions org.rocksdb.CompressionOptions @@ -192,6 +194,7 @@ src/main/java/org/rocksdb/CompactionOptionsFIFO.java src/main/java/org/rocksdb/CompactionOptionsUniversal.java src/main/java/org/rocksdb/CompactionPriority.java + src/main/java/org/rocksdb/CompactRangeOptions.java src/main/java/org/rocksdb/CompactionStopStyle.java src/main/java/org/rocksdb/CompactionStyle.java src/main/java/org/rocksdb/Comparator.java diff -Nru rocksdb-5.15.10/java/crossbuild/build-linux-centos.sh rocksdb-5.17.2/java/crossbuild/build-linux-centos.sh --- rocksdb-5.15.10/java/crossbuild/build-linux-centos.sh 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/crossbuild/build-linux-centos.sh 2018-11-12 19:57:32.000000000 +0000 @@ -26,6 +26,6 @@ # build rocksdb cd /rocksdb scl enable devtoolset-2 'make jclean clean' -scl enable devtoolset-2 'PORTABLE=1 make rocksdbjavastatic' +scl enable devtoolset-2 'PORTABLE=1 make -j8 rocksdbjavastatic' cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build diff -Nru rocksdb-5.15.10/java/crossbuild/docker-build-linux-centos.sh rocksdb-5.17.2/java/crossbuild/docker-build-linux-centos.sh --- rocksdb-5.15.10/java/crossbuild/docker-build-linux-centos.sh 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/crossbuild/docker-build-linux-centos.sh 2018-11-12 19:57:32.000000000 +0000 @@ -9,10 +9,10 @@ # Use scl devtoolset if available (i.e. 
CentOS <7) if hash scl 2>/dev/null; then scl enable devtoolset-2 'make jclean clean' - scl enable devtoolset-2 'PORTABLE=1 make rocksdbjavastatic' + scl enable devtoolset-2 'PORTABLE=1 make -j8 rocksdbjavastatic' else make jclean clean - PORTABLE=1 make rocksdbjavastatic + PORTABLE=1 make -j8 rocksdbjavastatic fi cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-host/java/target diff -Nru rocksdb-5.15.10/java/Makefile rocksdb-5.17.2/java/Makefile --- rocksdb-5.15.10/java/Makefile 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/Makefile 2018-11-12 19:57:32.000000000 +0000 @@ -14,6 +14,7 @@ org.rocksdb.ColumnFamilyOptions\ org.rocksdb.CompactionOptionsFIFO\ org.rocksdb.CompactionOptionsUniversal\ + org.rocksdb.CompactRangeOptions\ org.rocksdb.Comparator\ org.rocksdb.ComparatorOptions\ org.rocksdb.CompressionOptions\ diff -Nru rocksdb-5.15.10/java/rocksjni/compact_range_options.cc rocksdb-5.17.2/java/rocksjni/compact_range_options.cc --- rocksdb-5.15.10/java/rocksjni/compact_range_options.cc 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/java/rocksjni/compact_range_options.cc 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::CompactRangeOptions. + +#include + +#include "include/org_rocksdb_CompactRangeOptions.h" +#include "rocksdb/options.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: newCompactRangeOptions + * Signature: ()J + */ +jlong Java_org_rocksdb_CompactRangeOptions_newCompactRangeOptions( + JNIEnv* /*env*/, jclass /*jclazz*/) { + auto* options = new rocksdb::CompactRangeOptions(); + return reinterpret_cast(options); +} + + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: exclusiveManualCompaction + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_CompactRangeOptions_exclusiveManualCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->exclusive_manual_compaction); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setExclusiveManualCompaction + * Signature: (JZ)V + */ +void Java_org_rocksdb_CompactRangeOptions_setExclusiveManualCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean exclusive_manual_compaction) { + auto* options = + reinterpret_cast(jhandle); + options->exclusive_manual_compaction = static_cast(exclusive_manual_compaction); +} + + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: bottommostLevelCompaction + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactRangeOptions_bottommostLevelCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return rocksdb::BottommostLevelCompactionJni::toJavaBottommostLevelCompaction( + options->bottommost_level_compaction); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setBottommostLevelCompaction + * Signature: (JI)V + */ +void Java_org_rocksdb_CompactRangeOptions_setBottommostLevelCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jint bottommost_level_compaction) { + auto* options = reinterpret_cast(jhandle); + options->bottommost_level_compaction = + 
rocksdb::BottommostLevelCompactionJni::toCppBottommostLevelCompaction(bottommost_level_compaction); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: changeLevel + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_CompactRangeOptions_changeLevel + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->change_level); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setChangeLevel + * Signature: (JZ)V + */ +void Java_org_rocksdb_CompactRangeOptions_setChangeLevel + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean change_level) { + auto* options = reinterpret_cast(jhandle); + options->change_level = static_cast(change_level); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: targetLevel + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactRangeOptions_targetLevel + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->target_level); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setTargetLevel + * Signature: (JI)V + */ +void Java_org_rocksdb_CompactRangeOptions_setTargetLevel + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint target_level) { + auto* options = reinterpret_cast(jhandle); + options->target_level = static_cast(target_level); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: targetPathId + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactRangeOptions_targetPathId + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->target_path_id); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setTargetPathId + * Signature: (JI)V + */ +void Java_org_rocksdb_CompactRangeOptions_setTargetPathId + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint target_path_id) { + auto* options = reinterpret_cast(jhandle); + options->target_path_id = static_cast(target_path_id); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: allowWriteStall + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_CompactRangeOptions_allowWriteStall + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->allow_write_stall); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setAllowWriteStall + * Signature: (JZ)V + */ +void Java_org_rocksdb_CompactRangeOptions_setAllowWriteStall + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean allow_write_stall) { + auto* options = reinterpret_cast(jhandle); + options->allow_write_stall = static_cast(allow_write_stall); +} + + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: maxSubcompactions + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactRangeOptions_maxSubcompactions + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->max_subcompactions); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setMaxSubcompactions + * Signature: (JI)V + */ +void Java_org_rocksdb_CompactRangeOptions_setMaxSubcompactions + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint max_subcompactions) { + auto* options = reinterpret_cast(jhandle); + options->max_subcompactions = static_cast(max_subcompactions); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: disposeInternal + * Signature: (J)V + */ +void 
Java_org_rocksdb_CompactRangeOptions_disposeInternal( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + delete options; +} diff -Nru rocksdb-5.15.10/java/rocksjni/portal.h rocksdb-5.17.2/java/rocksjni/portal.h --- rocksdb-5.15.10/java/rocksjni/portal.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/rocksjni/portal.h 2018-11-12 19:57:32.000000000 +0000 @@ -2896,6 +2896,43 @@ } }; +// The portal class for org.rocksdb.BottommostLevelCompaction +class BottommostLevelCompactionJni { + public: + // Returns the equivalent org.rocksdb.BottommostLevelCompaction for the provided + // C++ rocksdb::BottommostLevelCompaction enum + static jint toJavaBottommostLevelCompaction( + const rocksdb::BottommostLevelCompaction& bottommost_level_compaction) { + switch(bottommost_level_compaction) { + case rocksdb::BottommostLevelCompaction::kSkip: + return 0x0; + case rocksdb::BottommostLevelCompaction::kIfHaveCompactionFilter: + return 0x1; + case rocksdb::BottommostLevelCompaction::kForce: + return 0x2; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::BottommostLevelCompaction enum for the + // provided Java org.rocksdb.BottommostLevelCompaction + static rocksdb::BottommostLevelCompaction toCppBottommostLevelCompaction( + jint bottommost_level_compaction) { + switch(bottommost_level_compaction) { + case 0x0: + return rocksdb::BottommostLevelCompaction::kSkip; + case 0x1: + return rocksdb::BottommostLevelCompaction::kIfHaveCompactionFilter; + case 0x2: + return rocksdb::BottommostLevelCompaction::kForce; + default: + // undefined/default + return rocksdb::BottommostLevelCompaction::kIfHaveCompactionFilter; + } + } +}; + // The portal class for org.rocksdb.CompactionStopStyle class CompactionStopStyleJni { public: diff -Nru rocksdb-5.15.10/java/rocksjni/rocksjni.cc rocksdb-5.17.2/java/rocksjni/rocksjni.cc --- rocksdb-5.15.10/java/rocksjni/rocksjni.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/rocksjni/rocksjni.cc 2018-11-12 19:57:32.000000000 +0000 @@ -1844,6 +1844,32 @@ return 0; } +/* + * Class: org_rocksdb_RocksDB + * Method: getAggregatedLongProperty + * Signature: (JLjava/lang/String;I)J + */ +jlong Java_org_rocksdb_RocksDB_getAggregatedLongProperty( + JNIEnv* env, jobject, jlong db_handle, jstring jproperty, jint jproperty_len) { + const char* property = env->GetStringUTFChars(jproperty, nullptr); + if (property == nullptr) { + return 0; + } + rocksdb::Slice property_slice(property, jproperty_len); + auto* db = reinterpret_cast(db_handle); + uint64_t property_value = 0; + bool retCode = db->GetAggregatedIntProperty(property_slice, &property_value); + env->ReleaseStringUTFChars(jproperty, property); + + if (retCode) { + return property_value; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + return 0; +} + + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Flush @@ -1955,8 +1981,7 @@ rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jbegin, jint jbegin_len, jbyteArray jend, jint jend_len, - jboolean jreduce_level, jint jtarget_level, - jint jtarget_path_id) { + const rocksdb::CompactRangeOptions& compact_options) { jbyte* begin = env->GetByteArrayElements(jbegin, nullptr); if (begin == nullptr) { // exception thrown: OutOfMemoryError @@ -1974,10 +1999,6 @@ const rocksdb::Slice end_slice(reinterpret_cast(end), jend_len); rocksdb::Status s; - rocksdb::CompactRangeOptions compact_options; - 
compact_options.change_level = jreduce_level; - compact_options.target_level = jtarget_level; - compact_options.target_path_id = static_cast(jtarget_path_id); if (cf_handle != nullptr) { s = db->CompactRange(compact_options, cf_handle, &begin_slice, &end_slice); } else { @@ -1996,6 +2017,25 @@ return false; } +/** + * @return true if the compact range succeeded, false if a Java Exception + * was thrown + */ +bool rocksdb_compactrange_helper(JNIEnv* env, rocksdb::DB* db, + rocksdb::ColumnFamilyHandle* cf_handle, + jbyteArray jbegin, jint jbegin_len, + jbyteArray jend, jint jend_len, + jboolean jreduce_level, jint jtarget_level, + jint jtarget_path_id) { + rocksdb::CompactRangeOptions compact_options; + compact_options.change_level = jreduce_level; + compact_options.target_level = jtarget_level; + compact_options.target_path_id = static_cast(jtarget_path_id); + + return rocksdb_compactrange_helper(env, db, cf_handle, jbegin, jbegin_len, + jend, jend_len, compact_options); +} + /* * Class: org_rocksdb_RocksDB * Method: compactRange0 @@ -2027,6 +2067,20 @@ jtarget_path_id); } + +void Java_org_rocksdb_RocksDB_compactRange__J_3BI_3BIJJ( + JNIEnv* env, jobject /*jdb*/, jlong jdb_handle, jbyteArray jbegin, + jint jbegin_len, jbyteArray jend, jint jend_len, + jlong jcompact_options_handle, jlong jcf_handle) { + auto* db = reinterpret_cast(jdb_handle); + auto* cf_handle = reinterpret_cast(jcf_handle); + auto* compact_options = reinterpret_cast(jcompact_options_handle); + + rocksdb_compactrange_helper(env, db, cf_handle, jbegin, jbegin_len, jend, + jend_len, *compact_options); +} + + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::PauseBackgroundWork diff -Nru rocksdb-5.15.10/java/rocksjni/transaction.cc rocksdb-5.17.2/java/rocksjni/transaction.cc --- rocksdb-5.15.10/java/rocksjni/transaction.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/rocksjni/transaction.cc 2018-11-12 19:57:32.000000000 +0000 @@ -18,7 +18,8 @@ #if defined(_MSC_VER) #pragma warning(push) -#pragma warning(disable : 4503) // identifier' : decorated name length exceeded, name was truncated +#pragma warning(disable : 4503) // identifier' : decorated name length + // exceeded, name was truncated #endif /* @@ -271,8 +272,8 @@ void free_parts( JNIEnv* env, - std::vector> &parts_to_free) { - for (auto &value : parts_to_free) { + std::vector>& parts_to_free) { + for (auto& value : parts_to_free) { jobject jk; jbyteArray jk_ba; jbyte* jk_val; @@ -675,10 +676,10 @@ return; } - jparts_to_free.push_back(std::make_tuple( - jba_key_part, jkey_part, jobj_key_part)); - jparts_to_free.push_back(std::make_tuple( - jba_value_part, jvalue_part, jobj_value_part)); + jparts_to_free.push_back( + std::make_tuple(jba_key_part, jkey_part, jobj_key_part)); + jparts_to_free.push_back( + std::make_tuple(jba_value_part, jvalue_part, jobj_value_part)); key_parts.push_back( rocksdb::Slice(reinterpret_cast(jkey_part), jkey_part_len)); @@ -688,8 +689,8 @@ // call the write_multi function rocksdb::Status s = fn_write_kv_parts( - rocksdb::SliceParts(key_parts.data(), (int)key_parts.size()), - rocksdb::SliceParts(value_parts.data(), (int)value_parts.size())); + rocksdb::SliceParts(key_parts.data(), (int)key_parts.size()), + rocksdb::SliceParts(value_parts.data(), (int)value_parts.size())); // cleanup temporary memory free_parts(env, jparts_to_free); @@ -834,13 +835,11 @@ typedef std::function FnWriteKParts; - // TODO(AR) consider refactoring to share this between here and rocksjni.cc void 
txn_write_k_parts_helper(JNIEnv* env, const FnWriteKParts& fn_write_k_parts, const jobjectArray& jkey_parts, const jint& jkey_parts_len) { - std::vector key_parts; std::vector> jkey_parts_to_free; @@ -872,12 +871,13 @@ jkey_parts_to_free.push_back(std::tuple( jba_key_part, jkey_part, jobj_key_part)); - key_parts.push_back(rocksdb::Slice(reinterpret_cast(jkey_part), jkey_part_len)); + key_parts.push_back( + rocksdb::Slice(reinterpret_cast(jkey_part), jkey_part_len)); } // call the write_multi function - rocksdb::Status s = - fn_write_k_parts(rocksdb::SliceParts(key_parts.data(), (int)key_parts.size())); + rocksdb::Status s = fn_write_k_parts( + rocksdb::SliceParts(key_parts.data(), (int)key_parts.size())); // cleanup temporary memory free_parts(env, jkey_parts_to_free); diff -Nru rocksdb-5.15.10/java/src/main/java/org/rocksdb/CompactRangeOptions.java rocksdb-5.17.2/java/src/main/java/org/rocksdb/CompactRangeOptions.java --- rocksdb-5.15.10/java/src/main/java/org/rocksdb/CompactRangeOptions.java 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/java/src/main/java/org/rocksdb/CompactRangeOptions.java 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,233 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * CompactRangeOptions is used by CompactRange() call. In the documentation of the methods "the compaction" refers to + * any compaction that is using this CompactRangeOptions. + */ +public class CompactRangeOptions extends RocksObject { + + private final static byte VALUE_kSkip = 0; + private final static byte VALUE_kIfHaveCompactionFilter = 1; + private final static byte VALUE_kForce = 2; + + // For level based compaction, we can configure if we want to skip/force bottommost level compaction. + // The order of this neum MUST follow the C++ layer. See BottommostLevelCompaction in db/options.h + public enum BottommostLevelCompaction { + /** + * Skip bottommost level compaction + */ + kSkip((byte)VALUE_kSkip), + /** + * Only compact bottommost level if there is a compaction filter. This is the default option + */ + kIfHaveCompactionFilter(VALUE_kIfHaveCompactionFilter), + /** + * Always compact bottommost level + */ + kForce(VALUE_kForce); + + private final byte value; + + BottommostLevelCompaction(final byte value) { + this.value = value; + } + + /** + *

Returns the byte value of the enumerations value.
+ * + * @return byte representation + */ + public byte getValue() { + return value; + } + + /** + * Returns the BottommostLevelCompaction for the given C++ rocks enum value. + * @param bottommostLevelCompaction The value of the BottommostLevelCompaction + * @return BottommostLevelCompaction instance, or null if none matches + */ + public static BottommostLevelCompaction fromRocksId(final int bottommostLevelCompaction) { + switch (bottommostLevelCompaction) { + case VALUE_kSkip: return kSkip; + case VALUE_kIfHaveCompactionFilter: return kIfHaveCompactionFilter; + case VALUE_kForce: return kForce; + default: return null; + } + } + } + + /** + * Construct CompactRangeOptions. + */ + public CompactRangeOptions() { + super(newCompactRangeOptions()); + } + + /** + * Returns whether the compaction is exclusive or other compactions may run concurrently at the same time. + * + * @return true if exclusive, false if concurrent + */ + public boolean exclusiveManualCompaction() { + return exclusiveManualCompaction(nativeHandle_); + } + + /** + * Sets whether the compaction is exclusive or other compaction are allowed run concurrently at the same time. + * + * @param exclusiveCompaction true if compaction should be exclusive + * @return This CompactRangeOptions + */ + public CompactRangeOptions setExclusiveManualCompaction(final boolean exclusiveCompaction) { + setExclusiveManualCompaction(nativeHandle_, exclusiveCompaction); + return this; + } + + + /** + * Returns the policy for compacting the bottommost level + * @return The BottommostLevelCompaction policy + */ + public BottommostLevelCompaction bottommostLevelCompaction() { + return BottommostLevelCompaction.fromRocksId(bottommostLevelCompaction(nativeHandle_)); + } + + /** + * Sets the policy for compacting the bottommost level + * + * @param bottommostLevelCompaction The policy for compacting the bottommost level + * @return This CompactRangeOptions + */ + public CompactRangeOptions setBottommostLevelCompaction(final BottommostLevelCompaction bottommostLevelCompaction) { + setBottommostLevelCompaction(nativeHandle_, bottommostLevelCompaction.getValue()); + return this; + } + + /** + * Returns whether compacted files will be moved to the minimum level capable of holding the data or given level + * (specified non-negative target_level). + * @return true, if compacted files will be moved to the minimum level + */ + public boolean changeLevel() { + return changeLevel(nativeHandle_); + } + + /** + * Whether compacted files will be moved to the minimum level capable of holding the data or given level + * (specified non-negative target_level). + * + * @param changeLevel If true, compacted files will be moved to the minimum level + * @return This CompactRangeOptions + */ + public CompactRangeOptions setChangeLevel(final boolean changeLevel) { + setChangeLevel(nativeHandle_, changeLevel); + return this; + } + + /** + * If change_level is true and target_level have non-negative value, compacted files will be moved to target_level. + * @return The target level for the compacted files + */ + public int targetLevel() { + return targetLevel(nativeHandle_); + } + + + /** + * If change_level is true and target_level have non-negative value, compacted files will be moved to target_level. 
+ * + * @param targetLevel target level for the compacted files + * @return This CompactRangeOptions + */ + public CompactRangeOptions setTargetLevel(final int targetLevel) { + setTargetLevel(nativeHandle_, targetLevel); + return this; + } + + /** + * target_path_id for compaction output. Compaction outputs will be placed in options.db_paths[target_path_id]. + * + * @return target_path_id + */ + public int targetPathId() { + return targetPathId(nativeHandle_); + } + + /** + * Compaction outputs will be placed in options.db_paths[target_path_id]. Behavior is undefined if target_path_id is + * out of range. + * + * @param targetPathId target path id + * @return This CompactRangeOptions + */ + public CompactRangeOptions setTargetPathId(final int targetPathId) { + setTargetPathId(nativeHandle_, targetPathId); + return this; + } + + /** + * If true, compaction will execute immediately even if doing so would cause the DB to + * enter write stall mode. Otherwise, it'll sleep until load is low enough. + * @return true if compaction will execute immediately + */ + public boolean allowWriteStall() { + return allowWriteStall(nativeHandle_); + } + + + /** + * If true, compaction will execute immediately even if doing so would cause the DB to + * enter write stall mode. Otherwise, it'll sleep until load is low enough. + * + * @return This CompactRangeOptions + * @param allowWriteStall true if compaction should execute immediately + */ + public CompactRangeOptions setAllowWriteStall(final boolean allowWriteStall) { + setAllowWriteStall(nativeHandle_, allowWriteStall); + return this; + } + + /** + * If > 0, it will replace the option in the DBOptions for this compaction + * @return number of subcompactions + */ + public int maxSubcompactions() { + return maxSubcompactions(nativeHandle_); + } + + /** + * If > 0, it will replace the option in the DBOptions for this compaction + * + * @param maxSubcompactions number of subcompactions + * @return This CompactRangeOptions + */ + public CompactRangeOptions setMaxSubcompactions(final int maxSubcompactions) { + setMaxSubcompactions(nativeHandle_, maxSubcompactions); + return this; + } + + private native static long newCompactRangeOptions(); + private native boolean exclusiveManualCompaction(final long handle); + private native void setExclusiveManualCompaction(final long handle, final boolean exclusive_manual_compaction); + private native int bottommostLevelCompaction(final long handle); + private native void setBottommostLevelCompaction(final long handle, final int bottommostLevelCompaction); + private native boolean changeLevel(final long handle); + private native void setChangeLevel(final long handle, final boolean changeLevel); + private native int targetLevel(final long handle); + private native void setTargetLevel(final long handle, final int targetLevel); + private native int targetPathId(final long handle); + private native void setTargetPathId(final long handle, final int /* uint32_t */ targetPathId); + private native boolean allowWriteStall(final long handle); + private native void setAllowWriteStall(final long handle, final boolean allowWriteStall); + private native void setMaxSubcompactions(final long handle, final int /* uint32_t */ maxSubcompactions); + private native int maxSubcompactions(final long handle); + + @Override + protected final native void disposeInternal(final long handle); + +} diff -Nru rocksdb-5.15.10/java/src/main/java/org/rocksdb/DBOptionsInterface.java rocksdb-5.17.2/java/src/main/java/org/rocksdb/DBOptionsInterface.java --- 
rocksdb-5.15.10/java/src/main/java/org/rocksdb/DBOptionsInterface.java 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/src/main/java/org/rocksdb/DBOptionsInterface.java 2018-11-12 19:57:32.000000000 +0000 @@ -262,6 +262,8 @@ *

* If set to 0 (default), we will dynamically choose the WAL size limit to * be [sum of all write_buffer_size * max_write_buffer_number] * 2 + * This option takes effect only when there are more than one column family as + * otherwise the wal size is dictated by the write_buffer_size. * Default: 0
* * @param maxTotalWalSize max total wal size. diff -Nru rocksdb-5.15.10/java/src/main/java/org/rocksdb/RocksDB.java rocksdb-5.17.2/java/src/main/java/org/rocksdb/RocksDB.java --- rocksdb-5.15.10/java/src/main/java/org/rocksdb/RocksDB.java 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/src/main/java/org/rocksdb/RocksDB.java 2018-11-12 19:57:32.000000000 +0000 @@ -1518,6 +1518,31 @@ property, property.length()); } + /** + *

Return sum of the getLongProperty of all the column families + * + * Note: As the returned property is of type + * {@code uint64_t} on C++ side the returning value can be negative + * because Java supports in Java 7 only signed long values. + * + * Java 7: To mitigate the problem of the non + * existent unsigned long tpye, values should be encapsulated using + * {@link java.math.BigInteger} to reflect the correct value. The correct + * behavior is guaranteed if {@code 2^64} is added to negative values. + * + * Java 8: In Java 8 the value should be treated as + * unsigned long using provided methods of type {@link Long}.
+ * + * @param property to be fetched. + * + * @return numerical property value + * + * @throws RocksDBException if an error happens in the underlying native code. + */ + public long getAggregatedLongProperty(final String property) throws RocksDBException { + return getAggregatedLongProperty(nativeHandle_, property, property.length()); + } + /** *

Return a heap-allocated iterator over the contents of the * database. The result of newIterator() is initially invalid @@ -1823,6 +1848,8 @@ *

  • {@link #compactRange(byte[], byte[], boolean, int, int)}
  • * * + * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead + * * @param reduce_level reduce level after compaction * @param target_level target level to compact to * @param target_path_id the target path id of output path @@ -1830,6 +1857,7 @@ * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ + @Deprecated public void compactRange(final boolean reduce_level, final int target_level, final int target_path_id) throws RocksDBException { @@ -1855,6 +1883,8 @@ *
  • {@link #compactRange(byte[], byte[])}
  • * * + * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead + * * @param begin start of key range (included in range) * @param end end of key range (excluded from range) * @param reduce_level reduce level after compaction @@ -1864,6 +1894,7 @@ * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ + @Deprecated public void compactRange(final byte[] begin, final byte[] end, final boolean reduce_level, final int target_level, final int target_path_id) throws RocksDBException { @@ -1935,6 +1966,27 @@ false, -1, 0, columnFamilyHandle.nativeHandle_); } + + /** + *

Range compaction of column family. + * Note: After the database has been compacted, + * all data will have been pushed down to the last level containing + * any data.
    + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance. + * @param begin start of key range (included in range) + * @param end end of key range (excluded from range) + * @param compactRangeOptions options for the compaction + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void compactRange(final ColumnFamilyHandle columnFamilyHandle, + final byte[] begin, final byte[] end, CompactRangeOptions compactRangeOptions) throws RocksDBException { + compactRange(nativeHandle_, begin, begin.length, end, end.length, + compactRangeOptions.nativeHandle_, columnFamilyHandle.nativeHandle_); + } + /** *

Range compaction of column family. *
    Note: After the database has been compacted, @@ -1957,6 +2009,8 @@ * * * + * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead + * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance. * @param reduce_level reduce level after compaction @@ -1966,6 +2020,7 @@ * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ + @Deprecated public void compactRange(final ColumnFamilyHandle columnFamilyHandle, final boolean reduce_level, final int target_level, final int target_path_id) throws RocksDBException { @@ -1994,6 +2049,8 @@ * * * + * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead + * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance. * @param begin start of key range (included in range) @@ -2005,6 +2062,7 @@ * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ + @Deprecated public void compactRange(final ColumnFamilyHandle columnFamilyHandle, final byte[] begin, final byte[] end, final boolean reduce_level, final int target_level, final int target_path_id) @@ -2350,6 +2408,8 @@ int propertyLength) throws RocksDBException; protected native long getLongProperty(long nativeHandle, long cfHandle, String property, int propertyLength) throws RocksDBException; + protected native long getAggregatedLongProperty(long nativeHandle, String property, + int propertyLength) throws RocksDBException; protected native long iterator(long handle); protected native long iterator(long handle, long readOptHandle); protected native long iteratorCF(long handle, long cfHandle); @@ -2377,6 +2437,9 @@ private native void compactRange0(long handle, byte[] begin, int beginLen, byte[] end, int endLen, boolean reduce_level, int target_level, int target_path_id) throws RocksDBException; + private native void compactRange(long handle, byte[] begin, int beginLen, + byte[] end, int endLen, long compactRangeOptHandle, long cfHandle) + throws RocksDBException; private native void compactRange(long handle, boolean reduce_level, int target_level, int target_path_id, long cfHandle) throws RocksDBException; diff -Nru rocksdb-5.15.10/java/src/test/java/org/rocksdb/AbstractTransactionTest.java rocksdb-5.17.2/java/src/test/java/org/rocksdb/AbstractTransactionTest.java --- rocksdb-5.15.10/java/src/test/java/org/rocksdb/AbstractTransactionTest.java 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/src/test/java/org/rocksdb/AbstractTransactionTest.java 2018-11-12 19:57:32.000000000 +0000 @@ -686,13 +686,12 @@ @Test public void elapsedTime() throws RocksDBException, InterruptedException { final long preStartTxnTime = System.currentTimeMillis(); - try(final DBContainer dbContainer = startDb(); - final Transaction txn = dbContainer.beginTransaction()) { + try (final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { Thread.sleep(2); final long txnElapsedTime = txn.getElapsedTime(); - assertThat(txnElapsedTime).isLessThan(System.currentTimeMillis() - - preStartTxnTime); + assertThat(txnElapsedTime).isLessThan(System.currentTimeMillis() - preStartTxnTime); assertThat(txnElapsedTime).isGreaterThan(0); } } diff -Nru rocksdb-5.15.10/java/src/test/java/org/rocksdb/ColumnFamilyTest.java rocksdb-5.17.2/java/src/test/java/org/rocksdb/ColumnFamilyTest.java --- rocksdb-5.15.10/java/src/test/java/org/rocksdb/ColumnFamilyTest.java 2018-09-13 
17:25:20.000000000 +0000 +++ rocksdb-5.17.2/java/src/test/java/org/rocksdb/ColumnFamilyTest.java 2018-11-12 19:57:32.000000000 +0000 @@ -404,6 +404,10 @@ "rocksdb.stats")).isNotNull(); assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.sstables")).isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). + isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). + isGreaterThanOrEqualTo(0); } finally { for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { diff -Nru rocksdb-5.15.10/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java rocksdb-5.17.2/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java --- rocksdb-5.15.10/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,98 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Test; +import org.rocksdb.CompactRangeOptions.BottommostLevelCompaction; + +import static org.assertj.core.api.Assertions.assertThat; + +public class CompactRangeOptionsTest { + + static { + RocksDB.loadLibrary(); + } + + @Test + public void exclusiveManualCompaction() { + CompactRangeOptions opt = new CompactRangeOptions(); + boolean value = false; + opt.setExclusiveManualCompaction(value); + assertThat(opt.exclusiveManualCompaction()).isEqualTo(value); + value = true; + opt.setExclusiveManualCompaction(value); + assertThat(opt.exclusiveManualCompaction()).isEqualTo(value); + } + + @Test + public void bottommostLevelCompaction() { + CompactRangeOptions opt = new CompactRangeOptions(); + BottommostLevelCompaction value = BottommostLevelCompaction.kSkip; + opt.setBottommostLevelCompaction(value); + assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + value = BottommostLevelCompaction.kForce; + opt.setBottommostLevelCompaction(value); + assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + value = BottommostLevelCompaction.kIfHaveCompactionFilter; + opt.setBottommostLevelCompaction(value); + assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + } + + @Test + public void changeLevel() { + CompactRangeOptions opt = new CompactRangeOptions(); + boolean value = false; + opt.setChangeLevel(value); + assertThat(opt.changeLevel()).isEqualTo(value); + value = true; + opt.setChangeLevel(value); + assertThat(opt.changeLevel()).isEqualTo(value); + } + + @Test + public void targetLevel() { + CompactRangeOptions opt = new CompactRangeOptions(); + int value = 2; + opt.setTargetLevel(value); + assertThat(opt.targetLevel()).isEqualTo(value); + value = 3; + opt.setTargetLevel(value); + assertThat(opt.targetLevel()).isEqualTo(value); + } + + @Test + public void targetPathId() { + CompactRangeOptions opt = new CompactRangeOptions(); + int value = 2; + opt.setTargetPathId(value); + assertThat(opt.targetPathId()).isEqualTo(value); + value = 3; + opt.setTargetPathId(value); + assertThat(opt.targetPathId()).isEqualTo(value); + } + + @Test + public void allowWriteStall() { + CompactRangeOptions opt = new CompactRangeOptions(); + boolean value = false; + opt.setAllowWriteStall(value); + assertThat(opt.allowWriteStall()).isEqualTo(value); + 
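The ColumnFamilyTest change above exercises the new getAggregatedLongProperty() binding, which sums an integer property such as "rocksdb.estimate-num-keys" over every column family instead of a single one. For reference, a minimal C++ sketch of the underlying DB::GetAggregatedIntProperty() call; the database path and error handling here are only illustrative and are not part of this patch:

    #include <cstdint>
    #include <iostream>
    #include "rocksdb/db.h"

    // Sketch: open a DB and read an integer property aggregated across all CFs.
    int main() {
      rocksdb::DB* db = nullptr;
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/agg_prop_demo", &db);
      if (!s.ok()) {
        std::cerr << s.ToString() << std::endl;
        return 1;
      }
      uint64_t num_keys = 0;
      // Sums the per-column-family value of the property over all CFs.
      if (db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys)) {
        std::cout << "estimated keys across all CFs: " << num_keys << std::endl;
      }
      delete db;
      return 0;
    }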
value = true; + opt.setAllowWriteStall(value); + assertThat(opt.allowWriteStall()).isEqualTo(value); + } + + @Test + public void maxSubcompactions() { + CompactRangeOptions opt = new CompactRangeOptions(); + int value = 2; + opt.setMaxSubcompactions(value); + assertThat(opt.maxSubcompactions()).isEqualTo(value); + value = 3; + opt.setMaxSubcompactions(value); + assertThat(opt.maxSubcompactions()).isEqualTo(value); + } +} diff -Nru rocksdb-5.15.10/Makefile rocksdb-5.17.2/Makefile --- rocksdb-5.15.10/Makefile 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/Makefile 2018-11-12 19:57:32.000000000 +0000 @@ -382,6 +382,8 @@ BENCHTOOLOBJECTS = $(BENCH_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) +ANALYZETOOLOBJECTS = $(ANALYZER_LIB_SOURCES:.cc=.o) + EXPOBJECTS = $(EXP_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) TESTS = \ @@ -437,6 +439,7 @@ table_properties_collector_test \ arena_test \ block_test \ + data_block_hash_index_test \ cache_test \ corruption_test \ slice_transform_test \ @@ -529,6 +532,8 @@ write_prepared_transaction_test \ write_unprepared_transaction_test \ db_universal_compaction_test \ + trace_analyzer_test \ + repeatable_thread_test \ PARALLEL_TEST = \ backupable_db_test \ @@ -572,12 +577,13 @@ rocksdb_dump \ rocksdb_undump \ blob_dump \ + trace_analyzer \ TEST_LIBS = \ librocksdb_env_basic_test.a # TODO: add back forward_iterator_bench, after making it build in all environemnts. -BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench column_aware_encoding_exp persistent_cache_bench +BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench column_aware_encoding_exp persistent_cache_bench range_del_aggregator_bench # if user didn't config LIBNAME, set the default ifeq ($(LIBNAME),) @@ -665,7 +671,7 @@ endif # PLATFORM_SHARED_EXT .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \ - release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ + release tags tags0 valgrind_check whitebox_crash_test format static_lib shared_lib all \ dbg rocksdbjavastatic rocksdbjava install install-static install-shared uninstall \ analyze tools tools_lib @@ -997,8 +1003,10 @@ $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ unity.o + +TOOLLIBOBJECTS = $(TOOL_LIB_SOURCES:.cc=.o) # try compiling db_test with unity -unity_test: db/db_test.o db/db_test_util.o $(TESTHARNESS) unity.a +unity_test: db/db_test.o db/db_test_util.o $(TESTHARNESS) $(TOOLLIBOBJECTS) unity.a $(AM_LINK) ./unity_test @@ -1018,6 +1026,13 @@ cscope -b `$(FIND) . -name '*.cc'` `$(FIND) . -name '*.h'` `$(FIND) . -name '*.c'` ctags -e -R -o etags * +tags0: + ctags -R . + cscope -b `$(FIND) . -name '*.cc' -and ! -name '*_test.cc'` \ + `$(FIND) . -name '*.c' -and ! -name '*_test.c'` \ + `$(FIND) . -name '*.h' -and ! 
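The deprecated compactRange() overloads above point callers at the variant taking a CompactRangeOptions, which the new CompactRangeOptionsTest exercises setter by setter. A minimal sketch of the equivalent C++ call, with option values picked purely for illustration:

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    // Sketch: run a full-range manual compaction driven by CompactRangeOptions,
    // the struct that replaces the old reduce_level/target_level/target_path_id
    // arguments.
    rocksdb::Status CompactWholeDb(rocksdb::DB* db) {
      rocksdb::CompactRangeOptions opts;
      opts.exclusive_manual_compaction = true;   // block other manual compactions
      opts.change_level = true;                  // old "reduce_level"
      opts.target_level = 1;                     // old "target_level"
      opts.bottommost_level_compaction =
          rocksdb::BottommostLevelCompaction::kForce;
      // nullptr begin/end means the whole key range of the default CF.
      return db->CompactRange(opts, nullptr, nullptr);
    }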
-name '*_test.h'` + ctags -e -R -o etags * + format: build_tools/format-diff.sh @@ -1031,7 +1046,7 @@ $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIBOBJECTS) -$(TOOLS_LIBRARY): $(BENCH_LIB_SOURCES:.cc=.o) $(TOOL_LIB_SOURCES:.cc=.o) $(LIB_SOURCES:.cc=.o) $(TESTUTIL) +$(TOOLS_LIBRARY): $(BENCH_LIB_SOURCES:.cc=.o) $(TOOL_LIB_SOURCES:.cc=.o) $(LIB_SOURCES:.cc=.o) $(TESTUTIL) $(ANALYZER_LIB_SOURCES:.cc=.o) $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ @@ -1042,6 +1057,9 @@ db_bench: tools/db_bench.o $(BENCHTOOLOBJECTS) $(AM_LINK) +trace_analyzer: tools/trace_analyzer.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) + $(AM_LINK) + cache_bench: cache/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) @@ -1350,6 +1368,9 @@ block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +data_block_hash_index_test: table/data_block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + inlineskiplist_test: memtable/inlineskiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1446,6 +1467,9 @@ db_bench_tool_test: tools/db_bench_tool_test.o $(BENCHTOOLOBJECTS) $(TESTHARNESS) $(AM_LINK) +trace_analyzer_test: tools/trace_analyzer_test.o $(LIBOBJECTS) $(ANALYZETOOLOBJECTS) $(TESTHARNESS) + $(AM_LINK) + event_logger_test: util/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1527,9 +1551,15 @@ range_del_aggregator_test: db/range_del_aggregator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +range_del_aggregator_bench: db/range_del_aggregator_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(AM_LINK) + blob_db_test: utilities/blob_db/blob_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +repeatable_thread_test: util/repeatable_thread_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + #------------------------------------------------- # make install related stuff INSTALL_PATH ?= /usr/local @@ -1760,20 +1790,26 @@ cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class -rocksdbjavastaticreleasedocker: rocksdbjavastatic - DOCKER_LINUX_X64_CONTAINER=`docker ps -aqf name=rocksdb_linux_x64-be`; \ - if [ -z "$$DOCKER_LINUX_X64_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x64-be evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_x64-be +rocksdbjavastaticreleasedocker: rocksdbjavastatic rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 + cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md + cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib + cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + +rocksdbjavastaticdockerx86: + mkdir -p java/target DOCKER_LINUX_X86_CONTAINER=`docker ps -aqf name=rocksdb_linux_x86-be`; \ if [ -z "$$DOCKER_LINUX_X86_CONTAINER" ]; then \ docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x86-be evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ fi docker start -a rocksdb_linux_x86-be - cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md - cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib - cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + 
+rocksdbjavastaticdockerx86_64: + mkdir -p java/target + DOCKER_LINUX_X64_CONTAINER=`docker ps -aqf name=rocksdb_linux_x64-be`; \ + if [ -z "$$DOCKER_LINUX_X64_CONTAINER" ]; then \ + docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x64-be evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ + fi + docker start -a rocksdb_linux_x64-be rocksdbjavastaticdockerppc64le: mkdir -p java/target @@ -1898,7 +1934,7 @@ # Source files dependencies detection # --------------------------------------------------------------------------- -all_sources = $(LIB_SOURCES) $(MAIN_SOURCES) $(MOCK_LIB_SOURCES) $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(TEST_LIB_SOURCES) $(EXP_LIB_SOURCES) +all_sources = $(LIB_SOURCES) $(MAIN_SOURCES) $(MOCK_LIB_SOURCES) $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(TEST_LIB_SOURCES) $(EXP_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) DEPFILES = $(all_sources:.cc=.cc.d) # Add proper dependency support so changing a .h file forces a .cc file to diff -Nru rocksdb-5.15.10/memtable/hash_cuckoo_rep.cc rocksdb-5.17.2/memtable/hash_cuckoo_rep.cc --- rocksdb-5.15.10/memtable/hash_cuckoo_rep.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/memtable/hash_cuckoo_rep.cc 2018-11-12 19:57:32.000000000 +0000 @@ -408,6 +408,7 @@ const auto bucket_user_key = UserKey(stored_key); if (bucket_user_key.compare(user_key) == 0) { cuckoo_bucket_id = bucket_ids[hid]; + assert(cuckoo_bucket_id != -1); break; } } diff -Nru rocksdb-5.15.10/monitoring/histogram_windowing.cc rocksdb-5.17.2/monitoring/histogram_windowing.cc --- rocksdb-5.15.10/monitoring/histogram_windowing.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/monitoring/histogram_windowing.cc 2018-11-12 19:57:32.000000000 +0000 @@ -17,7 +17,7 @@ HistogramWindowingImpl::HistogramWindowingImpl() { env_ = Env::Default(); - window_stats_.reset(new HistogramStat[num_windows_]); + window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); Clear(); } @@ -29,7 +29,7 @@ micros_per_window_(micros_per_window), min_num_per_window_(min_num_per_window) { env_ = Env::Default(); - window_stats_.reset(new HistogramStat[num_windows_]); + window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); Clear(); } diff -Nru rocksdb-5.15.10/options/options.cc rocksdb-5.17.2/options/options.cc --- rocksdb-5.15.10/options/options.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/options/options.cc 2018-11-12 19:57:32.000000000 +0000 @@ -479,6 +479,9 @@ prefix_extractor.reset(NewNoopTransform()); BlockBasedTableOptions block_based_options; block_based_options.index_type = BlockBasedTableOptions::kHashSearch; + block_based_options.data_block_index_type = + BlockBasedTableOptions::kDataBlockBinaryAndHash; + block_based_options.data_block_hash_table_util_ratio = 0.75; block_based_options.filter_policy.reset(NewBloomFilterPolicy(10)); block_based_options.block_cache = NewLRUCache(static_cast(block_cache_size_mb * 1024 * 1024)); diff -Nru rocksdb-5.15.10/options/options_helper.cc rocksdb-5.17.2/options/options_helper.cc --- rocksdb-5.15.10/options/options_helper.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/options/options_helper.cc 2018-11-12 19:57:32.000000000 +0000 @@ -494,6 +494,11 @@ return ParseEnum( block_base_table_index_type_string_map, value, reinterpret_cast(opt_address)); + case OptionType::kBlockBasedTableDataBlockIndexType: + return ParseEnum( + block_base_table_data_block_index_type_string_map, value, + 
reinterpret_cast( + opt_address)); case OptionType::kEncodingType: return ParseEnum( encoding_type_string_map, value, @@ -673,6 +678,12 @@ *reinterpret_cast( opt_address), value); + case OptionType::kBlockBasedTableDataBlockIndexType: + return SerializeEnum( + block_base_table_data_block_index_type_string_map, + *reinterpret_cast( + opt_address), + value); case OptionType::kFlushBlockPolicyFactory: { const auto* ptr = reinterpret_cast*>( @@ -1552,6 +1563,13 @@ {"kTwoLevelIndexSearch", BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}}; +std::unordered_map + OptionsHelper::block_base_table_data_block_index_type_string_map = { + {"kDataBlockBinarySearch", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch}, + {"kDataBlockBinaryAndHash", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}}; + std::unordered_map OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain}, {"kPrefix", kPrefix}}; diff -Nru rocksdb-5.15.10/options/options_helper.h rocksdb-5.17.2/options/options_helper.h --- rocksdb-5.15.10/options/options_helper.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/options/options_helper.h 2018-11-12 19:57:32.000000000 +0000 @@ -69,6 +69,7 @@ kMergeOperator, kMemTableRepFactory, kBlockBasedTableIndexType, + kBlockBasedTableDataBlockIndexType, kFilterPolicy, kFlushBlockPolicyFactory, kChecksumType, @@ -163,6 +164,9 @@ lru_cache_options_type_info; static std::unordered_map block_base_table_index_type_string_map; + static std::unordered_map + block_base_table_data_block_index_type_string_map; static std::unordered_map encoding_type_string_map; static std::unordered_map compaction_style_string_map; @@ -203,6 +207,8 @@ OptionsHelper::compression_type_string_map; static auto& block_base_table_index_type_string_map = OptionsHelper::block_base_table_index_type_string_map; +static auto& block_base_table_data_block_index_type_string_map = + OptionsHelper::block_base_table_data_block_index_type_string_map; static auto& encoding_type_string_map = OptionsHelper::encoding_type_string_map; static auto& compaction_style_string_map = OptionsHelper::compaction_style_string_map; diff -Nru rocksdb-5.15.10/options/options_parser.cc rocksdb-5.17.2/options/options_parser.cc --- rocksdb-5.15.10/options/options_parser.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/options/options_parser.cc 2018-11-12 19:57:32.000000000 +0000 @@ -49,7 +49,7 @@ return s; } unique_ptr writable; - writable.reset(new WritableFileWriter(std::move(wf), EnvOptions(), + writable.reset(new WritableFileWriter(std::move(wf), file_name, EnvOptions(), nullptr /* statistics */)); std::string options_file_content; @@ -200,45 +200,6 @@ return Status::OK(); } -namespace { -bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file, - std::string* output, bool* has_data, Status* result) { - const int kBufferSize = 8192; - char buffer[kBufferSize + 1]; - Slice input_slice; - - std::string line; - bool has_complete_line = false; - while (!has_complete_line) { - if (std::getline(*iss, line)) { - has_complete_line = !iss->eof(); - } else { - has_complete_line = false; - } - if (!has_complete_line) { - // if we're not sure whether we have a complete line, - // further read from the file. 
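The options_helper hunks above teach the options parser about kDataBlockBinarySearch and kDataBlockBinaryAndHash, and the earlier options.cc hunk has OptimizeForPointLookup() turn the hash index on with a 0.75 utilization ratio. A minimal sketch of enabling the data-block hash index explicitly on a block-based table factory; the two option values are the ones this patch uses, the surrounding setup is illustrative:

    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    // Sketch: opt a column family into the point-lookup data block hash index.
    rocksdb::Options MakePointLookupOptions() {
      rocksdb::BlockBasedTableOptions table_options;
      // Keep binary search as the fallback; add an in-block hash index for gets.
      table_options.data_block_index_type =
          rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;
      // Hash table utilization ratio; must be > 0 per the new ValidateOptions().
      table_options.data_block_hash_table_util_ratio = 0.75;

      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }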
- if (*has_data) { - *result = seq_file->Read(kBufferSize, &input_slice, buffer); - } - if (input_slice.size() == 0) { - // meaning we have read all the data - *has_data = false; - break; - } else { - iss->str(line + input_slice.ToString()); - // reset the internal state of iss so that we can keep reading it. - iss->clear(); - *has_data = (input_slice.size() == kBufferSize); - continue; - } - } - } - *output = line; - return *has_data || has_complete_line; -} -} // namespace - Status RocksDBOptionsParser::Parse(const std::string& file_name, Env* env, bool ignore_unknown_options) { Reset(); @@ -592,6 +553,12 @@ *reinterpret_cast( offset1) == *reinterpret_cast(offset2)); + case OptionType::kBlockBasedTableDataBlockIndexType: + return ( + *reinterpret_cast( + offset1) == + *reinterpret_cast( + offset2)); case OptionType::kWALRecoveryMode: return (*reinterpret_cast(offset1) == *reinterpret_cast(offset2)); diff -Nru rocksdb-5.15.10/options/options_settable_test.cc rocksdb-5.17.2/options/options_settable_test.cc --- rocksdb-5.15.10/options/options_settable_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/options/options_settable_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -142,6 +142,8 @@ "pin_l0_filter_and_index_blocks_in_cache=1;" "pin_top_level_index_and_filter=1;" "index_type=kHashSearch;" + "data_block_index_type=kDataBlockBinaryAndHash;" + "data_block_hash_table_util_ratio=0.75;" "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" "block_cache=1M;block_cache_compressed=1k;block_size=1024;" "block_size_deviation=8;block_restart_interval=4; " diff -Nru rocksdb-5.15.10/port/dirent.h rocksdb-5.17.2/port/dirent.h --- rocksdb-5.15.10/port/dirent.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/dirent.h 2018-11-12 19:57:32.000000000 +0000 @@ -9,8 +9,7 @@ // // See port_example.h for documentation for the following types/functions. -#ifndef STORAGE_LEVELDB_PORT_DIRENT_H_ -#define STORAGE_LEVELDB_PORT_DIRENT_H_ +#pragma once #ifdef ROCKSDB_PLATFORM_POSIX #include @@ -43,5 +42,3 @@ } // namespace rocksdb #endif // OS_WIN - -#endif // STORAGE_LEVELDB_PORT_DIRENT_H_ diff -Nru rocksdb-5.15.10/port/likely.h rocksdb-5.17.2/port/likely.h --- rocksdb-5.15.10/port/likely.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/likely.h 2018-11-12 19:57:32.000000000 +0000 @@ -7,8 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef PORT_LIKELY_H_ -#define PORT_LIKELY_H_ +#pragma once #if defined(__GNUC__) && __GNUC__ >= 4 #define LIKELY(x) (__builtin_expect((x), 1)) @@ -17,5 +16,3 @@ #define LIKELY(x) (x) #define UNLIKELY(x) (x) #endif - -#endif // PORT_LIKELY_H_ diff -Nru rocksdb-5.15.10/port/port_example.h rocksdb-5.17.2/port/port_example.h --- rocksdb-5.15.10/port/port_example.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/port_example.h 2018-11-12 19:57:32.000000000 +0000 @@ -12,8 +12,7 @@ // specific port_.h file. Use this file as a reference for // how to port this package to a new platform. 
-#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ -#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ +#pragma once namespace rocksdb { namespace port { @@ -100,5 +99,3 @@ } // namespace port } // namespace rocksdb - -#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ diff -Nru rocksdb-5.15.10/port/sys_time.h rocksdb-5.17.2/port/sys_time.h --- rocksdb-5.15.10/port/sys_time.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/sys_time.h 2018-11-12 19:57:32.000000000 +0000 @@ -10,8 +10,7 @@ // This file is a portable substitute for sys/time.h which does not exist on // Windows -#ifndef STORAGE_LEVELDB_PORT_SYS_TIME_H_ -#define STORAGE_LEVELDB_PORT_SYS_TIME_H_ +#pragma once #if defined(OS_WIN) && defined(_MSC_VER) @@ -44,5 +43,3 @@ #include #include #endif - -#endif // STORAGE_LEVELDB_PORT_SYS_TIME_H_ diff -Nru rocksdb-5.15.10/port/util_logger.h rocksdb-5.17.2/port/util_logger.h --- rocksdb-5.15.10/port/util_logger.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/util_logger.h 2018-11-12 19:57:32.000000000 +0000 @@ -7,8 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_LEVELDB_PORT_UTIL_LOGGER_H_ -#define STORAGE_LEVELDB_PORT_UTIL_LOGGER_H_ +#pragma once // Include the appropriate platform specific file below. If you are // porting to a new platform, see "port_example.h" for documentation @@ -19,5 +18,3 @@ #elif defined(OS_WIN) #include "port/win/win_logger.h" #endif - -#endif // STORAGE_LEVELDB_PORT_UTIL_LOGGER_H_ diff -Nru rocksdb-5.15.10/port/win/env_win.cc rocksdb-5.17.2/port/win/env_win.cc --- rocksdb-5.15.10/port/win/env_win.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/win/env_win.cc 2018-11-12 19:57:32.000000000 +0000 @@ -235,7 +235,7 @@ MapViewOfFileEx(hMap, FILE_MAP_READ, 0, // High DWORD of access start 0, // Low DWORD - fileSize, + static_cast(fileSize), NULL); // Let the OS choose the mapping if (!mapped_region) { @@ -246,7 +246,7 @@ } result->reset(new WinMmapReadableFile(fname, hFile, hMap, mapped_region, - fileSize)); + static_cast(fileSize))); mapGuard.release(); fileGuard.release(); @@ -448,7 +448,7 @@ void* base = MapViewOfFileEx(hMap, FILE_MAP_WRITE, 0, // High DWORD of access start 0, // Low DWORD - fileSize, + static_cast(fileSize), NULL); // Let the OS choose the mapping if (!base) { @@ -706,6 +706,9 @@ if (!CreateHardLinkA(target.c_str(), src.c_str(), NULL)) { DWORD lastError = GetLastError(); + if (lastError == ERROR_NOT_SAME_DEVICE) { + return Status::NotSupported("No cross FS links allowed"); + } std::string text("Failed to link: "); text.append(src).append(" to: ").append(target); @@ -716,6 +719,31 @@ return result; } +Status WinEnvIO::NumFileLinks(const std::string& fname, uint64_t* count) { + Status s; + HANDLE handle = ::CreateFileA( + fname.c_str(), 0, FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); + + if (INVALID_HANDLE_VALUE == handle) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("NumFileLinks: " + fname, lastError); + return s; + } + UniqueCloseHandlePtr handle_guard(handle, CloseHandleFunc); + FILE_STANDARD_INFO standard_info; + if (0 != GetFileInformationByHandleEx(handle, FileStandardInfo, + &standard_info, + sizeof(standard_info))) { + *count = standard_info.NumberOfLinks; + } else { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("GetFileInformationByHandleEx: " + fname, + lastError); + } + return s; 
+} + Status WinEnvIO::AreFilesSame(const std::string& first, const std::string& second, bool* res) { // For MinGW builds @@ -1325,6 +1353,10 @@ return winenv_io_.LinkFile(src, target); } +Status WinEnv::NumFileLinks(const std::string& fname, uint64_t* count) { + return winenv_io_.NumFileLinks(fname, count); +} + Status WinEnv::AreFilesSame(const std::string& first, const std::string& second, bool* res) { return winenv_io_.AreFilesSame(first, second, res); diff -Nru rocksdb-5.15.10/port/win/env_win.h rocksdb-5.17.2/port/win/env_win.h --- rocksdb-5.15.10/port/win/env_win.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/win/env_win.h 2018-11-12 19:57:32.000000000 +0000 @@ -144,6 +144,9 @@ virtual Status LinkFile(const std::string& src, const std::string& target); + virtual Status NumFileLinks(const std::string& /*fname*/, + uint64_t* /*count*/); + virtual Status AreFilesSame(const std::string& first, const std::string& second, bool* res); @@ -268,6 +271,8 @@ Status LinkFile(const std::string& src, const std::string& target) override; + Status NumFileLinks(const std::string& fname, uint64_t* count) override; + Status AreFilesSame(const std::string& first, const std::string& second, bool* res) override; diff -Nru rocksdb-5.15.10/port/win/io_win.cc rocksdb-5.17.2/port/win/io_win.cc --- rocksdb-5.15.10/port/win/io_win.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/win/io_win.cc 2018-11-12 19:57:32.000000000 +0000 @@ -260,7 +260,7 @@ *result = Slice(); return IOError(filename_, EINVAL); } else if (offset + n > length_) { - n = length_ - offset; + n = length_ - static_cast(offset); } *result = Slice(reinterpret_cast(mapped_region_)+offset, n); @@ -317,7 +317,7 @@ assert(mapped_begin_ == nullptr); - size_t minDiskSize = file_offset_ + view_size_; + size_t minDiskSize = static_cast(file_offset_) + view_size_; if (minDiskSize > reserved_size_) { status = Allocate(file_offset_, view_size_); @@ -579,7 +579,7 @@ // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(offset + len, view_size_); + size_t spaceToReserve = Roundup(static_cast(offset + len), view_size_); // Nothing to do if (spaceToReserve <= reserved_size_) { return status; @@ -656,14 +656,14 @@ return Status::NotSupported("This function is only used for direct_io"); } - if (!IsSectorAligned(offset) || + if (!IsSectorAligned(static_cast(offset)) || !IsSectorAligned(n)) { return Status::InvalidArgument( "WinSequentialFile::PositionedRead: offset is not properly aligned"); } size_t bytes_read = 0; // out param - s = PositionedReadInternal(scratch, n, offset, bytes_read); + s = PositionedReadInternal(scratch, static_cast(n), offset, bytes_read); *result = Slice(scratch, bytes_read); return s; } @@ -721,7 +721,7 @@ // Check buffer alignment if (file_base_->use_direct_io()) { - if (!IsSectorAligned(offset) || + if (!IsSectorAligned(static_cast(offset)) || !IsAligned(alignment_, scratch)) { return Status::InvalidArgument( "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned"); @@ -818,7 +818,7 @@ // to the end of the file assert(IsSectorAligned(next_write_offset_)); if (!IsSectorAligned(data.size()) || - !IsAligned(GetAlignement(), data.data())) { + !IsAligned(static_cast(GetAlignement()), data.data())) { s = Status::InvalidArgument( "WriteData must be page aligned, size must be sector aligned"); } else { @@ -857,9 +857,9 @@ Status 
WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) { if(file_data_->use_direct_io()) { - if (!IsSectorAligned(offset) || + if (!IsSectorAligned(static_cast(offset)) || !IsSectorAligned(data.size()) || - !IsAligned(GetAlignement(), data.data())) { + !IsAligned(static_cast(GetAlignement()), data.data())) { return Status::InvalidArgument( "Data and offset must be page aligned, size must be sector aligned"); } @@ -944,7 +944,7 @@ // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(offset + len, alignment_); + size_t spaceToReserve = Roundup(static_cast(offset + len), static_cast(alignment_)); // Nothing to do if (spaceToReserve <= reservedsize_) { return status; @@ -977,7 +977,7 @@ bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); } size_t WinWritableFile::GetRequiredBufferAlignment() const { - return GetAlignement(); + return static_cast(GetAlignement()); } Status WinWritableFile::Append(const Slice& data) { @@ -1037,7 +1037,7 @@ bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); } size_t WinRandomRWFile::GetRequiredBufferAlignment() const { - return GetAlignement(); + return static_cast(GetAlignement()); } Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) { diff -Nru rocksdb-5.15.10/port/win/port_win.h rocksdb-5.17.2/port/win/port_win.h --- rocksdb-5.15.10/port/win/port_win.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/win/port_win.h 2018-11-12 19:57:32.000000000 +0000 @@ -9,8 +9,7 @@ // // See port_example.h for documentation for the following types/functions. -#ifndef STORAGE_LEVELDB_PORT_PORT_WIN_H_ -#define STORAGE_LEVELDB_PORT_PORT_WIN_H_ +#pragma once // Always want minimum headers #ifndef WIN32_LEAN_AND_MEAN @@ -341,5 +340,3 @@ using port::truncate; } // namespace rocksdb - -#endif // STORAGE_LEVELDB_PORT_PORT_WIN_H_ diff -Nru rocksdb-5.15.10/port/win/win_jemalloc.cc rocksdb-5.17.2/port/win/win_jemalloc.cc --- rocksdb-5.15.10/port/win/win_jemalloc.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/port/win/win_jemalloc.cc 2018-11-12 19:57:32.000000000 +0000 @@ -43,8 +43,8 @@ return je_aligned_alloc(alignment, size); } void jemalloc_aligned_free(void* p) ROCKSDB_NOEXCEPT { je_free(p); } -} // port -} // rocksdb +} // namespace port +} // namespace rocksdb void* operator new(size_t size) { void* p = je_malloc(size); diff -Nru rocksdb-5.15.10/src.mk rocksdb-5.17.2/src.mk --- rocksdb-5.15.10/src.mk 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/src.mk 2018-11-12 19:57:32.000000000 +0000 @@ -103,6 +103,8 @@ table/cuckoo_table_builder.cc \ table/cuckoo_table_factory.cc \ table/cuckoo_table_reader.cc \ + table/data_block_hash_index.cc \ + table/data_block_footer.cc \ table/flush_block_policy.cc \ table/format.cc \ table/full_filter_block.cc \ @@ -147,18 +149,19 @@ util/slice.cc \ util/sst_file_manager_impl.cc \ util/status.cc \ - util/status_message.cc \ util/string_util.cc \ util/sync_point.cc \ util/sync_point_impl.cc \ util/thread_local.cc \ util/threadpool_imp.cc \ + util/trace_replay.cc \ util/transaction_test_util.cc \ util/xxhash.cc \ utilities/backupable/backupable_db.cc \ utilities/blob_db/blob_compaction_filter.cc \ utilities/blob_db/blob_db.cc \ utilities/blob_db/blob_db_impl.cc \ + utilities/blob_db/blob_db_impl_filesnapshot.cc \ utilities/blob_db/blob_file.cc \ utilities/blob_db/blob_log_format.cc 
\ utilities/blob_db/blob_log_reader.cc \ @@ -197,6 +200,7 @@ utilities/simulator_cache/sim_cache.cc \ utilities/spatialdb/spatial_db.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ + utilities/trace/file_trace_reader_writer.cc \ utilities/transactions/optimistic_transaction.cc \ utilities/transactions/optimistic_transaction_db_impl.cc \ utilities/transactions/pessimistic_transaction.cc \ @@ -230,6 +234,9 @@ tools/sst_dump_tool.cc \ utilities/blob_db/blob_dump_tool.cc \ +ANALYZER_LIB_SOURCES = \ + tools/trace_analyzer_tool.cc \ + MOCK_LIB_SOURCES = \ table/mock_table.cc \ util/fault_injection_test_env.cc @@ -321,6 +328,7 @@ db/redis_test.cc \ db/repair_test.cc \ db/range_del_aggregator_test.cc \ + db/range_del_aggregator_bench.cc \ db/table_properties_collector_test.cc \ db/util_merge_operators_test.cc \ db/version_builder_test.cc \ @@ -346,6 +354,7 @@ table/cleanable_test.cc \ table/cuckoo_table_builder_test.cc \ table/cuckoo_table_reader_test.cc \ + table/data_block_hash_index_test.cc \ table/full_filter_block_test.cc \ table/merger_test.cc \ table/table_reader_bench.cc \ @@ -357,6 +366,7 @@ tools/ldb_cmd_test.cc \ tools/reduce_levels_test.cc \ tools/sst_dump_test.cc \ + tools/trace_analyzer_test.cc \ util/arena_test.cc \ util/auto_roll_logger_test.cc \ util/autovector_test.cc \ @@ -368,6 +378,7 @@ util/filelock_test.cc \ util/log_write_bench.cc \ util/rate_limiter_test.cc \ + util/repeatable_thread_test.cc \ util/slice_transform_test.cc \ util/timer_queue_test.cc \ util/thread_list_test.cc \ @@ -411,6 +422,7 @@ java/rocksjni/compaction_filter.cc \ java/rocksjni/compaction_filter_factory.cc \ java/rocksjni/compaction_filter_factory_jnicallback.cc \ + java/rocksjni/compact_range_options.cc \ java/rocksjni/compaction_options_fifo.cc \ java/rocksjni/compaction_options_universal.cc \ java/rocksjni/comparator.cc \ diff -Nru rocksdb-5.15.10/table/block_based_table_builder.cc rocksdb-5.17.2/table/block_based_table_builder.cc --- rocksdb-5.15.10/table/block_based_table_builder.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_based_table_builder.cc 2018-11-12 19:57:32.000000000 +0000 @@ -39,11 +39,11 @@ #include "table/full_filter_block.h" #include "table/table_builder.h" -#include "util/string_util.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" #include "util/stop_watch.h" +#include "util/string_util.h" #include "util/xxhash.h" #include "table/index_builder.h" @@ -63,6 +63,7 @@ FilterBlockBuilder* CreateFilterBlockBuilder( const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt, const BlockBasedTableOptions& table_opt, + const bool use_delta_encoding_for_index_values, PartitionedIndexBuilder* const p_index_builder) { if (table_opt.filter_policy == nullptr) return nullptr; @@ -85,7 +86,7 @@ return new PartitionedFilterBlockBuilder( mopt.prefix_extractor.get(), table_opt.whole_key_filtering, filter_bits_builder, table_opt.index_block_restart_interval, - p_index_builder, partition_size); + use_delta_encoding_for_index_values, p_index_builder, partition_size); } else { return new FullFilterBlockBuilder(mopt.prefix_extractor.get(), table_opt.whole_key_filtering, @@ -266,6 +267,7 @@ TableProperties props; bool closed = false; // Either Finish() or Abandon() has been called. + const bool use_delta_encoding_for_index_values; std::unique_ptr filter_builder; char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size; @@ -301,11 +303,19 @@ ? 
std::min(table_options.block_size, kDefaultPageSize) : 0), data_block(table_options.block_restart_interval, - table_options.use_delta_encoding), + table_options.use_delta_encoding, + false /* use_value_delta_encoding */, + icomparator.user_comparator() + ->CanKeysWithDifferentByteContentsBeEqual() + ? BlockBasedTableOptions::kDataBlockBinarySearch + : table_options.data_block_index_type, + table_options.data_block_hash_table_util_ratio), range_del_block(1 /* block_restart_interval */), internal_prefix_transform(_moptions.prefix_extractor.get()), compression_dict(_compression_dict), compression_ctx(_compression_type, _compression_opts), + use_delta_encoding_for_index_values(table_opt.format_version >= 4 && + !table_opt.block_align), compressed_cache_key_prefix_size(0), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( @@ -317,18 +327,21 @@ if (table_options.index_type == BlockBasedTableOptions::kTwoLevelIndexSearch) { p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( - &internal_comparator, table_options); + &internal_comparator, use_delta_encoding_for_index_values, + table_options); index_builder.reset(p_index_builder_); } else { index_builder.reset(IndexBuilder::CreateIndexBuilder( table_options.index_type, &internal_comparator, - &this->internal_prefix_transform, table_options)); + &this->internal_prefix_transform, use_delta_encoding_for_index_values, + table_options)); } if (skip_filters) { filter_builder = nullptr; } else { filter_builder.reset(CreateFilterBlockBuilder( - _ioptions, _moptions, table_options, p_index_builder_)); + _ioptions, _moptions, table_options, + use_delta_encoding_for_index_values, p_index_builder_)); } for (auto& collector_factories : *int_tbl_prop_collector_factories) { @@ -675,7 +688,8 @@ if (ok() && !empty_filter_block) { Status s = Status::Incomplete(); while (ok() && s.IsIncomplete()) { - Slice filter_content = rep_->filter_builder->Finish(filter_block_handle, &s); + Slice filter_content = + rep_->filter_builder->Finish(filter_block_handle, &s); assert(s.ok() || s.IsIncomplete()); rep_->props.filter_size += filter_content.size(); WriteRawBlock(filter_content, kNoCompression, &filter_block_handle); @@ -752,22 +766,25 @@ PropertyBlockBuilder property_block_builder; rep_->props.column_family_id = rep_->column_family_id; rep_->props.column_family_name = rep_->column_family_name; - rep_->props.filter_policy_name = rep_->table_options.filter_policy != nullptr - ? rep_->table_options.filter_policy->Name() - : ""; + rep_->props.filter_policy_name = + rep_->table_options.filter_policy != nullptr + ? rep_->table_options.filter_policy->Name() + : ""; rep_->props.index_size = rep_->index_builder->IndexSize() + kBlockTrailerSize; rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr - ? rep_->ioptions.user_comparator->Name() - : "nullptr"; - rep_->props.merge_operator_name = rep_->ioptions.merge_operator != nullptr - ? rep_->ioptions.merge_operator->Name() - : "nullptr"; + ? rep_->ioptions.user_comparator->Name() + : "nullptr"; + rep_->props.merge_operator_name = + rep_->ioptions.merge_operator != nullptr + ? rep_->ioptions.merge_operator->Name() + : "nullptr"; rep_->props.compression_name = CompressionTypeToString(rep_->compression_ctx.type()); - rep_->props.prefix_extractor_name = rep_->moptions.prefix_extractor != nullptr - ? rep_->moptions.prefix_extractor->Name() - : "nullptr"; + rep_->props.prefix_extractor_name = + rep_->moptions.prefix_extractor != nullptr + ? 
rep_->moptions.prefix_extractor->Name() + : "nullptr"; std::string property_collectors_names = "["; for (size_t i = 0; @@ -789,6 +806,8 @@ } rep_->props.index_key_is_user_key = !rep_->index_builder->seperator_is_key_plus_seq(); + rep_->props.index_value_is_delta_encoded = + rep_->use_delta_encoding_for_index_values; rep_->props.creation_time = rep_->creation_time; rep_->props.oldest_key_time = rep_->oldest_key_time; diff -Nru rocksdb-5.15.10/table/block_based_table_factory.cc rocksdb-5.17.2/table/block_based_table_factory.cc --- rocksdb-5.15.10/table/block_based_table_factory.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_based_table_factory.cc 2018-11-12 19:57:32.000000000 +0000 @@ -27,10 +27,141 @@ #include "table/block_based_table_builder.h" #include "table/block_based_table_reader.h" #include "table/format.h" +#include "util/mutexlock.h" #include "util/string_util.h" namespace rocksdb { +void TailPrefetchStats::RecordEffectiveSize(size_t len) { + MutexLock l(&mutex_); + if (num_records_ < kNumTracked) { + num_records_++; + } + records_[next_++] = len; + if (next_ == kNumTracked) { + next_ = 0; + } +} + +size_t TailPrefetchStats::GetSuggestedPrefetchSize() { + std::vector sorted; + { + MutexLock l(&mutex_); + + if (num_records_ == 0) { + return 0; + } + sorted.assign(records_, records_ + num_records_); + } + + // Of the historic size, we find the maximum one that satisifis the condtiion + // that if prefetching all, less than 1/8 will be wasted. + std::sort(sorted.begin(), sorted.end()); + + // Assuming we have 5 data points, and after sorting it looks like this: + // + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // +---+ | | | | + // | | | | | | + // +---+ | | | | | | + // | | | | | | | | + // +---+ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // and we use every of the value as a candidate, and estimate how much we + // wasted, compared to read. For example, when we use the 3rd record + // as candiate. This area is what we read: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ***+ *** *** *** *** ** + // * | | | | | | + // +---+ | | | | | * + // * | | | | | | | | + // +---+ | | | | | | | * + // * | | | | X | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // *** *** ***-*** ***--*** ***--*** +**** + // which is (size of the record) X (number of records). + // + // While wasted is this area: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ****---+ | | | | + // * * | | | | | + // * *-*** *** | | | | | + // * * | | | | | | | + // *--** *** | | | | | | | + // | | | | | X | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // Which can be calculated iteratively. 
+ // The difference between wasted using 4st and 3rd record, will + // be following area: + // +---+ + // +--+ +-+ ++ +-+ +-+ +---+ | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // | xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // +-+ +-+ +-+ ++ +---+ +--+ | | | + // | | | | | | | + // +---+ ++ | | | | | | + // | | | | | | X | | | + // +---+ ++ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // which will be the size difference between 4st and 3rd record, + // times 3, which is number of records before the 4st. + // Here we assume that all data within the prefetch range will be useful. In + // reality, it may not be the case when a partial block is inside the range, + // or there are data in the middle that is not read. We ignore those cases + // for simplicity. + assert(!sorted.empty()); + size_t prev_size = sorted[0]; + size_t max_qualified_size = sorted[0]; + size_t wasted = 0; + for (size_t i = 1; i < sorted.size(); i++) { + size_t read = sorted[i] * sorted.size(); + wasted += (sorted[i] - prev_size) * i; + if (wasted <= read / 8) { + max_qualified_size = sorted[i]; + } + prev_size = sorted[i]; + } + const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB + return std::min(kMaxPrefetchSize, max_qualified_size); +} + BlockBasedTableFactory::BlockBasedTableFactory( const BlockBasedTableOptions& _table_options) : table_options_(_table_options) { @@ -71,7 +202,8 @@ table_options_, table_reader_options.internal_comparator, std::move(file), file_size, table_reader, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, - table_reader_options.level, table_reader_options.immortal); + table_reader_options.level, table_reader_options.immortal, + table_reader_options.largest_seqno, &tail_prefetch_stats_); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( @@ -127,6 +259,13 @@ return Status::InvalidArgument( "Block alignment requested but block size is not a power of 2"); } + if (table_options_.data_block_index_type == + BlockBasedTableOptions::kDataBlockBinaryAndHash && + table_options_.data_block_hash_table_util_ratio <= 0) { + return Status::InvalidArgument( + "data_block_hash_table_util_ratio should be greater than 0 when " + "data_block_index_type is set to kDataBlockBinaryAndHash"); + } return Status::OK(); } diff -Nru rocksdb-5.15.10/table/block_based_table_factory.h rocksdb-5.17.2/table/block_based_table_factory.h --- rocksdb-5.15.10/table/block_based_table_factory.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_based_table_factory.h 2018-11-12 19:57:32.000000000 +0000 @@ -26,6 +26,22 @@ using std::unique_ptr; class BlockBasedTableBuilder; +// A class used to track actual bytes written from the tail in the recent SST +// file opens, and provide a suggestion for following open. +class TailPrefetchStats { + public: + void RecordEffectiveSize(size_t len); + // 0 indicates no information to determine. 
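The comment above walks through how GetSuggestedPrefetchSize() picks the largest recorded tail size whose accumulated waste stays within 1/8 of the bytes read. A standalone sketch of that selection loop, rewritten outside RocksDB with invented sizes so the heuristic can be tried in isolation:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Sketch of the tail-prefetch heuristic: pick the largest recorded size
    // such that, if every open prefetched that much, the over-read ("wasted")
    // bytes would stay at or below 1/8 of the bytes read.
    size_t SuggestPrefetchSize(std::vector<size_t> sizes) {
      if (sizes.empty()) return 0;
      std::sort(sizes.begin(), sizes.end());
      size_t prev_size = sizes[0];
      size_t max_qualified_size = sizes[0];
      size_t wasted = 0;
      for (size_t i = 1; i < sizes.size(); i++) {
        size_t read = sizes[i] * sizes.size();
        // Extra bytes the i smaller opens would fetch at this candidate size.
        wasted += (sizes[i] - prev_size) * i;
        if (wasted <= read / 8) {
          max_qualified_size = sizes[i];
        }
        prev_size = sizes[i];
      }
      const size_t kMaxPrefetchSize = 512 * 1024;  // cap, as in the patch
      return std::min(kMaxPrefetchSize, max_qualified_size);
    }

    int main() {
      // Five invented tail sizes (bytes) from recent SST file opens.
      std::cout << SuggestPrefetchSize({4096, 4608, 5120, 65536, 262144}) << "\n";
      return 0;
    }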
+ size_t GetSuggestedPrefetchSize(); + + private: + const static size_t kNumTracked = 32; + size_t records_[kNumTracked]; + port::Mutex mutex_; + size_t next_ = 0; + size_t num_records_ = 0; +}; + class BlockBasedTableFactory : public TableFactory { public: explicit BlockBasedTableFactory( @@ -64,6 +80,7 @@ private: BlockBasedTableOptions table_options_; + mutable TailPrefetchStats tail_prefetch_stats_; }; extern const std::string kHashIndexPrefixesBlock; @@ -106,6 +123,14 @@ {"hash_index_allow_collision", {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"data_block_index_type", + {offsetof(struct BlockBasedTableOptions, data_block_index_type), + OptionType::kBlockBasedTableDataBlockIndexType, + OptionVerificationType::kNormal, false, 0}}, + {"data_block_hash_table_util_ratio", + {offsetof(struct BlockBasedTableOptions, + data_block_hash_table_util_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, false, 0}}, {"checksum", {offsetof(struct BlockBasedTableOptions, checksum), OptionType::kChecksumType, OptionVerificationType::kNormal, false, diff -Nru rocksdb-5.15.10/table/block_based_table_reader.cc rocksdb-5.17.2/table/block_based_table_reader.cc --- rocksdb-5.15.10/table/block_based_table_reader.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_based_table_reader.cc 2018-11-12 19:57:32.000000000 +0000 @@ -9,6 +9,7 @@ #include "table/block_based_table_reader.h" #include +#include #include #include #include @@ -137,6 +138,8 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, Tickers block_cache_miss_ticker, Tickers block_cache_hit_ticker, + uint64_t* block_cache_miss_stats, + uint64_t* block_cache_hit_stats, Statistics* statistics, GetContext* get_context) { auto cache_handle = block_cache->Lookup(key, statistics); @@ -144,12 +147,12 @@ PERF_COUNTER_ADD(block_cache_hit_count, 1); if (get_context != nullptr) { // overall cache hit - get_context->RecordCounters(BLOCK_CACHE_HIT, 1); + get_context->get_context_stats_.num_cache_hit++; // total bytes read from cache - get_context->RecordCounters(BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(cache_handle)); + get_context->get_context_stats_.num_cache_bytes_read += + block_cache->GetUsage(cache_handle); // block-type specific cache hit - get_context->RecordCounters(block_cache_hit_ticker, 1); + (*block_cache_hit_stats)++; } else { // overall cache hit RecordTick(statistics, BLOCK_CACHE_HIT); @@ -161,9 +164,9 @@ } else { if (get_context != nullptr) { // overall cache miss - get_context->RecordCounters(BLOCK_CACHE_MISS, 1); + get_context->get_context_stats_.num_cache_miss++; // block-type specific cache miss - get_context->RecordCounters(block_cache_miss_ticker, 1); + (*block_cache_miss_stats)++; } else { RecordTick(statistics, BLOCK_CACHE_MISS); RecordTick(statistics, block_cache_miss_ticker); @@ -211,7 +214,8 @@ const InternalKeyComparator* icomparator, IndexReader** index_reader, const PersistentCacheOptions& cache_options, - const int level, const bool index_key_includes_seq) { + const int level, const bool index_key_includes_seq, + const bool index_value_is_full) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -222,36 +226,37 @@ if (s.ok()) { *index_reader = new PartitionIndexReader( table, icomparator, std::move(index_block), ioptions.statistics, - level, index_key_includes_seq); + level, index_key_includes_seq, 
index_value_is_full); } return s; } // return a two-level iterator: first level is on the partition index - virtual InternalIterator* NewIterator(IndexBlockIter* /*iter*/ = nullptr, - bool /*dont_care*/ = true, - bool fill_cache = true) override { + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* /*iter*/ = nullptr, bool /*dont_care*/ = true, + bool fill_cache = true) override { Statistics* kNullStats = nullptr; // Filters are already checked before seeking the index if (!partition_map_.empty()) { return NewTwoLevelIterator( new BlockBasedTable::PartitionedIndexIteratorState( - table_, &partition_map_, index_key_includes_seq_), + table_, &partition_map_, index_key_includes_seq_, + index_value_is_full_), index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_)); + kNullStats, true, index_key_includes_seq_, index_value_is_full_)); } else { auto ro = ReadOptions(); ro.fill_cache = fill_cache; bool kIsIndex = true; - return new BlockBasedTableIterator( + return new BlockBasedTableIterator( table_, ro, *icomparator_, index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_), + kNullStats, true, index_key_includes_seq_, index_value_is_full_), false, true, /* prefix_extractor */ nullptr, kIsIndex, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); } // TODO(myabandeh): Update TwoLevelIterator to be able to make use of // on-stack BlockIter while the state is on heap. Currentlly it assumes @@ -267,7 +272,7 @@ Statistics* kNullStats = nullptr; index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); @@ -275,14 +280,7 @@ // Empty index. return; } - Slice input = biter.value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read first index partition"); - return; - } + handle = biter.value(); uint64_t prefetch_off = handle.offset(); // Read the last block's offset @@ -291,36 +289,21 @@ // Empty index. 
return; } - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read last index partition"); - return; - } + handle = biter.value(); uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; auto& file = table_->rep_->file; prefetch_buffer.reset(new FilePrefetchBuffer()); - s = prefetch_buffer->Prefetch(file.get(), prefetch_off, - static_cast(prefetch_len)); + Status s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + static_cast(prefetch_len)); // After prefetch, read the partitions one by one biter.SeekToFirst(); auto ro = ReadOptions(); Cache* block_cache = rep->table_options.block_cache.get(); for (; biter.Valid(); biter.Next()) { - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read index partition"); - continue; - } - + handle = biter.value(); BlockBasedTable::CachableEntry block; Slice compression_dict; if (rep->compression_dict_block) { @@ -371,11 +354,13 @@ PartitionIndexReader(BlockBasedTable* table, const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, Statistics* stats, - const int /*level*/, const bool index_key_includes_seq) + const int /*level*/, const bool index_key_includes_seq, + const bool index_value_is_full) : IndexReader(icomparator, stats), table_(table), index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } BlockBasedTable* table_; @@ -383,6 +368,7 @@ std::unordered_map> partition_map_; const bool index_key_includes_seq_; + const bool index_value_is_full_; }; // Index that allows binary search lookup for the first key of each block. 
@@ -401,7 +387,8 @@ const InternalKeyComparator* icomparator, IndexReader** index_reader, const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq) { + const bool index_key_includes_seq, + const bool index_value_is_full) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -412,19 +399,19 @@ if (s.ok()) { *index_reader = new BinarySearchIndexReader( icomparator, std::move(index_block), ioptions.statistics, - index_key_includes_seq); + index_key_includes_seq, index_value_is_full); } return s; } - virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr, - bool /*dont_care*/ = true, - bool /*dont_care*/ = true) override { + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool /*dont_care*/ = true, + bool /*dont_care*/ = true) override { Statistics* kNullStats = nullptr; return index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), iter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); } virtual size_t size() const override { return index_block_->size(); } @@ -446,31 +433,32 @@ private: BinarySearchIndexReader(const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, - Statistics* stats, const bool index_key_includes_seq) + Statistics* stats, const bool index_key_includes_seq, + const bool index_value_is_full) : IndexReader(icomparator, stats), index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } std::unique_ptr index_block_; const bool index_key_includes_seq_; + const bool index_value_is_full_; }; // Index that leverages an internal hash table to quicken the lookup for a given // key. class HashIndexReader : public IndexReader { public: - static Status Create(const SliceTransform* hash_key_extractor, - const Footer& footer, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, - const BlockHandle& index_handle, - InternalIterator* meta_index_iter, - IndexReader** index_reader, - bool /*hash_index_allow_collision*/, - const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq) { + static Status Create( + const SliceTransform* hash_key_extractor, const Footer& footer, + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icomparator, const BlockHandle& index_handle, + InternalIterator* meta_index_iter, IndexReader** index_reader, + bool /*hash_index_allow_collision*/, + const PersistentCacheOptions& cache_options, + const bool index_key_includes_seq, const bool index_value_is_full) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, @@ -486,9 +474,9 @@ // hard error. We can still fall back to the original binary search index. // So, Create will succeed regardless, from this point on. 
- auto new_index_reader = - new HashIndexReader(icomparator, std::move(index_block), - ioptions.statistics, index_key_includes_seq); + auto new_index_reader = new HashIndexReader( + icomparator, std::move(index_block), ioptions.statistics, + index_key_includes_seq, index_value_is_full); *index_reader = new_index_reader; // Get prefixes block @@ -542,13 +530,14 @@ return Status::OK(); } - virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr, - bool total_order_seek = true, - bool /*dont_care*/ = true) override { + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool total_order_seek = true, + bool /*dont_care*/ = true) override { Statistics* kNullStats = nullptr; return index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), iter, kNullStats, - total_order_seek, index_key_includes_seq_, prefix_index_.get()); + total_order_seek, index_key_includes_seq_, index_value_is_full_, + prefix_index_.get()); } virtual size_t size() const override { return index_block_->size(); } @@ -574,10 +563,12 @@ private: HashIndexReader(const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, Statistics* stats, - const bool index_key_includes_seq) + const bool index_key_includes_seq, + const bool index_value_is_full) : IndexReader(icomparator, stats), index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } @@ -588,6 +579,7 @@ std::unique_ptr prefix_index_; BlockContents prefixes_contents_; const bool index_key_includes_seq_; + const bool index_value_is_full_; }; // Helper function to setup the cache key's prefix for the Table. @@ -661,51 +653,71 @@ return true; } -SequenceNumber GetGlobalSequenceNumber(const TableProperties& table_properties, - Logger* info_log) { - auto& props = table_properties.user_collected_properties; - - auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); - auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); +// Caller has to ensure seqno is not nullptr. +Status GetGlobalSequenceNumber(const TableProperties& table_properties, + SequenceNumber largest_seqno, + SequenceNumber* seqno) { + const auto& props = table_properties.user_collected_properties; + const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); + const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); + *seqno = kDisableGlobalSequenceNumber; if (version_pos == props.end()) { if (seqno_pos != props.end()) { + std::array msg_buf; // This is not an external sst file, global_seqno is not supported. - assert(false); - ROCKS_LOG_ERROR( - info_log, + snprintf( + msg_buf.data(), msg_buf.max_size(), "A non-external sst file have global seqno property with value %s", seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); } - return kDisableGlobalSequenceNumber; + return Status::OK(); } uint32_t version = DecodeFixed32(version_pos->second.c_str()); if (version < 2) { if (seqno_pos != props.end() || version != 1) { + std::array msg_buf; // This is a v1 external sst file, global_seqno is not supported. 
- assert(false); - ROCKS_LOG_ERROR( - info_log, - "An external sst file with version %u have global seqno property " - "with value %s", - version, seqno_pos->second.c_str()); + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno " + "property with value %s", + version, seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); } - return kDisableGlobalSequenceNumber; + return Status::OK(); } - SequenceNumber global_seqno = DecodeFixed64(seqno_pos->second.c_str()); + // Since we have a plan to deprecate global_seqno, we do not return failure + // if seqno_pos == props.end(). We rely on version_pos to detect whether the + // SST is external. + SequenceNumber global_seqno(0); + if (seqno_pos != props.end()) { + global_seqno = DecodeFixed64(seqno_pos->second.c_str()); + } + if (global_seqno != 0 && global_seqno != largest_seqno) { + std::array msg_buf; + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %s, while largest seqno in the file is %llu", + version, seqno_pos->second.c_str(), + static_cast(largest_seqno)); + return Status::Corruption(msg_buf.data()); + } + global_seqno = largest_seqno; + *seqno = largest_seqno; if (global_seqno > kMaxSequenceNumber) { - assert(false); - ROCKS_LOG_ERROR( - info_log, - "An external sst file with version %u have global seqno property " - "with value %llu, which is greater than kMaxSequenceNumber", - version, global_seqno); + std::array msg_buf; + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %llu, which is greater than kMaxSequenceNumber", + version, static_cast(global_seqno)); + return Status::Corruption(msg_buf.data()); } - return global_seqno; + return Status::OK(); } } // namespace @@ -731,7 +743,9 @@ const SliceTransform* prefix_extractor, const bool prefetch_index_and_filter_in_cache, const bool skip_filters, const int level, - const bool immortal_table) { + const bool immortal_table, + const SequenceNumber largest_seqno, + TailPrefetchStats* tail_prefetch_stats) { table_reader->reset(); Footer footer; @@ -741,29 +755,40 @@ // prefetch both index and filters, down to all partitions const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; const bool preload_all = !table_options.cache_index_and_filter_blocks; - // Before read footer, readahead backwards to prefetch data. Do more readahead - // if we're going to read index/filter. - // TODO: This may incorrectly select small readahead in case partitioned - // index/filter is enabled and top-level partition pinning is enabled. That's - // because we need to issue readahead before we read the properties, at which - // point we don't yet know the index type. - const size_t kTailPrefetchSize = - prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + + size_t tail_prefetch_size = 0; + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes. + tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. + // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. 
+ // That's because we need to issue readahead before we read the properties, + // at which point we don't yet know the index type. + tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + } size_t prefetch_off; size_t prefetch_len; - if (file_size < kTailPrefetchSize) { + if (file_size < tail_prefetch_size) { prefetch_off = 0; prefetch_len = static_cast(file_size); } else { - prefetch_off = static_cast(file_size - kTailPrefetchSize); - prefetch_len = kTailPrefetchSize; + prefetch_off = static_cast(file_size - tail_prefetch_size); + prefetch_len = tail_prefetch_size; } + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", + &tail_prefetch_size); Status s; // TODO should not have this special logic in the future. if (!file->use_direct_io()) { + prefetch_buffer.reset(new FilePrefetchBuffer(nullptr, 0, 0, false, true)); s = file->Prefetch(prefetch_off, prefetch_len); } else { - prefetch_buffer.reset(new FilePrefetchBuffer()); + prefetch_buffer.reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true)); s = prefetch_buffer->Prefetch(file.get(), prefetch_off, prefetch_len); } s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, @@ -922,8 +947,12 @@ *(rep->table_properties), BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log); - rep->global_seqno = GetGlobalSequenceNumber(*(rep->table_properties), - rep->ioptions.info_log); + s = GetGlobalSequenceNumber(*(rep->table_properties), largest_seqno, + &(rep->global_seqno)); + if (!s.ok()) { + ROCKS_LOG_ERROR(rep->ioptions.info_log, "%s", s.ToString().c_str()); + return s; + } } // Read the range del meta block @@ -990,8 +1019,9 @@ bool disable_prefix_seek = rep->index_type == BlockBasedTableOptions::kHashSearch && need_upper_bound_check; - unique_ptr iter(new_table->NewIndexIterator( - ReadOptions(), disable_prefix_seek, nullptr, &index_entry)); + unique_ptr> iter( + new_table->NewIndexIterator(ReadOptions(), disable_prefix_seek, + nullptr, &index_entry)); s = iter->status(); if (s.ok()) { // This is the first call to NewIndexIterator() since we're in Open(). @@ -1060,6 +1090,12 @@ } if (s.ok()) { + assert(prefetch_buffer.get() != nullptr); + if (tail_prefetch_stats != nullptr) { + assert(prefetch_buffer->min_offset_read() < file_size); + tail_prefetch_stats->RecordEffectiveSize( + static_cast(file_size) - prefetch_buffer->min_offset_read()); + } *table_reader = std::move(new_table); } @@ -1148,8 +1184,16 @@ block->cache_handle = GetEntryFromCache( block_cache, block_cache_key, is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS, - is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, statistics, - get_context); + is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, + get_context + ? (is_index ? &get_context->get_context_stats_.num_cache_index_miss + : &get_context->get_context_stats_.num_cache_data_miss) + : nullptr, + get_context + ? (is_index ? 
&get_context->get_context_stats_.num_cache_index_hit + : &get_context->get_context_stats_.num_cache_data_hit) + : nullptr, + statistics, get_context); if (block->cache_handle != nullptr) { block->value = reinterpret_cast(block_cache->Value(block->cache_handle)); @@ -1204,24 +1248,26 @@ block_cache->TEST_mark_as_data_block(block_cache_key, charge); if (s.ok()) { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, charge); + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += charge; } else { RecordTick(statistics, BLOCK_CACHE_ADD); RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); } if (is_index) { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_INDEX_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_INDEX_BYTES_INSERT, charge); + get_context->get_context_stats_.num_cache_index_add++; + get_context->get_context_stats_.num_cache_index_bytes_insert += + charge; } else { RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); } } else { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_DATA_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_DATA_BYTES_INSERT, charge); + get_context->get_context_stats_.num_cache_data_add++; + get_context->get_context_stats_.num_cache_data_bytes_insert += + charge; } else { RecordTick(statistics, BLOCK_CACHE_DATA_ADD); RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); @@ -1303,24 +1349,25 @@ if (s.ok()) { assert(block->cache_handle != nullptr); if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, charge); + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += charge; } else { RecordTick(statistics, BLOCK_CACHE_ADD); RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); } if (is_index) { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_INDEX_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_INDEX_BYTES_INSERT, charge); + get_context->get_context_stats_.num_cache_index_add++; + get_context->get_context_stats_.num_cache_index_bytes_insert += + charge; } else { RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); } } else { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_DATA_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_DATA_BYTES_INSERT, charge); + get_context->get_context_stats_.num_cache_data_add++; + get_context->get_context_stats_.num_cache_data_bytes_insert += charge; } else { RecordTick(statistics, BLOCK_CACHE_DATA_ADD); RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); @@ -1378,7 +1425,9 @@ rep->whole_key_filtering, std::move(block), nullptr, rep->ioptions.statistics, rep->internal_comparator, this, rep_->table_properties == nullptr || - !rep_->table_properties->index_key_is_user_key); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } case Rep::FilterType::kBlockFilter: @@ -1445,9 +1494,13 @@ filter_blk_handle, cache_key); Statistics* statistics = rep_->ioptions.statistics; - auto cache_handle = - GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS, - BLOCK_CACHE_FILTER_HIT, statistics, get_context); + auto cache_handle = 
GetEntryFromCache( + block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, + get_context ? &get_context->get_context_stats_.num_cache_filter_miss + : nullptr, + get_context ? &get_context->get_context_stats_.num_cache_filter_hit + : nullptr, + statistics, get_context); FilterBlockReader* filter = nullptr; if (cache_handle != nullptr) { @@ -1468,10 +1521,11 @@ : Cache::Priority::LOW); if (s.ok()) { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, usage); - get_context->RecordCounters(BLOCK_CACHE_FILTER_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_FILTER_BYTES_INSERT, usage); + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += usage; + get_context->get_context_stats_.num_cache_filter_add++; + get_context->get_context_stats_.num_cache_filter_bytes_insert += + usage; } else { RecordTick(statistics, BLOCK_CACHE_ADD); RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); @@ -1491,7 +1545,7 @@ // disable_prefix_seek should be set to true when prefix_extractor found in SST // differs from the one in mutable_cf_options and index type is HashBasedIndex -InternalIterator* BlockBasedTable::NewIndexIterator( +InternalIteratorBase* BlockBasedTable::NewIndexIterator( const ReadOptions& read_options, bool disable_prefix_seek, IndexBlockIter* input_iter, CachableEntry* index_entry, GetContext* get_context) { @@ -1517,16 +1571,21 @@ GetCacheKeyFromOffset(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->dummy_index_reader_offset, cache_key); Statistics* statistics = rep_->ioptions.statistics; - auto cache_handle = - GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS, - BLOCK_CACHE_INDEX_HIT, statistics, get_context); + auto cache_handle = GetEntryFromCache( + block_cache, key, BLOCK_CACHE_INDEX_MISS, BLOCK_CACHE_INDEX_HIT, + get_context ? &get_context->get_context_stats_.num_cache_index_miss + : nullptr, + get_context ? &get_context->get_context_stats_.num_cache_index_hit + : nullptr, + statistics, get_context); if (cache_handle == nullptr && no_io) { if (input_iter != nullptr) { input_iter->Invalidate(Status::Incomplete("no blocking io")); return input_iter; } else { - return NewErrorInternalIterator(Status::Incomplete("no blocking io")); + return NewErrorInternalIterator( + Status::Incomplete("no blocking io")); } } @@ -1555,8 +1614,8 @@ if (s.ok()) { if (get_context != nullptr) { - get_context->RecordCounters(BLOCK_CACHE_ADD, 1); - get_context->RecordCounters(BLOCK_CACHE_BYTES_WRITE, charge); + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += charge; } else { RecordTick(statistics, BLOCK_CACHE_ADD); RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); @@ -1573,7 +1632,7 @@ input_iter->Invalidate(s); return input_iter; } else { - return NewErrorInternalIterator(s); + return NewErrorInternalIterator(s); } } @@ -1594,21 +1653,6 @@ return iter; } -template -TBlockIter* BlockBasedTable::NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const Slice& index_value, - TBlockIter* input_iter, bool is_index, bool key_includes_seq, - GetContext* get_context, FilePrefetchBuffer* prefetch_buffer) { - BlockHandle handle; - Slice input = index_value; - // We intentionally allow extra stuff in index_value so that we - // can add more features in the future. 
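The overload being removed here used to parse a BlockHandle out of the raw index value; callers now receive a decoded BlockHandle directly from IndexBlockIter::value(). For reference, a handle is simply two varint64s (offset, size) at the front of the value, with any trailing bytes reserved for future extensions. A self-contained sketch of that decoding, with DecodeVarint64 standing in for the GetVarint64Ptr helper in util/coding.h:

    #include <cstddef>
    #include <cstdint>

    // LEB128-style varint64 decode; returns bytes consumed, or 0 on truncation.
    inline size_t DecodeVarint64(const char* p, const char* limit, uint64_t* out) {
      uint64_t result = 0;
      for (uint32_t shift = 0; shift <= 63; shift += 7) {
        if (p + shift / 7 >= limit) return 0;
        uint8_t byte = static_cast<uint8_t>(p[shift / 7]);
        result |= static_cast<uint64_t>(byte & 0x7f) << shift;
        if ((byte & 0x80) == 0) {
          *out = result;
          return shift / 7 + 1;
        }
      }
      return 0;  // malformed: more than 10 bytes with continuation bits set
    }

    struct SimpleHandle {
      uint64_t offset = 0;
      uint64_t size = 0;
    };

    // Decode (offset, size) from the start of an index value; extra trailing
    // bytes are intentionally ignored so the format can grow new fields.
    inline bool DecodeHandle(const char* p, size_t len, SimpleHandle* h) {
      const char* limit = p + len;
      size_t n = DecodeVarint64(p, limit, &h->offset);
      if (n == 0) return false;
      size_t m = DecodeVarint64(p + n, limit, &h->size);
      return m != 0;
    }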
- Status s = handle.DecodeFrom(&input); - return NewDataBlockIterator(rep, ro, handle, input_iter, is_index, - key_includes_seq, get_context, s, - prefetch_buffer); -} - // Convert an index iterator value (i.e., an encoded BlockHandle) // into an iterator over the contents of the corresponding block. // If input_iter is null, new a iterator @@ -1617,7 +1661,8 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, bool is_index, bool key_includes_seq, - GetContext* get_context, Status s, FilePrefetchBuffer* prefetch_buffer) { + bool index_key_is_full, GetContext* get_context, Status s, + FilePrefetchBuffer* prefetch_buffer) { PERF_TIMER_GUARD(new_table_block_iter_nanos); const bool no_io = (ro.read_tier == kBlockCacheTier); @@ -1667,7 +1712,8 @@ const bool kTotalOrderSeek = true; iter = block.value->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), - iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq); + iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, + index_key_is_full); if (block.cache_handle != nullptr) { iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, block.cache_handle); @@ -1782,22 +1828,20 @@ BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( BlockBasedTable* table, std::unordered_map>* block_map, - bool index_key_includes_seq) + bool index_key_includes_seq, bool index_key_is_full) : table_(table), block_map_(block_map), - index_key_includes_seq_(index_key_includes_seq) {} + index_key_includes_seq_(index_key_includes_seq), + index_key_is_full_(index_key_is_full) {} -template -const size_t BlockBasedTableIterator::kMaxReadaheadSize = +template +const size_t BlockBasedTableIterator::kMaxReadaheadSize = 256 * 1024; -InternalIterator* +InternalIteratorBase* BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( - const Slice& index_value) { + const BlockHandle& handle) { // Return a block iterator on the index partition - BlockHandle handle; - Slice input = index_value; - Status s = handle.DecodeFrom(&input); auto rep = table_->get_rep(); auto block = block_map_->find(handle.offset()); // This is a possible scenario since block cache might not have had space @@ -1813,10 +1857,10 @@ Statistics* kNullStats = nullptr; return block->second.value->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq_); + nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); } // Create an empty iterator - return new DataBlockIter(); + return new IndexBlockIter(); } // This will be broken if the user specifies an unusual implementation @@ -1889,7 +1933,7 @@ // Then, try find it within each block // we already know prefix_extractor and prefix_extractor_name must match // because `CheckPrefixMayMatch` first checks `check_filter_ == true` - unique_ptr iiter( + unique_ptr> iiter( NewIndexIterator(no_io_read_options, /* need_upper_bound_check */ false)); iiter->Seek(internal_prefix); @@ -1922,10 +1966,7 @@ // after the data block corresponding to iiter->key() cannot // possibly contain the key. Thus, the corresponding data block // is the only on could potentially contain the prefix. 
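The BLOCK_CACHE_* counter changes scattered through the hunks above follow one pattern: when a GetContext is available, the read path bumps plain per-read fields on get_context_stats_ and defers the shared ticker updates, instead of touching the Statistics object on every block access. A rough sketch of that batching idea (the struct and field names below are illustrative, not the RocksDB API):

    #include <atomic>
    #include <cstdint>

    struct SharedTickers {
      std::atomic<uint64_t> block_cache_index_hit{0};
      std::atomic<uint64_t> block_cache_index_miss{0};
    };

    struct PerReadStats {
      uint64_t num_cache_index_hit = 0;
      uint64_t num_cache_index_miss = 0;

      // One relaxed atomic add per counter per read, instead of one per lookup.
      void FlushTo(SharedTickers* tickers) const {
        tickers->block_cache_index_hit.fetch_add(num_cache_index_hit,
                                                 std::memory_order_relaxed);
        tickers->block_cache_index_miss.fetch_add(num_cache_index_miss,
                                                  std::memory_order_relaxed);
      }
    };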
- Slice handle_value = iiter->value(); - BlockHandle handle; - s = handle.DecodeFrom(&handle_value); - assert(s.ok()); + BlockHandle handle = iiter->value(); may_match = filter->PrefixMayMatch(prefix, prefix_extractor, handle.offset()); } @@ -1949,8 +1990,8 @@ return may_match; } -template -void BlockBasedTableIterator::Seek(const Slice& target) { +template +void BlockBasedTableIterator::Seek(const Slice& target) { is_out_of_bound_ = false; if (!CheckPrefixMayMatch(target)) { ResetDataIter(); @@ -1979,8 +2020,9 @@ block_iter_.key()) <= 0)); } -template -void BlockBasedTableIterator::SeekForPrev(const Slice& target) { +template +void BlockBasedTableIterator::SeekForPrev( + const Slice& target) { is_out_of_bound_ = false; if (!CheckPrefixMayMatch(target)) { ResetDataIter(); @@ -2022,8 +2064,8 @@ icomp_.Compare(target, block_iter_.key()) >= 0); } -template -void BlockBasedTableIterator::SeekToFirst() { +template +void BlockBasedTableIterator::SeekToFirst() { is_out_of_bound_ = false; SavePrevIndexValue(); index_iter_->SeekToFirst(); @@ -2036,8 +2078,8 @@ FindKeyForward(); } -template -void BlockBasedTableIterator::SeekToLast() { +template +void BlockBasedTableIterator::SeekToLast() { is_out_of_bound_ = false; SavePrevIndexValue(); index_iter_->SeekToLast(); @@ -2050,32 +2092,30 @@ FindKeyBackward(); } -template -void BlockBasedTableIterator::Next() { +template +void BlockBasedTableIterator::Next() { assert(block_iter_points_to_real_block_); block_iter_.Next(); FindKeyForward(); } -template -void BlockBasedTableIterator::Prev() { +template +void BlockBasedTableIterator::Prev() { assert(block_iter_points_to_real_block_); block_iter_.Prev(); FindKeyBackward(); } -template -void BlockBasedTableIterator::InitDataBlock() { - BlockHandle data_block_handle; - Slice handle_slice = index_iter_->value(); +template +void BlockBasedTableIterator::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value(); if (!block_iter_points_to_real_block_ || - handle_slice.compare(prev_index_value_) != 0 || + data_block_handle.offset() != prev_index_value_.offset() || // if previous attempt of reading the block missed cache, try again block_iter_.status().IsIncomplete()) { if (block_iter_points_to_real_block_) { ResetDataIter(); } - Status s = data_block_handle.DecodeFrom(&handle_slice); auto* rep = table_->get_rep(); // Automatically prefetch additional data when a range scan (iterator) does @@ -2107,16 +2147,17 @@ } } + Status s; BlockBasedTable::NewDataBlockIterator( rep, read_options_, data_block_handle, &block_iter_, is_index_, - key_includes_seq_, + key_includes_seq_, index_key_is_full_, /* get_context */ nullptr, s, prefetch_buffer_.get()); block_iter_points_to_real_block_ = true; } } -template -void BlockBasedTableIterator::FindKeyForward() { +template +void BlockBasedTableIterator::FindKeyForward() { assert(!is_out_of_bound_); // TODO the while loop inherits from two-level-iterator. We don't know // whether a block can be empty so it can be replaced by an "if". @@ -2155,8 +2196,8 @@ } } -template -void BlockBasedTableIterator::FindKeyBackward() { +template +void BlockBasedTableIterator::FindKeyBackward() { assert(!is_out_of_bound_); while (!block_iter_.Valid()) { if (!block_iter_.status().ok()) { @@ -2231,11 +2272,10 @@ return iter; } } - std::string str; - rep_->range_del_handle.EncodeTo(&str); // The meta-block exists but isn't in uncompressed block cache (maybe // because it is disabled), so go through the full lookup process. 
- return NewDataBlockIterator(rep_, read_options, Slice(str)); + return NewDataBlockIterator(rep_, read_options, + rep_->range_del_handle); } bool BlockBasedTable::FullFilterKeyMayMatch( @@ -2298,7 +2338,7 @@ auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, /* index_entry */ nullptr, get_context); - std::unique_ptr iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); } @@ -2306,12 +2346,10 @@ bool matched = false; // if such user key mathced a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - Slice handle_value = iiter->value(); + BlockHandle handle = iiter->value(); - BlockHandle handle; bool not_exist_in_filter = filter != nullptr && filter->IsBlockBased() == true && - handle.DecodeFrom(&handle_value).ok() && !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor, handle.offset(), no_io); @@ -2340,8 +2378,17 @@ break; } + bool may_exist = biter.SeekForGet(key); + if (!may_exist) { + // HashSeek cannot find the key this block and the the iter is not + // the end of the block, i.e. cannot be in the following blocks + // either. In this case, the seek_key cannot be found, so we break + // from the top level for-loop. + break; + } + // Call the *saver function on each entry/block until it returns false - for (biter.Seek(key); biter.Valid(); biter.Next()) { + for (; biter.Valid(); biter.Next()) { ParsedInternalKey parsed_key; if (!ParseInternalKey(biter.key(), &parsed_key)) { s = Status::Corruption(Slice()); @@ -2389,9 +2436,10 @@ IndexBlockIter iiter_on_stack; auto iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); - std::unique_ptr iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = std::unique_ptr(iiter); + iiter_unique_ptr = + std::unique_ptr>(iiter); } if (!iiter->status().ok()) { @@ -2404,7 +2452,7 @@ for (begin ? 
iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) { - Slice block_handle = iiter->value(); + BlockHandle block_handle = iiter->value(); const bool is_user_key = rep_->table_properties && rep_->table_properties->index_key_is_user_key > 0; if (end && @@ -2450,11 +2498,12 @@ } // Check Data blocks IndexBlockIter iiter_on_stack; - InternalIterator* iiter = + InternalIteratorBase* iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); - std::unique_ptr iiter_unique_ptr; + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = std::unique_ptr(iiter); + iiter_unique_ptr = + std::unique_ptr>(iiter); } if (!iiter->status().ok()) { // error opening index iterator @@ -2464,19 +2513,41 @@ return s; } -Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) { +Status BlockBasedTable::VerifyChecksumInBlocks( + InternalIteratorBase* index_iter) { Status s; for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { s = index_iter->status(); if (!s.ok()) { break; } - BlockHandle handle; - Slice input = index_iter->value(); - s = handle.DecodeFrom(&input); + BlockHandle handle = index_iter->value(); + BlockContents contents; + Slice dummy_comp_dict; + BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */, + rep_->footer, ReadOptions(), handle, &contents, + rep_->ioptions, false /* decompress */, + dummy_comp_dict /*compression dict*/, + rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + break; + } + } + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks( + InternalIteratorBase* index_iter) { + Status s; + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); if (!s.ok()) { break; } + BlockHandle handle; + Slice input = index_iter->value(); + s = handle.DecodeFrom(&input); BlockContents contents; Slice dummy_comp_dict; BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */, @@ -2494,15 +2565,13 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { - std::unique_ptr iiter(NewIndexIterator(options)); + std::unique_ptr> iiter( + NewIndexIterator(options)); iiter->Seek(key); assert(iiter->Valid()); CachableEntry block; - BlockHandle handle; - Slice input = iiter->value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); + BlockHandle handle = iiter->value(); Cache* block_cache = rep_->table_options.block_cache.get(); assert(block_cache != nullptr); @@ -2512,6 +2581,7 @@ cache_key_storage); Slice ckey; + Status s; s = GetDataBlockFromCache( cache_key, ckey, block_cache, nullptr, rep_->ioptions, options, &block, rep_->table_options.format_version, @@ -2572,14 +2642,18 @@ rep_->ioptions, icomparator, index_reader, rep_->persistent_cache_options, level, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } case BlockBasedTableOptions::kBinarySearch: { return BinarySearchIndexReader::Create( file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, icomparator, index_reader, rep_->persistent_cache_options, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + 
rep_->table_properties->index_value_is_delta_encoded == 0); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; @@ -2599,7 +2673,9 @@ rep_->ioptions, icomparator, index_reader, rep_->persistent_cache_options, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } meta_index_iter = meta_iter_guard.get(); } @@ -2610,7 +2686,9 @@ index_reader, rep_->hash_index_allow_collision, rep_->persistent_cache_options, rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0); + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } default: { std::string error_message = @@ -2621,22 +2699,14 @@ } uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { - unique_ptr index_iter(NewIndexIterator(ReadOptions())); + unique_ptr> index_iter( + NewIndexIterator(ReadOptions())); index_iter->Seek(key); uint64_t result; if (index_iter->Valid()) { - BlockHandle handle; - Slice input = index_iter->value(); - Status s = handle.DecodeFrom(&input); - if (s.ok()) { - result = handle.offset(); - } else { - // Strange: we can't decode the block handle in the index block. - // We'll just return the offset of the metaindex block, which is - // close to the whole file size for this case. - result = rep_->footer.metaindex_handle().offset(); - } + BlockHandle handle = index_iter->value(); + result = handle.offset(); } else { // key is past the last key in the file. If table_properties is not // available, approximate the offset by returning the offset of the @@ -2663,7 +2733,7 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( std::vector* kv_pair_blocks) { - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); @@ -2770,32 +2840,32 @@ " "); out_file->Append(table_properties->ToString("\n ", ": ").c_str()); out_file->Append("\n"); - } - // Output Filter blocks - if (!rep_->filter && !table_properties->filter_policy_name.empty()) { - // Support only BloomFilter as off now - rocksdb::BlockBasedTableOptions table_options; - table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); - if (table_properties->filter_policy_name.compare( - table_options.filter_policy->Name()) == 0) { - std::string filter_block_key = kFilterBlockPrefix; - filter_block_key.append(table_properties->filter_policy_name); - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { - BlockContents block; - Slice dummy_comp_dict; - BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, - ReadOptions(), handle, &block, rep_->ioptions, false /*decompress*/, - dummy_comp_dict /*compression dict*/, - rep_->persistent_cache_options); - s = block_fetcher.ReadBlockContents(); - if (!s.ok()) { - rep_->filter.reset(new BlockBasedFilterBlockReader( - prefix_extractor, table_options, - table_options.whole_key_filtering, std::move(block), - rep_->ioptions.statistics)); + // Output Filter blocks + if (!rep_->filter && !table_properties->filter_policy_name.empty()) { + // Support only BloomFilter as off now + rocksdb::BlockBasedTableOptions table_options; + table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); + if 
(table_properties->filter_policy_name.compare( + table_options.filter_policy->Name()) == 0) { + std::string filter_block_key = kFilterBlockPrefix; + filter_block_key.append(table_properties->filter_policy_name); + BlockHandle handle; + if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { + BlockContents block; + Slice dummy_comp_dict; + BlockFetcher block_fetcher( + rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, + ReadOptions(), handle, &block, rep_->ioptions, + false /*decompress*/, dummy_comp_dict /*compression dict*/, + rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + rep_->filter.reset(new BlockBasedFilterBlockReader( + prefix_extractor, table_options, + table_options.whole_key_filtering, std::move(block), + rep_->ioptions.statistics)); + } } } } @@ -2878,7 +2948,7 @@ out_file->Append( "Index Details:\n" "--------------------------------------\n"); - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); if (!s.ok()) { @@ -2927,7 +2997,7 @@ } Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); if (!s.ok()) { @@ -2947,9 +3017,7 @@ break; } - Slice bh_val = blockhandles_iter->value(); - BlockHandle bh; - bh.DecodeFrom(&bh_val); + BlockHandle bh = blockhandles_iter->value(); uint64_t datablock_size = bh.size(); datablock_size_min = std::min(datablock_size_min, datablock_size); datablock_size_max = std::max(datablock_size_max, datablock_size); diff -Nru rocksdb-5.15.10/table/block_based_table_reader.h rocksdb-5.17.2/table/block_based_table_reader.h --- rocksdb-5.15.10/table/block_based_table_reader.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_based_table_reader.h 2018-11-12 19:57:32.000000000 +0000 @@ -23,6 +23,7 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" #include "table/block.h" +#include "table/block_based_table_factory.h" #include "table/filter_block.h" #include "table/format.h" #include "table/persistent_cache_helper.h" @@ -50,7 +51,6 @@ struct EnvOptions; struct ReadOptions; class GetContext; -class InternalIterator; using std::unique_ptr; @@ -93,7 +93,9 @@ const SliceTransform* prefix_extractor = nullptr, bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false, int level = -1, - const bool immortal_table = false); + const bool immortal_table = false, + const SequenceNumber largest_seqno = 0, + TailPrefetchStats* tail_prefetch_stats = nullptr); bool PrefixMayMatch(const Slice& internal_key, const ReadOptions& read_options, @@ -175,9 +177,9 @@ // to // a different object then iter and the callee has the ownership of the // returned object. - virtual InternalIterator* NewIterator(IndexBlockIter* iter = nullptr, - bool total_order_seek = true, - bool fill_cache = true) = 0; + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool total_order_seek = true, + bool fill_cache = true) = 0; // The size of the index. 
virtual size_t size() const = 0; @@ -221,14 +223,16 @@ static TBlockIter* NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const Slice& index_value, TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, GetContext* get_context = nullptr, + bool key_includes_seq = true, bool index_key_is_full = true, + GetContext* get_context = nullptr, FilePrefetchBuffer* prefetch_buffer = nullptr); template static TBlockIter* NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, GetContext* get_context = nullptr, - Status s = Status(), FilePrefetchBuffer* prefetch_buffer = nullptr); + bool key_includes_seq = true, bool index_key_is_full = true, + GetContext* get_context = nullptr, Status s = Status(), + FilePrefetchBuffer* prefetch_buffer = nullptr); class PartitionedIndexIteratorState; @@ -281,7 +285,7 @@ // 2. index is not present in block cache. // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier - InternalIterator* NewIndexIterator( + InternalIteratorBase* NewIndexIterator( const ReadOptions& read_options, bool need_upper_bound_check = false, IndexBlockIter* input_iter = nullptr, CachableEntry* index_entry = nullptr, @@ -350,7 +354,8 @@ std::unique_ptr* meta_block, std::unique_ptr* iter); - Status VerifyChecksumInBlocks(InternalIterator* index_iter); + Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); // Create the filter from the filter block. virtual FilterBlockReader* ReadFilter( @@ -387,14 +392,16 @@ PartitionedIndexIteratorState( BlockBasedTable* table, std::unordered_map>* block_map, - const bool index_key_includes_seq); - InternalIterator* NewSecondaryIterator(const Slice& index_value) override; + const bool index_key_includes_seq, const bool index_key_is_full); + InternalIteratorBase* NewSecondaryIterator( + const BlockHandle& index_value) override; private: // Don't own table_ BlockBasedTable* table_; std::unordered_map>* block_map_; bool index_key_includes_seq_; + bool index_key_is_full_; }; // CachableEntry represents the entries that *may* be fetched from block cache. @@ -518,16 +525,17 @@ const bool immortal_table; }; -template -class BlockBasedTableIterator : public InternalIterator { +template +class BlockBasedTableIterator : public InternalIteratorBase { public: BlockBasedTableIterator(BlockBasedTable* table, const ReadOptions& read_options, const InternalKeyComparator& icomp, - InternalIterator* index_iter, bool check_filter, - bool need_upper_bound_check, + InternalIteratorBase* index_iter, + bool check_filter, bool need_upper_bound_check, const SliceTransform* prefix_extractor, bool is_index, bool key_includes_seq = true, + bool index_key_is_full = true, bool for_compaction = false) : table_(table), read_options_(read_options), @@ -540,6 +548,7 @@ prefix_extractor_(prefix_extractor), is_index_(is_index), key_includes_seq_(key_includes_seq), + index_key_is_full_(index_key_is_full), for_compaction_(for_compaction) {} ~BlockBasedTableIterator() { delete index_iter_; } @@ -558,7 +567,7 @@ assert(Valid()); return block_iter_.key(); } - Slice value() const override { + TValue value() const override { assert(Valid()); return block_iter_.value(); } @@ -615,8 +624,7 @@ if (block_iter_points_to_real_block_) { // Reseek. If they end up with the same data block, we shouldn't re-fetch // the same data block. 
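The iterator now keeps prev_index_value_ as a BlockHandle (see the hunks around SavePrevIndexValue and InitDataBlock) and compares block offsets rather than a copied string of the encoded handle. A small sketch of the reseek check this enables, with illustrative names:

    #include <cstdint>

    struct HandleLite {
      uint64_t offset = 0;
      uint64_t size = 0;
    };

    class DataBlockCursor {
     public:
      // Rebuild the data-block iterator only if the index now points at a
      // different block, or the previous attempt missed the cache.
      bool NeedsNewBlock(const HandleLite& current, bool prev_read_incomplete) const {
        return !has_block_ || current.offset != prev_.offset || prev_read_incomplete;
      }
      void OnBlockOpened(const HandleLite& h) {
        prev_ = h;
        has_block_ = true;
      }
     private:
      HandleLite prev_;
      bool has_block_ = false;
    };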
- Slice v = index_iter_->value(); - prev_index_value_.assign(v.data(), v.size()); + prev_index_value_ = index_iter_->value(); } } @@ -628,7 +636,7 @@ BlockBasedTable* table_; const ReadOptions read_options_; const InternalKeyComparator& icomp_; - InternalIterator* index_iter_; + InternalIteratorBase* index_iter_; PinnedIteratorsManager* pinned_iters_mgr_; TBlockIter block_iter_; bool block_iter_points_to_real_block_; @@ -641,10 +649,10 @@ bool is_index_; // If the keys in the blocks over which we iterate include 8 byte sequence bool key_includes_seq_; + bool index_key_is_full_; // If this iterator is created for compaction bool for_compaction_; - // TODO use block offset instead - std::string prev_index_value_; + BlockHandle prev_index_value_; static const size_t kInitReadaheadSize = 8 * 1024; // Found that 256 KB readahead size provides the best performance, based on diff -Nru rocksdb-5.15.10/table/block_builder.cc rocksdb-5.17.2/table/block_builder.cc --- rocksdb-5.15.10/table/block_builder.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_builder.cc 2018-11-12 19:57:32.000000000 +0000 @@ -33,20 +33,36 @@ #include "table/block_builder.h" -#include #include -#include "rocksdb/comparator.h" +#include #include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "table/data_block_footer.h" #include "util/coding.h" namespace rocksdb { -BlockBuilder::BlockBuilder(int block_restart_interval, bool use_delta_encoding) +BlockBuilder::BlockBuilder( + int block_restart_interval, bool use_delta_encoding, + bool use_value_delta_encoding, + BlockBasedTableOptions::DataBlockIndexType index_type, + double data_block_hash_table_util_ratio) : block_restart_interval_(block_restart_interval), use_delta_encoding_(use_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), restarts_(), counter_(0), finished_(false) { + switch (index_type) { + case BlockBasedTableOptions::kDataBlockBinarySearch: + break; + case BlockBasedTableOptions::kDataBlockBinaryAndHash: + data_block_hash_index_builder_.Initialize( + data_block_hash_table_util_ratio); + break; + default: + assert(0); + } assert(block_restart_interval_ >= 1); restarts_.push_back(0); // First restart point is at offset 0 estimate_ = sizeof(uint32_t) + sizeof(uint32_t); @@ -60,19 +76,35 @@ counter_ = 0; finished_ = false; last_key_.clear(); + if (data_block_hash_index_builder_.Valid()) { + data_block_hash_index_builder_.Reset(); + } } size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value) const { size_t estimate = CurrentSizeEstimate(); - estimate += key.size() + value.size(); + // Note: this is an imprecise estimate as it accounts for the whole key size + // instead of non-shared key size. + estimate += key.size(); + // In value delta encoding we estimate the value delta size as half the full + // value size since only the size field of block handle is encoded. + estimate += + !use_value_delta_encoding_ || (counter_ >= block_restart_interval_) + ? value.size() + : value.size() / 2; + if (counter_ >= block_restart_interval_) { estimate += sizeof(uint32_t); // a new restart entry. } estimate += sizeof(int32_t); // varint for shared prefix length. + // Note: this is an imprecise estimate as we will have to encoded size, one + // for shared key and one for non-shared key. estimate += VarintLength(key.size()); // varint for key length. - estimate += VarintLength(value.size()); // varint for value length. 
+ if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) { + estimate += VarintLength(value.size()); // varint for value length. + } return estimate; } @@ -82,14 +114,29 @@ for (size_t i = 0; i < restarts_.size(); i++) { PutFixed32(&buffer_, restarts_[i]); } - PutFixed32(&buffer_, static_cast(restarts_.size())); + + uint32_t num_restarts = static_cast(restarts_.size()); + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch; + if (data_block_hash_index_builder_.Valid() && + CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) { + data_block_hash_index_builder_.Finish(buffer_); + index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } + + // footer is a packed format of data_block_index_type and num_restarts + uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts); + + PutFixed32(&buffer_, block_footer); finished_ = true; return Slice(buffer_); } -void BlockBuilder::Add(const Slice& key, const Slice& value) { +void BlockBuilder::Add(const Slice& key, const Slice& value, + const Slice* const delta_value) { assert(!finished_); assert(counter_ <= block_restart_interval_); + assert(!use_value_delta_encoding_ || delta_value); size_t shared = 0; // number of bytes shared with prev key if (counter_ >= block_restart_interval_) { // Restart compression @@ -115,14 +162,32 @@ const size_t non_shared = key.size() - shared; const size_t curr_size = buffer_.size(); - // Add "" to buffer_ - PutVarint32Varint32Varint32(&buffer_, static_cast(shared), - static_cast(non_shared), - static_cast(value.size())); + if (use_value_delta_encoding_) { + // Add "" to buffer_ + PutVarint32Varint32(&buffer_, static_cast(shared), + static_cast(non_shared)); + } else { + // Add "" to buffer_ + PutVarint32Varint32Varint32(&buffer_, static_cast(shared), + static_cast(non_shared), + static_cast(value.size())); + } // Add string delta to buffer_ followed by value buffer_.append(key.data() + shared, non_shared); - buffer_.append(value.data(), value.size()); + // Use value delta encoding only when the key has shared bytes. This would + // simplify the decoding, where it can figure which decoding to use simply by + // looking at the shared bytes size. + if (shared != 0 && use_value_delta_encoding_) { + buffer_.append(delta_value->data(), delta_value->size()); + } else { + buffer_.append(value.data(), value.size()); + } + + if (data_block_hash_index_builder_.Valid()) { + data_block_hash_index_builder_.Add(ExtractUserKey(key), + restarts_.size() - 1); + } counter_++; estimate_ += buffer_.size() - curr_size; diff -Nru rocksdb-5.15.10/table/block_builder.h rocksdb-5.17.2/table/block_builder.h --- rocksdb-5.15.10/table/block_builder.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_builder.h 2018-11-12 19:57:32.000000000 +0000 @@ -12,6 +12,8 @@ #include #include "rocksdb/slice.h" +#include "rocksdb/table.h" +#include "table/data_block_hash_index.h" namespace rocksdb { @@ -21,14 +23,19 @@ void operator=(const BlockBuilder&) = delete; explicit BlockBuilder(int block_restart_interval, - bool use_delta_encoding = true); + bool use_delta_encoding = true, + bool use_value_delta_encoding = false, + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch, + double data_block_hash_table_util_ratio = 0.75); // Reset the contents as if the BlockBuilder was just constructed. void Reset(); // REQUIRES: Finish() has not been called since the last call to Reset(). 
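For contrast with the Add() changes above, this is what a classic (non-delta-value) data-block entry looks like when appended: the varint triple (shared, non_shared, value_size), followed by the unshared key bytes and the value. PutVarint32 below is a simplified stand-in for the util/coding.h helper:

    #include <cstdint>
    #include <string>

    inline void PutVarint32(std::string* dst, uint32_t v) {
      while (v >= 0x80) {
        dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
        v >>= 7;
      }
      dst->push_back(static_cast<char>(v));
    }

    // Append one entry in the classic format:
    // <shared><non_shared><value_size> <key delta> <value>
    inline void AppendClassicEntry(std::string* buf, uint32_t shared,
                                   const std::string& key_delta,
                                   const std::string& value) {
      PutVarint32(buf, shared);
      PutVarint32(buf, static_cast<uint32_t>(key_delta.size()));
      PutVarint32(buf, static_cast<uint32_t>(value.size()));
      buf->append(key_delta);
      buf->append(value);
    }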
// REQUIRES: key is larger than any previously added key - void Add(const Slice& key, const Slice& value); + void Add(const Slice& key, const Slice& value, + const Slice* const delta_value = nullptr); // Finish building the block and return a slice that refers to the // block contents. The returned slice will remain valid for the @@ -37,7 +44,11 @@ // Returns an estimate of the current (uncompressed) size of the block // we are building. - inline size_t CurrentSizeEstimate() const { return estimate_; } + inline size_t CurrentSizeEstimate() const { + return estimate_ + (data_block_hash_index_builder_.Valid() + ? data_block_hash_index_builder_.EstimateSize() + : 0); + } // Returns an estimated block size after appending key and value. size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const; @@ -49,7 +60,10 @@ private: const int block_restart_interval_; + // TODO(myabandeh): put it into a separate IndexBlockBuilder const bool use_delta_encoding_; + // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values + const bool use_value_delta_encoding_; std::string buffer_; // Destination buffer std::vector restarts_; // Restart points @@ -57,6 +71,7 @@ int counter_; // Number of entries emitted since restart bool finished_; // Has Finish() been called? std::string last_key_; + DataBlockHashIndexBuilder data_block_hash_index_builder_; }; } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/block.cc rocksdb-5.17.2/table/block.cc --- rocksdb-5.15.10/table/block.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block.cc 2018-11-12 19:57:32.000000000 +0000 @@ -20,6 +20,7 @@ #include "port/stack_trace.h" #include "rocksdb/comparator.h" #include "table/block_prefix_index.h" +#include "table/data_block_footer.h" #include "table/format.h" #include "util/coding.h" #include "util/logging.h" @@ -33,28 +34,65 @@ // // If any errors are detected, returns nullptr. Otherwise, returns a // pointer to the key delta (just past the three decoded values). -static inline const char* DecodeEntry(const char* p, const char* limit, - uint32_t* shared, - uint32_t* non_shared, - uint32_t* value_length) { - if (limit - p < 3) return nullptr; - *shared = reinterpret_cast(p)[0]; - *non_shared = reinterpret_cast(p)[1]; - *value_length = reinterpret_cast(p)[2]; - if ((*shared | *non_shared | *value_length) < 128) { - // Fast path: all three values are encoded in one byte each - p += 3; - } else { - if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; - if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; - if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr; - } +struct DecodeEntry { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. 
+ assert(limit - p >= 3); + *shared = reinterpret_cast(p)[0]; + *non_shared = reinterpret_cast(p)[1]; + *value_length = reinterpret_cast(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { + return nullptr; + } + } - if (static_cast(limit - p) < (*non_shared + *value_length)) { - return nullptr; + // Using an assert in place of "return null" since we should not pay the + // cost of checking for corruption on every single key decoding + assert(!(static_cast(limit - p) < (*non_shared + *value_length))); + return p; + } +}; + +struct DecodeKey { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + uint32_t value_length; + return DecodeEntry()(p, limit, shared, non_shared, &value_length); + } +}; + +// In format_version 4, which is used by index blocks, the value size is not +// encoded before the entry, as the value is known to be the handle with the +// known size. +struct DecodeKeyV4 { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + if (limit - p < 3) return nullptr; + *shared = reinterpret_cast(p)[0]; + *non_shared = reinterpret_cast(p)[1]; + if ((*shared | *non_shared) < 128) { + // Fast path: all three values are encoded in one byte each + p += 2; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + } + return p; } - return p; -} +}; void DataBlockIter::Next() { assert(Valid()); @@ -170,7 +208,8 @@ return; } uint32_t index = 0; - bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, comparator_); + bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + comparator_); if (!ok) { return; @@ -185,6 +224,123 @@ } } +// Optimized Seek for point lookup for an internal key `target` +// target = "seek_user_key @ type | seqno". +// +// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// or kTypeBlobIndex, this function behaves identically as Seek(). +// +// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// or kTypeBlobIndex: +// +// If the return value is FALSE, iter location is undefined, and it means: +// 1) there is no key in this block falling into the range: +// ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"], +// inclusive; AND +// 2) the last key of this block has a greater user_key from seek_user_key +// +// If the return value is TRUE, iter location has two possibilies: +// 1) If iter is valid, it is set to a location as if set by BinarySeek. In +// this case, it points to the first key_ with a larger user_key or a +// matching user_key with a seqno no greater than the seeking seqno. +// 2) If the iter is invalid, it means that either all the user_key is less +// than the seek_user_key, or the block ends with a matching user_key but +// with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno +// but larger type). 
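SeekForGetImpl() below consults the block's hash index: each user key hashes to a byte-sized bucket holding either a restart-interval number, kNoEntry, or kCollision, and the point lookup then scans only that restart interval. A toy illustration of the bucket map follows; RocksDB's DataBlockHashIndex uses its own hash function and serialized layout, and the sentinel values here are assumptions matching the kNoEntry/kCollision names used in the patch:

    #include <cstdint>
    #include <functional>
    #include <string>
    #include <vector>

    constexpr uint8_t kNoEntry = 255;    // assumed sentinel values
    constexpr uint8_t kCollision = 254;

    class TinyDataBlockHashIndex {
     public:
      explicit TinyDataBlockHashIndex(size_t num_buckets)
          : buckets_(num_buckets, kNoEntry) {}

      void Add(const std::string& user_key, uint8_t restart_index) {
        uint8_t& slot = buckets_[Bucket(user_key)];
        if (slot == kNoEntry) {
          slot = restart_index;          // first key mapped to this bucket
        } else if (slot != restart_index) {
          slot = kCollision;             // keys from different intervals collide
        }
      }

      // Returns a restart interval to scan, kNoEntry, or kCollision (caller
      // falls back to a regular Seek on collision).
      uint8_t Lookup(const std::string& user_key) const {
        return buckets_[Bucket(user_key)];
      }

     private:
      size_t Bucket(const std::string& k) const {
        return std::hash<std::string>{}(k) % buckets_.size();
      }
      std::vector<uint8_t> buckets_;
    };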
+bool DataBlockIter::SeekForGetImpl(const Slice& target) { + Slice user_key = ExtractUserKey(target); + uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t); + uint8_t entry = data_block_hash_index_->Lookup(data_, map_offset, user_key); + + if (entry == kCollision) { + // HashSeek not effective, falling back + Seek(target); + return true; + } + + if (entry == kNoEntry) { + // Even if we cannot find the user_key in this block, the result may + // exist in the next block. Consider this exmpale: + // + // Block N: [aab@100, ... , app@120] + // bounary key: axy@50 (we make minimal assumption about a boundary key) + // Block N+1: [axy@10, ... ] + // + // If seek_key = axy@60, the search will starts from Block N. + // Even if the user_key is not found in the hash map, the caller still + // have to conntinue searching the next block. + // + // In this case, we pretend the key is the the last restart interval. + // The while-loop below will search the last restart interval for the + // key. It will stop at the first key that is larger than the seek_key, + // or to the end of the block if no one is larger. + entry = static_cast(num_restarts_ - 1); + } + + uint32_t restart_index = entry; + + // check if the key is in the restart_interval + assert(restart_index < num_restarts_); + SeekToRestartPoint(restart_index); + + const char* limit = nullptr; + if (restart_index_ + 1 < num_restarts_) { + limit = data_ + GetRestartPoint(restart_index_ + 1); + } else { + limit = data_ + restarts_; + } + + while (true) { + // Here we only linear seek the target key inside the restart interval. + // If a key does not exist inside a restart interval, we avoid + // further searching the block content accross restart interval boundary. + // + // TODO(fwu): check the left and write boundary of the restart interval + // to avoid linear seek a target key that is out of range. + if (!ParseNextDataKey(limit) || Compare(key_, target) >= 0) { + // we stop at the first potential matching user key. + break; + } + } + + if (current_ == restarts_) { + // Search reaches to the end of the block. There are three possibilites: + // 1) there is only one user_key match in the block (otherwise collsion). + // the matching user_key resides in the last restart interval, and it + // is the last key of the restart interval and of the block as well. + // ParseNextDataKey() skiped it as its [ type | seqno ] is smaller. + // + // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry, + // AND all existing user_keys in the restart interval are smaller than + // seek_user_key. + // + // 3) The seek_key is a false positive and happens to be hashed to the + // last restart interval, AND all existing user_keys in the restart + // interval are smaller than seek_user_key. + // + // The result may exist in the next block each case, so we return true. + return true; + } + + if (user_comparator_->Compare(key_.GetUserKey(), user_key) != 0) { + // the key is not in this block and cannot be at the next block either. + return false; + } + + // Here we are conservative and only support a limited set of cases + ValueType value_type = ExtractValueType(key_.GetKey()); + if (value_type != ValueType::kTypeValue && + value_type != ValueType::kTypeDeletion && + value_type != ValueType::kTypeSingleDeletion && + value_type != ValueType::kTypeBlobIndex) { + Seek(target); + return true; + } + + // Result found, and the iter is correctly set. 
+ return true; +} + void IndexBlockIter::Seek(const Slice& target) { Slice seek_key = target; if (!key_includes_seq_) { @@ -198,8 +354,12 @@ bool ok = false; if (prefix_index_) { ok = PrefixSeek(target, &index); + } else if (value_delta_encoded_) { + ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + comparator_); } else { - ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, active_comparator_); + ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + comparator_); } if (!ok) { @@ -222,7 +382,8 @@ return; } uint32_t index = 0; - bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, comparator_); + bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, + comparator_); if (!ok) { return; @@ -277,7 +438,8 @@ } } -void BlockIter::CorruptionError() { +template +void BlockIter::CorruptionError() { current_ = restarts_; restart_index_ = num_restarts_; status_ = Status::Corruption("bad entry in block"); @@ -285,10 +447,13 @@ value_.clear(); } -bool DataBlockIter::ParseNextDataKey() { +bool DataBlockIter::ParseNextDataKey(const char* limit) { current_ = NextEntryOffset(); const char* p = data_ + current_; - const char* limit = data_ + restarts_; // Restarts come right after data + if (!limit) { + limit = data_ + restarts_; // Restarts come right after data + } + if (p >= limit) { // No more entries to return. Mark as invalid. current_ = restarts_; @@ -298,7 +463,7 @@ // Decode next entry uint32_t shared, non_shared, value_length; - p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); if (p == nullptr || key_.Size() < shared) { CorruptionError(); return false; @@ -340,10 +505,14 @@ } value_ = Slice(p + non_shared, value_length); - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed return true; } } @@ -361,7 +530,12 @@ // Decode next entry uint32_t shared, non_shared, value_length; - p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + if (value_delta_encoded_) { + p = DecodeKeyV4()(p, limit, &shared, &non_shared); + value_length = 0; + } else { + p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); + } if (p == nullptr || key_.Size() < shared) { CorruptionError(); return false; @@ -377,27 +551,71 @@ key_pinned_ = false; } value_ = Slice(p + non_shared, value_length); - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed + if (value_delta_encoded_) { + assert(value_length == 0); + DecodeCurrentValue(shared); } return true; } +// The format: +// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// ... +// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// where, k is key, v is value, and its encoding is in parenthesis. 
+// The format of each key is (shared_size, non_shared_size, shared, non_shared) +// The format of each value, i.e., block hanlde, is (offset, size) whenever the +// shared_size is 0, which included the first entry in each restart point. +// Otherwise the format is delta-size = block handle size - size of last block +// handle. +void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { + assert(value_delta_encoded_); + const char* limit = data_ + restarts_; + if (shared == 0) { + uint64_t o, s; + const char* newp = GetVarint64Ptr(value_.data(), limit, &o); + assert(newp); + newp = GetVarint64Ptr(newp, limit, &s); + assert(newp); + decoded_value_ = BlockHandle(o, s); + value_ = Slice(value_.data(), newp - value_.data()); + } else { + uint64_t next_value_base = + decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize; + int64_t delta; + const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta); + decoded_value_ = + BlockHandle(next_value_base, decoded_value_.size() + delta); + value_ = Slice(value_.data(), newp - value_.data()); + } +} + // Binary search in restart array to find the first restart point that // is either the last restart point with a key less than target, // which means the key of next restart point is larger than target, or // the first restart point with a key = target -bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index, const Comparator* comp) { +template +template +bool BlockIter::BinarySeek(const Slice& target, uint32_t left, + uint32_t right, uint32_t* index, + const Comparator* comp) { assert(left <= right); while (left < right) { uint32_t mid = (left + right + 1) / 2; uint32_t region_offset = GetRestartPoint(mid); - uint32_t shared, non_shared, value_length; - const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_, - &shared, &non_shared, &value_length); + uint32_t shared, non_shared; + const char* key_ptr = DecodeKeyFunc()( + data_ + region_offset, data_ + restarts_, &shared, &non_shared); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return false; @@ -425,9 +643,13 @@ // Return -1 if error. int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { uint32_t region_offset = GetRestartPoint(block_index); - uint32_t shared, non_shared, value_length; - const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_, - &shared, &non_shared, &value_length); + uint32_t shared, non_shared; + const char* key_ptr = + value_delta_encoded_ + ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared) + : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return 1; // Return target is smaller @@ -507,7 +729,43 @@ uint32_t Block::NumRestarts() const { assert(size_ >= 2*sizeof(uint32_t)); - return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // In BlockBuilder, we have ensured a block with HashIndex is less than + // kMaxBlockSizeSupportedByHashIndex (64KiB). + // + // Therefore, if we encounter a block with a size > 64KiB, the block + // cannot have HashIndex. So the footer will directly interpreted as + // num_restarts. + // + // Such check is for backward compatibility. We can ensure legacy block + // with a vary large num_restarts i.e. 
>= 0x80000000 can be interpreted + // correctly as no HashIndex even if the MSB of num_restarts is set. + return num_restarts; + } + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return num_restarts; +} + +BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const { + assert(size_ >= 2 * sizeof(uint32_t)); + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // The check is for the same reason as that in NumRestarts() + return BlockBasedTableOptions::kDataBlockBinarySearch; + } + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return index_type; +} + +Block::~Block() { + // This sync point can be re-enabled if RocksDB can control the + // initialization order of any/all static options created by the user. + // TEST_SYNC_POINT("Block::~Block"); } Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, @@ -518,18 +776,49 @@ restart_offset_(0), num_restarts_(0), global_seqno_(_global_seqno) { + TEST_SYNC_POINT("Block::Block:0"); if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { // Should only decode restart points for uncompressed blocks if (compression_type() == kNoCompression) { num_restarts_ = NumRestarts(); - restart_offset_ = - static_cast(size_) - (1 + num_restarts_) * sizeof(uint32_t); - if (restart_offset_ > size_ - sizeof(uint32_t)) { - // The size is too small for NumRestarts() and therefore - // restart_offset_ wrapped around. - size_ = 0; + switch (IndexType()) { + case BlockBasedTableOptions::kDataBlockBinarySearch: + restart_offset_ = static_cast(size_) - + (1 + num_restarts_) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. + size_ = 0; + } + break; + case BlockBasedTableOptions::kDataBlockBinaryAndHash: + if (size_ < sizeof(uint32_t) /* block footer */ + + sizeof(uint16_t) /* NUM_BUCK */) { + size_ = 0; + break; + } + + uint16_t map_offset; + data_block_hash_index_.Initialize( + contents.data.data(), + static_cast(contents.data.size() - + sizeof(uint32_t)), /*chop off + NUM_RESTARTS*/ + &map_offset); + + restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); + + if (restart_offset_ > map_offset) { + // map_offset is too small for NumRestarts() and + // therefore restart_offset_ wrapped around. + size_ = 0; + break; + } + break; + default: + size_ = 0; // Error marker } } } @@ -544,6 +833,7 @@ DataBlockIter* iter, Statistics* stats, bool /*total_order_seek*/, bool /*key_includes_seq*/, + bool /*value_is_full*/, BlockPrefixIndex* /*prefix_index*/) { DataBlockIter* ret_iter; if (iter != nullptr) { @@ -560,8 +850,10 @@ ret_iter->Invalidate(Status::OK()); return ret_iter; } else { - ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, - global_seqno_, read_amp_bitmap_.get(), cachable()); + ret_iter->Initialize( + cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_, + read_amp_bitmap_.get(), cachable(), + data_block_hash_index_.Valid() ? 
&data_block_hash_index_ : nullptr); if (read_amp_bitmap_) { if (read_amp_bitmap_->GetStatistics() != stats) { // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ @@ -577,7 +869,7 @@ IndexBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, - bool key_includes_seq, + bool key_includes_seq, bool value_is_full, BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { @@ -597,7 +889,8 @@ BlockPrefixIndex* prefix_index_ptr = total_order_seek ? nullptr : prefix_index; ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, - prefix_index_ptr, key_includes_seq, cachable()); + prefix_index_ptr, key_includes_seq, value_is_full, + cachable(), nullptr /* data_block_hash_index */); } return ret_iter; diff -Nru rocksdb-5.15.10/table/block_fetcher.cc rocksdb-5.17.2/table/block_fetcher.cc --- rocksdb-5.15.10/table/block_fetcher.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_fetcher.cc 2018-11-12 19:57:32.000000000 +0000 @@ -169,6 +169,7 @@ // page can be either uncompressed or compressed, the buffer either stack // or heap provided. Refer to https://github.com/facebook/rocksdb/pull/4096 if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) { + assert(used_buf_ != heap_buf_.get()); heap_buf_.reset(new char[block_size_ + kBlockTrailerSize]); memcpy(heap_buf_.get(), used_buf_, block_size_ + kBlockTrailerSize); } diff -Nru rocksdb-5.15.10/table/block.h rocksdb-5.17.2/table/block.h --- rocksdb-5.15.10/table/block.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block.h 2018-11-12 19:57:32.000000000 +0000 @@ -22,19 +22,22 @@ #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" +#include "format.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" +#include "rocksdb/table.h" #include "table/block_prefix_index.h" +#include "table/data_block_hash_index.h" #include "table/internal_iterator.h" #include "util/random.h" #include "util/sync_point.h" -#include "format.h" namespace rocksdb { struct BlockContents; class Comparator; +template class BlockIter; class DataBlockIter; class IndexBlockIter; @@ -146,7 +149,7 @@ size_t read_amp_bytes_per_bit = 0, Statistics* statistics = nullptr); - ~Block() = default; + ~Block(); size_t size() const { return size_; } const char* data() const { return data_; } @@ -154,6 +157,7 @@ // The additional memory space taken by the block data. size_t usable_size() const { return contents_.usable_size(); } uint32_t NumRestarts() const; + BlockBasedTableOptions::DataBlockIndexType IndexType() const; CompressionType compression_type() const { return contents_.compression_type; } @@ -164,6 +168,11 @@ // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* // + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default ture, means that no delta encoding is + // applied to values. + // // NewIterator // Same as above but also updates read_amp_bitmap_ if it is not nullptr. // @@ -175,13 +184,11 @@ // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. 
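As a quick illustration of the widened iterator API documented above, the sketch below walks an index block whose values were written with delta encoding. `DumpIndexBlock`, `reader` and `cmp` are placeholder names; NewIterator, IndexBlockIter and BlockHandle are the types this patch introduces or extends, and the same calling pattern appears in the ValueDeltaEncodingTest added to block_test.cc further down in this patch.

#include "table/block.h"

// Sketch: iterate an index block whose values are delta-encoded BlockHandles.
// `reader` is assumed to be a Block built from such an index block.
void DumpIndexBlock(rocksdb::Block& reader, const rocksdb::Comparator* cmp) {
  const bool kTotalOrderSeek = true;
  const bool kKeyIncludesSeq = true;   // keys are internal keys
  const bool kValueIsFull = false;     // values are delta-encoded, not full handles
  rocksdb::IndexBlockIter* iter = reader.NewIterator<rocksdb::IndexBlockIter>(
      cmp, cmp, nullptr /*iter*/, nullptr /*stats*/, kTotalOrderSeek,
      kKeyIncludesSeq, kValueIsFull);
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    rocksdb::BlockHandle handle = iter->value();  // decoded handle, not a raw Slice
    (void)handle;
  }
  delete iter;
}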
template - TBlockIter* NewIterator(const Comparator* comparator, - const Comparator* user_comparator, - TBlockIter* iter = nullptr, - Statistics* stats = nullptr, - bool total_order_seek = true, - bool key_includes_seq = true, - BlockPrefixIndex* prefix_index = nullptr); + TBlockIter* NewIterator( + const Comparator* comparator, const Comparator* user_comparator, + TBlockIter* iter = nullptr, Statistics* stats = nullptr, + bool total_order_seek = true, bool key_includes_seq = true, + bool value_is_full = true, BlockPrefixIndex* prefix_index = nullptr); // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; @@ -199,12 +206,15 @@ // the encoded value (kDisableGlobalSequenceNumber means disabled) const SequenceNumber global_seqno_; + DataBlockHashIndex data_block_hash_index_; + // No copying allowed Block(const Block&) = delete; void operator=(const Block&) = delete; }; -class BlockIter : public InternalIterator { +template +class BlockIter : public InternalIteratorBase { public: void InitializeBase(const Comparator* comparator, const char* data, uint32_t restarts, uint32_t num_restarts, @@ -243,10 +253,6 @@ assert(Valid()); return key_.GetKey(); } - virtual Slice value() const override { - assert(Valid()); - return value_; - } #ifndef NDEBUG virtual ~BlockIter() { @@ -280,7 +286,8 @@ const char* data_; // underlying block contents uint32_t num_restarts_; // Number of uint32_t entries in restart array - uint32_t restart_index_; // Index of restart block in which current_ falls + // Index of restart block in which current_ or current_-1 falls + uint32_t restart_index_; uint32_t restarts_; // Offset of restart array (list of fixed32) // current_ is offset in data_ of current entry. >= restarts_ if !Valid uint32_t current_; @@ -316,33 +323,39 @@ void CorruptionError(); - bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index, const Comparator* comp); + template + inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, + uint32_t* index, const Comparator* comp); }; -class DataBlockIter final : public BlockIter { +class DataBlockIter final : public BlockIter { public: DataBlockIter() : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} DataBlockIter(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, - BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned) + BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) : DataBlockIter() { Initialize(comparator, user_comparator, data, restarts, num_restarts, - global_seqno, read_amp_bitmap, block_contents_pinned); + global_seqno, read_amp_bitmap, block_contents_pinned, + data_block_hash_index); } void Initialize(const Comparator* comparator, - const Comparator* /*user_comparator*/, const char* data, + const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap, - bool block_contents_pinned) { + bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) { InitializeBase(comparator, data, restarts, num_restarts, global_seqno, block_contents_pinned); + user_comparator_ = user_comparator; key_.SetIsUserKey(false); read_amp_bitmap_ = read_amp_bitmap; last_bitmap_offset_ = current_ + 1; + data_block_hash_index_ = data_block_hash_index; } virtual Slice value() const override { 
@@ -358,6 +371,15 @@ virtual void Seek(const Slice& target) override; + inline bool SeekForGet(const Slice& target) { + if (!data_block_hash_index_) { + Seek(target); + return true; + } + + return SeekForGetImpl(target); + } + virtual void SeekForPrev(const Slice& target) override; virtual void Prev() override; @@ -405,14 +427,19 @@ std::vector prev_entries_; int32_t prev_entries_idx_ = -1; - bool ParseNextDataKey(); + DataBlockHashIndex* data_block_hash_index_; + const Comparator* user_comparator_; + + inline bool ParseNextDataKey(const char* limit = nullptr); inline int Compare(const IterKey& ikey, const Slice& b) const { return comparator_->Compare(ikey.GetInternalKey(), b); } + + bool SeekForGetImpl(const Slice& target); }; -class IndexBlockIter final : public BlockIter { +class IndexBlockIter final : public BlockIter { public: IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} @@ -420,27 +447,47 @@ assert(Valid()); return key_.GetKey(); } + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default ture, means that no delta encoding is + // applied to values. IndexBlockIter(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool block_contents_pinned) + bool value_is_full, bool block_contents_pinned) : IndexBlockIter() { Initialize(comparator, user_comparator, data, restarts, num_restarts, - prefix_index, key_includes_seq, block_contents_pinned); + prefix_index, key_includes_seq, block_contents_pinned, + value_is_full, nullptr /* data_block_hash_index */); } void Initialize(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool block_contents_pinned) { - InitializeBase(comparator, data, restarts, num_restarts, - kDisableGlobalSequenceNumber, block_contents_pinned); + bool value_is_full, bool block_contents_pinned, + DataBlockHashIndex* /*data_block_hash_index*/) { + InitializeBase(key_includes_seq ? comparator : user_comparator, data, + restarts, num_restarts, kDisableGlobalSequenceNumber, + block_contents_pinned); key_includes_seq_ = key_includes_seq; - active_comparator_ = key_includes_seq_ ? comparator_ : user_comparator; key_.SetIsUserKey(!key_includes_seq_); prefix_index_ = prefix_index; + value_delta_encoded_ = !value_is_full; + } + + virtual BlockHandle value() const override { + assert(Valid()); + if (value_delta_encoded_) { + return decoded_value_; + } else { + BlockHandle handle; + Slice v = value_; + Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v); + assert(decode_s.ok()); + return handle; + } } virtual void Seek(const Slice& target) override; @@ -467,27 +514,37 @@ void Invalidate(Status s) { InvalidateBase(s); } private: + // Key is in InternalKey format + bool key_includes_seq_; + bool value_delta_encoded_; + BlockPrefixIndex* prefix_index_; + // Whether the value is delta encoded. In that case the value is assumed to be + // BlockHandle. The first value in each restart interval is the full encoded + // BlockHandle; the restart of encoded size part of the BlockHandle. The + // offset of delta encoded BlockHandles is computed by adding the size of + // previous delta encoded values in the same restart interval to the offset of + // the first value in that restart interval. 
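The sketch below restates the reconstruction rule from the comment above using plain integers instead of the varint-encoded bytes that DecodeCurrentValue actually parses; `Handle`, `kTrailerSize` and `ReconstructInterval` are illustrative stand-ins for BlockHandle, kBlockTrailerSize and the decoding loop.

#include <cstdint>
#include <vector>

struct Handle { uint64_t offset; uint64_t size; };
constexpr uint64_t kTrailerSize = 5;  // stand-in for kBlockTrailerSize (type byte + crc32)

// The first entry of a restart interval carries (offset, size) in full; every
// later entry carries only a signed size delta. The next offset follows from
// the previous handle because data blocks are laid out back to back on disk.
std::vector<Handle> ReconstructInterval(Handle first,
                                        const std::vector<int64_t>& size_deltas) {
  std::vector<Handle> handles{first};
  for (int64_t delta : size_deltas) {
    Handle prev = handles.back();
    Handle next;
    next.offset = prev.offset + prev.size + kTrailerSize;
    next.size = static_cast<uint64_t>(static_cast<int64_t>(prev.size) + delta);
    handles.push_back(next);
  }
  return handles;
}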
+ BlockHandle decoded_value_; + bool PrefixSeek(const Slice& target, uint32_t* index); bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, uint32_t left, uint32_t right, uint32_t* index); - int CompareBlockKey(uint32_t block_index, const Slice& target); + inline int CompareBlockKey(uint32_t block_index, const Slice& target); inline int Compare(const Slice& a, const Slice& b) const { - return active_comparator_->Compare(a, b); + return comparator_->Compare(a, b); } inline int Compare(const IterKey& ikey, const Slice& b) const { - return active_comparator_->Compare(ikey.GetKey(), b); + return comparator_->Compare(ikey.GetKey(), b); } - bool ParseNextIndexKey(); + inline bool ParseNextIndexKey(); - // Key is in InternalKey format - bool key_includes_seq_; - // key_includes_seq_ ? comparator_ : user_comparator_ - const Comparator* active_comparator_; - BlockPrefixIndex* prefix_index_; + // When value_delta_encoded_ is enabled it decodes the value which is assumed + // to be BlockHandle and put it to decoded_value_ + inline void DecodeCurrentValue(uint32_t shared); }; } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/block_test.cc rocksdb-5.17.2/table/block_test.cc --- rocksdb-5.15.10/table/block_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/block_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -68,6 +68,29 @@ } } +// Same as GenerateRandomKVs but the values are BlockHandle +void GenerateRandomKBHs(std::vector *keys, + std::vector *values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + uint64_t offset = 0; + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generate keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + uint64_t size = rnd.Uniform(1024 * 16); + BlockHandle handle(offset, size); + offset += size + kBlockTrailerSize; + values->emplace_back(handle); + } + } +} + class BlockTest : public testing::Test {}; // block test @@ -131,6 +154,84 @@ delete iter; } +TEST_F(BlockTest, ValueDeltaEncodingTest) { + Random rnd(301); + Options options = Options(); + std::unique_ptr ic; + ic.reset(new test::PlainInternalKeyComparator(options.comparator)); + + std::vector keys; + std::vector values; + const bool kUseDeltaEncoding = true; + const bool kUseValueDeltaEncoding = true; + BlockBuilder builder(16, kUseDeltaEncoding, kUseValueDeltaEncoding); + int num_records = 100; + + GenerateRandomKBHs(&keys, &values, 0, num_records); + // add a bunch of records to a block + BlockHandle last_encoded_handle; + for (int i = 0; i < num_records; i++) { + auto block_handle = values[i]; + std::string handle_encoding; + block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64(&handle_delta_encoding, + block_handle.size() - last_encoded_handle.size()); + last_encoded_handle = block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + builder.Add(keys[i], handle_encoding, &handle_delta_encoding_slice); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const bool kTotalOrderSeek = true; + const bool kIncludesSeq = true; + const bool kValueIsFull = !kUseValueDeltaEncoding; + 
IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + int count = 0; + InternalIteratorBase *iter = reader.NewIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, kIncludesSeq, kValueIsFull); + for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { + // read kv from block + Slice k = iter->key(); + BlockHandle handle = iter->value(); + + // compare with lookaside array + ASSERT_EQ(k.ToString().compare(keys[count]), 0); + + ASSERT_EQ(values[count].offset(), handle.offset()); + ASSERT_EQ(values[count].size(), handle.size()); + } + delete iter; + + // read block contents randomly + iter = reader.NewIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + BlockHandle handle = iter->value(); + ASSERT_EQ(values[index].offset(), handle.offset()); + ASSERT_EQ(values[index].size(), handle.size()); + } + delete iter; +} // return the block contents BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, diff -Nru rocksdb-5.15.10/table/cuckoo_table_builder.cc rocksdb-5.17.2/table/cuckoo_table_builder.cc --- rocksdb-5.15.10/table/cuckoo_table_builder.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/cuckoo_table_builder.cc 2018-11-12 19:57:32.000000000 +0000 @@ -164,9 +164,9 @@ Slice CuckooTableBuilder::GetKey(uint64_t idx) const { assert(closed_); if (IsDeletedKey(idx)) { - return Slice(&deleted_keys_[(idx - num_values_) * key_size_], key_size_); + return Slice(&deleted_keys_[static_cast((idx - num_values_) * key_size_)], static_cast(key_size_)); } - return Slice(&kvs_[idx * (key_size_ + value_size_)], key_size_); + return Slice(&kvs_[static_cast(idx * (key_size_ + value_size_))], static_cast(key_size_)); } Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const { @@ -177,14 +177,14 @@ Slice CuckooTableBuilder::GetValue(uint64_t idx) const { assert(closed_); if (IsDeletedKey(idx)) { - static std::string empty_value(value_size_, 'a'); + static std::string empty_value(static_cast(value_size_), 'a'); return Slice(empty_value); } - return Slice(&kvs_[idx * (key_size_ + value_size_) + key_size_], value_size_); + return Slice(&kvs_[static_cast(idx * (key_size_ + value_size_) + key_size_)], static_cast(value_size_)); } Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { - buckets->resize(hash_table_size_ + cuckoo_block_size_ - 1); + buckets->resize(static_cast(hash_table_size_ + cuckoo_block_size_ - 1)); uint32_t make_space_for_key_call_id = 0; for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) { uint64_t bucket_id = 0; @@ -200,13 +200,13 @@ // stop searching and proceed for next hash function. 
for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++hash_val) { - if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { + if ((*buckets)[static_cast(hash_val)].vector_idx == kMaxVectorIdx) { bucket_id = hash_val; bucket_found = true; break; } else { if (ucomp_->Compare(user_key, - GetUserKey((*buckets)[hash_val].vector_idx)) == 0) { + GetUserKey((*buckets)[static_cast(hash_val)].vector_idx)) == 0) { return Status::NotSupported("Same key is being inserted again."); } hash_vals.push_back(hash_val); @@ -226,7 +226,7 @@ ++num_hash_func_; for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++hash_val) { - if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { + if ((*buckets)[static_cast(hash_val)].vector_idx == kMaxVectorIdx) { bucket_found = true; bucket_id = hash_val; break; @@ -235,7 +235,7 @@ } } } - (*buckets)[bucket_id].vector_idx = vector_idx; + (*buckets)[static_cast(bucket_id)].vector_idx = vector_idx; } return Status::OK(); } @@ -295,7 +295,7 @@ reinterpret_cast(&value_size_), sizeof(value_size_)); uint64_t bucket_size = key_size_ + value_size_; - unused_bucket.resize(bucket_size, 'a'); + unused_bucket.resize(static_cast(bucket_size), 'a'); // Write the table. uint32_t num_added = 0; for (auto& bucket : buckets) { @@ -320,7 +320,7 @@ uint64_t offset = buckets.size() * bucket_size; properties_.data_size = offset; - unused_bucket.resize(properties_.fixed_key_len); + unused_bucket.resize(static_cast(properties_.fixed_key_len)); properties_.user_collected_properties[ CuckooTablePropertyNames::kEmptyKey] = unused_bucket; properties_.user_collected_properties[ @@ -456,7 +456,7 @@ // no. of times this will be called is <= max_num_hash_func_ + num_entries_. for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { uint64_t bid = hash_vals[hash_cnt]; - (*buckets)[bid].make_space_for_key_call_id = make_space_for_key_call_id; + (*buckets)[static_cast(bid)].make_space_for_key_call_id = make_space_for_key_call_id; tree.push_back(CuckooNode(bid, 0, 0)); } bool null_found = false; @@ -467,7 +467,7 @@ if (curr_depth >= max_search_depth_) { break; } - CuckooBucket& curr_bucket = (*buckets)[curr_node.bucket_id]; + CuckooBucket& curr_bucket = (*buckets)[static_cast(curr_node.bucket_id)]; for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) { uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx), @@ -476,15 +476,15 @@ // Iterate inside Cuckoo Block. 
for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++child_bucket_id) { - if ((*buckets)[child_bucket_id].make_space_for_key_call_id == + if ((*buckets)[static_cast(child_bucket_id)].make_space_for_key_call_id == make_space_for_key_call_id) { continue; } - (*buckets)[child_bucket_id].make_space_for_key_call_id = + (*buckets)[static_cast(child_bucket_id)].make_space_for_key_call_id = make_space_for_key_call_id; tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, curr_pos)); - if ((*buckets)[child_bucket_id].vector_idx == kMaxVectorIdx) { + if ((*buckets)[static_cast(child_bucket_id)].vector_idx == kMaxVectorIdx) { null_found = true; break; } @@ -502,8 +502,8 @@ uint32_t bucket_to_replace_pos = static_cast(tree.size()) - 1; while (bucket_to_replace_pos >= num_hash_func_) { CuckooNode& curr_node = tree[bucket_to_replace_pos]; - (*buckets)[curr_node.bucket_id] = - (*buckets)[tree[curr_node.parent_pos].bucket_id]; + (*buckets)[static_cast(curr_node.bucket_id)] = + (*buckets)[static_cast(tree[curr_node.parent_pos].bucket_id)]; bucket_to_replace_pos = curr_node.parent_pos; } *bucket_id = tree[bucket_to_replace_pos].bucket_id; diff -Nru rocksdb-5.15.10/table/cuckoo_table_builder_test.cc rocksdb-5.17.2/table/cuckoo_table_builder_test.cc --- rocksdb-5.15.10/table/cuckoo_table_builder_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/cuckoo_table_builder_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -156,7 +156,7 @@ fname = test::PerThreadDBPath("EmptyFile"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -192,7 +192,7 @@ fname = test::PerThreadDBPath("NoCollisionFullKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -240,7 +240,7 @@ fname = test::PerThreadDBPath("WithCollisionFullKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -289,7 +289,7 @@ fname = test::PerThreadDBPath("WithCollisionFullKey2"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder( file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, false, false, GetSliceHash, @@ -342,7 +342,7 @@ fname = test::PerThreadDBPath("WithCollisionPathFullKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), 
EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -392,7 +392,7 @@ fname = test::PerThreadDBPath("WithCollisionPathFullKeyAndCuckooBlock"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash, 0 /* column_family_id */, @@ -435,7 +435,7 @@ fname = test::PerThreadDBPath("NoCollisionUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -479,7 +479,7 @@ fname = test::PerThreadDBPath("WithCollisionUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -525,7 +525,7 @@ fname = test::PerThreadDBPath("WithCollisionPathUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -570,7 +570,7 @@ fname = test::PerThreadDBPath("WithCollisionPathUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -598,7 +598,7 @@ fname = test::PerThreadDBPath("FailWhenSameKeyInserted"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, diff -Nru rocksdb-5.15.10/table/cuckoo_table_reader.cc rocksdb-5.17.2/table/cuckoo_table_reader.cc --- rocksdb-5.15.10/table/cuckoo_table_reader.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/cuckoo_table_reader.cc 2018-11-12 19:57:32.000000000 +0000 @@ -136,7 +136,7 @@ cuckoo_block_size_ = *reinterpret_cast( cuckoo_block_size->second.data()); cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; - status_ = 
file_->Read(0, file_size, &file_data_, nullptr); + status_ = file_->Read(0, static_cast(file_size), &file_data_, nullptr); } Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, @@ -268,7 +268,7 @@ if (initialized_) { return; } - sorted_bucket_ids_.reserve(reader_->GetTableProperties()->num_entries); + sorted_bucket_ids_.reserve(static_cast(reader_->GetTableProperties()->num_entries)); uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; assert(num_buckets < kInvalidIndex); const char* bucket = reader_->file_data_.data(); @@ -374,15 +374,12 @@ return curr_value_; } -extern InternalIterator* NewErrorInternalIterator(const Status& status, - Arena* arena); - InternalIterator* CuckooTableReader::NewIterator( const ReadOptions& /*read_options*/, const SliceTransform* /* prefix_extractor */, Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/) { if (!status().ok()) { - return NewErrorInternalIterator( + return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); } CuckooTableIterator* iter; diff -Nru rocksdb-5.15.10/table/cuckoo_table_reader.h rocksdb-5.17.2/table/cuckoo_table_reader.h --- rocksdb-5.15.10/table/cuckoo_table_reader.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/cuckoo_table_reader.h 2018-11-12 19:57:32.000000000 +0000 @@ -25,7 +25,6 @@ class Arena; class TableReader; -class InternalIterator; class CuckooTableReader: public TableReader { public: diff -Nru rocksdb-5.15.10/table/cuckoo_table_reader_test.cc rocksdb-5.17.2/table/cuckoo_table_reader_test.cc --- rocksdb-5.15.10/table/cuckoo_table_reader_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/cuckoo_table_reader_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -96,7 +96,7 @@ std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), env_options)); + new WritableFileWriter(std::move(writable_file), fname, env_options)); CuckooTableBuilder builder( file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, false, @@ -412,7 +412,7 @@ std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), env_options)); + new WritableFileWriter(std::move(writable_file), fname, env_options)); CuckooTableBuilder builder( file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5, false, FLAGS_identity_as_first_hash, nullptr, 0 /* column_family_id */, diff -Nru rocksdb-5.15.10/table/data_block_footer.cc rocksdb-5.17.2/table/data_block_footer.cc --- rocksdb-5.15.10/table/data_block_footer.cc 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/table/data_block_footer.cc 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "data_block_footer.h" + +#include "rocksdb/table.h" + +namespace rocksdb { + +const int kDataBlockIndexTypeBitShift = 31; + +// 0x7FFFFFFF +const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u; + +// 0x7FFFFFFF +const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u; + +uint32_t PackIndexTypeAndNumRestarts( + BlockBasedTableOptions::DataBlockIndexType index_type, + uint32_t num_restarts) { + if (num_restarts > kMaxNumRestarts) { + assert(0); // mute travis "unused" warning + } + + uint32_t block_footer = num_restarts; + if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) { + block_footer |= 1u << kDataBlockIndexTypeBitShift; + } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) { + assert(0); + } + + return block_footer; +} + +void UnPackIndexTypeAndNumRestarts( + uint32_t block_footer, + BlockBasedTableOptions::DataBlockIndexType* index_type, + uint32_t* num_restarts) { + if (index_type) { + if (block_footer & 1u << kDataBlockIndexTypeBitShift) { + *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } else { + *index_type = BlockBasedTableOptions::kDataBlockBinarySearch; + } + } + + if (num_restarts) { + *num_restarts = block_footer & kNumRestartsMask; + assert(*num_restarts <= kMaxNumRestarts); + } +} + +} // namespace rocksdb diff -Nru rocksdb-5.15.10/table/data_block_footer.h rocksdb-5.17.2/table/data_block_footer.h --- rocksdb-5.15.10/table/data_block_footer.h 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/table/data_block_footer.h 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/table.h" + +namespace rocksdb { + +uint32_t PackIndexTypeAndNumRestarts( + BlockBasedTableOptions::DataBlockIndexType index_type, + uint32_t num_restarts); + +void UnPackIndexTypeAndNumRestarts( + uint32_t block_footer, + BlockBasedTableOptions::DataBlockIndexType* index_type, + uint32_t* num_restarts); + +} // namespace rocksdb diff -Nru rocksdb-5.15.10/table/data_block_hash_index.cc rocksdb-5.17.2/table/data_block_hash_index.cc --- rocksdb-5.15.10/table/data_block_hash_index.cc 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/table/data_block_hash_index.cc 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,93 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
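A short round-trip over the footer helpers defined above; `FooterRoundTrip` and the value 42 are only illustrative, while the two functions and the bit layout (MSB = index-type flag, low 31 bits = num_restarts) come from the data_block_footer.{h,cc} files added by this patch.

#include <cstdint>
#include "rocksdb/table.h"
#include "table/data_block_footer.h"

void FooterRoundTrip() {
  using rocksdb::BlockBasedTableOptions;
  // Pack: set the MSB for kDataBlockBinaryAndHash, keep num_restarts in the low bits.
  uint32_t footer = rocksdb::PackIndexTypeAndNumRestarts(
      BlockBasedTableOptions::kDataBlockBinaryAndHash, 42);
  // footer == (1u << 31) | 42

  BlockBasedTableOptions::DataBlockIndexType index_type;
  uint32_t num_restarts;
  rocksdb::UnPackIndexTypeAndNumRestarts(footer, &index_type, &num_restarts);
  // index_type == kDataBlockBinaryAndHash, num_restarts == 42
}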
+#include +#include + +#include "rocksdb/slice.h" +#include "table/data_block_hash_index.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace rocksdb { + +void DataBlockHashIndexBuilder::Add(const Slice& key, + const size_t restart_index) { + assert(Valid()); + if (restart_index > kMaxRestartSupportedByHashIndex) { + valid_ = false; + return; + } + + uint32_t hash_value = GetSliceHash(key); + hash_and_restart_pairs_.emplace_back(hash_value, + static_cast(restart_index)); + estimated_num_buckets_ += bucket_per_key_; +} + +void DataBlockHashIndexBuilder::Finish(std::string& buffer) { + assert(Valid()); + uint16_t num_buckets = static_cast(estimated_num_buckets_); + + if (num_buckets == 0) { + num_buckets = 1; // sanity check + } + + // The build-in hash cannot well distribute strings when into different + // buckets when num_buckets is power of two, resulting in high hash + // collision. + // We made the num_buckets to be odd to avoid this issue. + num_buckets |= 1; + + std::vector buckets(num_buckets, kNoEntry); + // write the restart_index array + for (auto& entry : hash_and_restart_pairs_) { + uint32_t hash_value = entry.first; + uint8_t restart_index = entry.second; + uint16_t buck_idx = static_cast(hash_value % num_buckets); + if (buckets[buck_idx] == kNoEntry) { + buckets[buck_idx] = restart_index; + } else if (buckets[buck_idx] != restart_index) { + // same bucket cannot store two different restart_index, mark collision + buckets[buck_idx] = kCollision; + } + } + + for (uint8_t restart_index : buckets) { + buffer.append( + const_cast(reinterpret_cast(&restart_index)), + sizeof(restart_index)); + } + + // write NUM_BUCK + PutFixed16(&buffer, num_buckets); + + assert(buffer.size() <= kMaxBlockSizeSupportedByHashIndex); +} + +void DataBlockHashIndexBuilder::Reset() { + estimated_num_buckets_ = 0; + valid_ = true; + hash_and_restart_pairs_.clear(); +} + +void DataBlockHashIndex::Initialize(const char* data, uint16_t size, + uint16_t* map_offset) { + assert(size >= sizeof(uint16_t)); // NUM_BUCKETS + num_buckets_ = DecodeFixed16(data + size - sizeof(uint16_t)); + assert(num_buckets_ > 0); + assert(size > num_buckets_ * sizeof(uint8_t)); + *map_offset = static_cast(size - sizeof(uint16_t) - + num_buckets_ * sizeof(uint8_t)); +} + +uint8_t DataBlockHashIndex::Lookup(const char* data, uint32_t map_offset, + const Slice& key) const { + uint32_t hash_value = GetSliceHash(key); + uint16_t idx = static_cast(hash_value % num_buckets_); + const char* bucket_table = data + map_offset; + return static_cast(*(bucket_table + idx * sizeof(uint8_t))); +} + +} // namespace rocksdb diff -Nru rocksdb-5.15.10/table/data_block_hash_index.h rocksdb-5.17.2/table/data_block_hash_index.h --- rocksdb-5.15.10/table/data_block_hash_index.h 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/table/data_block_hash_index.h 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,136 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/slice.h" + +namespace rocksdb { +// This is an experimental feature aiming to reduce the CPU utilization of +// point-lookup within a data-block. It is only used in data blocks, and not +// in meta-data blocks or per-table index blocks. +// +// It only used to support BlockBasedTable::Get(). 
+// +// A serialized hash index is appended to the data-block. The new block data +// format is as follows: +// +// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER] +// +// RI: Restart Interval (the same as the default data-block format) +// RI_IDX: Restart Interval index (the same as the default data-block format) +// HASH_IDX: The new data-block hash index feature. +// FOOTER: A 32bit block footer, which is the NUM_RESTARTS with the MSB as +// the flag indicating if this hash index is in use. Note that +// given a data block < 32KB, the MSB is never used. So we can +// borrow the MSB as the hash index flag. Therefore, this format is +// compatible with the legacy data-blocks with num_restarts < 32768, +// as the MSB is 0. +// +// The format of the data-block hash index is as follows: +// +// HASH_IDX: [B B B ... B NUM_BUCK] +// +// B: bucket, an array of restart index. Each buckets is uint8_t. +// NUM_BUCK: Number of buckets, which is the length of the bucket array. +// +// We reserve two special flag: +// kNoEntry=255, +// kCollision=254. +// +// Therefore, the max number of restarts this hash index can supoport is 253. +// +// Buckets are initialized to be kNoEntry. +// +// When storing a key in the hash index, the key is first hashed to a bucket. +// If there the bucket is empty (kNoEntry), the restart index is stored in +// the bucket. If there is already a restart index there, we will update the +// existing restart index to a collision marker (kCollision). If the +// the bucket is already marked as collision, we do not store the restart +// index either. +// +// During query process, a key is first hashed to a bucket. Then we examine if +// the buckets store nothing (kNoEntry) or the bucket had a collision +// (kCollision). If either of those happens, we get the restart index of +// the key and will directly go to the restart interval to search the key. +// +// Note that we only support blocks with #restart_interval < 254. If a block +// has more restart interval than that, hash index will not be create for it. + +const uint8_t kNoEntry = 255; +const uint8_t kCollision = 254; +const uint8_t kMaxRestartSupportedByHashIndex = 253; + +// Because we use uint16_t address, we only support block no more than 64KB +const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16; +const double kDefaultUtilRatio = 0.75; + +class DataBlockHashIndexBuilder { + public: + DataBlockHashIndexBuilder() + : bucket_per_key_(-1 /*uninitialized marker*/), + estimated_num_buckets_(0), + valid_(false) {} + + void Initialize(double util_ratio) { + if (util_ratio <= 0) { + util_ratio = kDefaultUtilRatio; // sanity check + } + bucket_per_key_ = 1 / util_ratio; + valid_ = true; + } + + inline bool Valid() const { return valid_ && bucket_per_key_ > 0; } + void Add(const Slice& key, const size_t restart_index); + void Finish(std::string& buffer); + void Reset(); + inline size_t EstimateSize() const { + uint16_t estimated_num_buckets = + static_cast(estimated_num_buckets_); + + // Maching the num_buckets number in DataBlockHashIndexBuilder::Finish. + estimated_num_buckets |= 1; + + return sizeof(uint16_t) + + static_cast(estimated_num_buckets * sizeof(uint8_t)); + } + + private: + double bucket_per_key_; // is the multiplicative inverse of util_ratio_ + double estimated_num_buckets_; + + // Now the only usage for `valid_` is to mark false when the inserted + // restart_index is larger than supported. In this case HashIndex is not + // appended to the block content. 
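For orientation, the helper below spells out how a point lookup can interpret the bucket value returned by DataBlockHashIndex::Lookup under the kNoEntry/kCollision convention described above. `HashLookupOutcome` and `Interpret` are standalone illustrations rather than part of the patch; per the comment above and the SearchForOffset test helper later in this diff, a collision means the lookup falls back to the ordinary in-block seek.

#include <cstdint>

// Mirrors the reserved markers above: 255 = empty bucket, 254 = collision.
enum class HashLookupOutcome {
  kKeyNotInThisBlock,      // empty bucket: the key was never added to this block
  kFallBackToRegularSeek,  // bucket shared by more than one restart interval
  kScanRestartInterval     // bucket holds the restart index to scan linearly
};

HashLookupOutcome Interpret(uint8_t bucket_value) {
  constexpr uint8_t kNoEntry = 255;
  constexpr uint8_t kCollision = 254;
  if (bucket_value == kNoEntry) return HashLookupOutcome::kKeyNotInThisBlock;
  if (bucket_value == kCollision) return HashLookupOutcome::kFallBackToRegularSeek;
  return HashLookupOutcome::kScanRestartInterval;
}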
+ bool valid_; + + std::vector> hash_and_restart_pairs_; + friend class DataBlockHashIndex_DataBlockHashTestSmall_Test; +}; + +class DataBlockHashIndex { + public: + DataBlockHashIndex() : num_buckets_(0) {} + + void Initialize(const char* data, uint16_t size, uint16_t* map_offset); + + uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const; + + inline bool Valid() { return num_buckets_ != 0; } + + private: + // To make the serialized hash index compact and to save the space overhead, + // here all the data fields persisted in the block are in uint16 format. + // We find that a uint16 is large enough to index every offset of a 64KiB + // block. + // So in other words, DataBlockHashIndex does not support block size equal + // or greater then 64KiB. + uint16_t num_buckets_; +}; + +} // namespace rocksdb diff -Nru rocksdb-5.15.10/table/data_block_hash_index_test.cc rocksdb-5.17.2/table/data_block_hash_index_test.cc --- rocksdb-5.15.10/table/data_block_hash_index_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/table/data_block_hash_index_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,728 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include + +#include "rocksdb/slice.h" +#include "table/block.h" +#include "table/block_based_table_reader.h" +#include "table/block_builder.h" +#include "table/data_block_hash_index.h" +#include "table/get_context.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +bool SearchForOffset(DataBlockHashIndex& index, const char* data, + uint16_t map_offset, const Slice& key, + uint8_t& restart_point) { + uint8_t entry = index.Lookup(data, map_offset, key); + if (entry == kCollision) { + return true; + } + + if (entry == kNoEntry) { + return false; + } + + return entry == restart_point; +} + +// Random KV generator similer to block_test +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} +std::string GenerateKey(int primary_key, int secondary_key, int padding_size, + Random* rnd) { + char buf[50]; + char* p = &buf[0]; + snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); + std::string k(p); + if (padding_size) { + k += RandomString(rnd, padding_size); + } + + return k; +} + +// Generate random key value pairs. +// The generated key will be sorted. You can tune the parameters to generated +// different kinds of test key/value pairs for different scenario. 
+void GenerateRandomKVs(std::vector* keys, + std::vector* values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generating keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + // 100 bytes values + values->emplace_back(RandomString(&rnd, 100)); + } + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestSmall) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + for (int j = 0; j < 5; j++) { + for (uint8_t i = 0; i < 2 + j; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("fake"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 2; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } + builder.Reset(); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTest) { + // bucket_num = 200, #keys = 100. 50% utilization + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("fake content"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestCollision) { + // bucket_num = 2. 
There will be intense hash collisions + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("some other fake content to take up space"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestLarge) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + std::unordered_map m; + + for (uint8_t i = 0; i < 100; i++) { + if (i % 2) { + continue; // leave half of the keys out + } + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + m[key] = restart_point; + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("filling stuff"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + if (m.count(key)) { + ASSERT_TRUE(m[key] == restart_point); + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } else { + // we allow false positve, so don't test the nonexisting keys. + // when false positive happens, the search will continue to the + // restart intervals to see if the key really exist. + } + } +} + +TEST(DataBlockHashIndex, RestartIndexExceedMax) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + std::unordered_map m; + + for (uint8_t i = 0; i <= 253; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + ASSERT_TRUE(builder.Valid()); + + builder.Reset(); + + for (uint8_t i = 0; i <= 254; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + ASSERT_FALSE(builder.Valid()); + + builder.Reset(); + ASSERT_TRUE(builder.Valid()); +} + +TEST(DataBlockHashIndex, BlockRestartIndexExceedMax) { + Options options = Options(); + + BlockBuilder builder(1 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + // #restarts <= 253. 
HashIndex is valid + for (int i = 0; i <= 253; i++) { + std::string ukey = "key" + std::to_string(i); + InternalKey ikey(ukey, 0, kTypeValue); + builder.Add(ikey.Encode().ToString(), "value"); + } + + { + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinaryAndHash); + } + + builder.Reset(); + + // #restarts > 253. HashIndex is not used + for (int i = 0; i <= 254; i++) { + std::string ukey = "key" + std::to_string(i); + InternalKey ikey(ukey, 0, kTypeValue); + builder.Add(ikey.Encode().ToString(), "value"); + } + + { + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinarySearch); + } +} + +TEST(DataBlockHashIndex, BlockSizeExceedMax) { + Options options = Options(); + std::string ukey(10, 'k'); + InternalKey ikey(ukey, 0, kTypeValue); + + BlockBuilder builder(1 /* block_restart_interval */, + false /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + { + // insert a large value. The block size plus HashIndex is 65536. + std::string value(65502, 'v'); + + builder.Add(ikey.Encode().ToString(), value); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex); + std::cerr << "block size: " << rawblock.size() << std::endl; + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinaryAndHash); + } + + builder.Reset(); + + { + // insert a large value. The block size plus HashIndex would be 65537. + // This excceed the max block size supported by HashIndex (65536). + // So when build finishes HashIndex will not be created for the block. + std::string value(65503, 'v'); + + builder.Add(ikey.Encode().ToString(), value); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex); + std::cerr << "block size: " << rawblock.size() << std::endl; + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + // the index type have fallen back to binary when build finish. 
+ ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinarySearch); + } +} + +TEST(DataBlockHashIndex, BlockTestSingleKey) { + Options options = Options(); + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + std::string ukey("gopher"); + std::string value("gold"); + InternalKey ikey(ukey, 10, kTypeValue); + builder.Add(ikey.Encode().ToString(), value /*value*/); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const InternalKeyComparator icmp(BytewiseComparator()); + auto iter = reader.NewIterator(&icmp, icmp.user_comparator()); + bool may_exist; + // search in block for the key just inserted + { + InternalKey seek_ikey(ukey, 10, kValueTypeForSeek); + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ( + options.comparator->Compare(iter->key(), ikey.Encode().ToString()), 0); + ASSERT_EQ(iter->value(), value); + } + + // search in block for the existing ukey, but with higher seqno + { + InternalKey seek_ikey(ukey, 20, kValueTypeForSeek); + + // HashIndex should be able to set the iter correctly + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + + // user key should match + ASSERT_EQ(options.comparator->Compare(ExtractUserKey(iter->key()), ukey), + 0); + + // seek_key seqno number should be greater than that of iter result + ASSERT_GT(GetInternalKeySeqno(seek_ikey.Encode()), + GetInternalKeySeqno(iter->key())); + + ASSERT_EQ(iter->value(), value); + } + + // Search in block for the existing ukey, but with lower seqno + // in this case, hash can find the only occurrence of the user_key, but + // ParseNextDataKey() will skip it as it does not have a older seqno. + // In this case, GetForSeek() is effective to locate the user_key, and + // iter->Valid() == false indicates that we've reached to the end of + // the block and the caller should continue searching the next block. + { + InternalKey seek_ikey(ukey, 5, kValueTypeForSeek); + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_FALSE(iter->Valid()); // should have reached to the end of block + } + + delete iter; +} + +TEST(DataBlockHashIndex, BlockTestLarge) { + Random rnd(1019); + Options options = Options(); + std::vector keys; + std::vector values; + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + int num_records = 500; + + GenerateRandomKVs(&keys, &values, 0, num_records); + + // Generate keys. Adding a trailing "1" to indicate existent keys. + // Later will Seeking for keys with a trailing "0" to test seeking + // non-existent keys. 
+TEST(DataBlockHashIndex, BlockTestLarge) { + Random rnd(1019); + Options options = Options(); + std::vector keys; + std::vector values; + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + int num_records = 500; + + GenerateRandomKVs(&keys, &values, 0, num_records); + + // Generate keys. Add a trailing "1" to indicate existent keys. + // Later we will seek keys with a trailing "0" to test seeking + // non-existent keys. + for (int i = 0; i < num_records; i++) { + std::string ukey(keys[i] + "1" /* existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + builder.Add(ikey.Encode().ToString(), values[i]); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + const InternalKeyComparator icmp(BytewiseComparator()); + + // randomly seek existent keys + for (int i = 0; i < num_records; i++) { + auto iter = + reader.NewIterator(&icmp, icmp.user_comparator()); + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + std::string ukey(keys[index] + "1" /* existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + + // search in block for this key + bool may_exist = iter->SeekForGet(ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(values[index], iter->value()); + + delete iter; + } + + // randomly seek non-existent user keys + // In case A), the user_key cannot be found in the HashIndex. The key may + // exist in the next block, so the iter is invalidated to tell the + // caller to search the next block. This test exercises case A). + // + // Note that for non-existent keys, there is a possibility of a false + // positive, i.e. the key is still hashed into some restart interval. + // Two additional outcomes are possible: + // B) the restart interval is linearly searched without a match; the iter + // stops at the start of the next restart interval. The key does not exist + // anywhere. + // C) the restart interval is linearly searched without a match; the iter + // stops at the end of the block, i.e. restarts_. The key may exist in the + // next block.
+ // So these combinations are possible when searching non-existent user_key: + // + // case# may_exist iter->Valid() + // A true false + // B false true + // C true false + + for (int i = 0; i < num_records; i++) { + auto iter = + reader.NewIterator(&icmp, icmp.user_comparator()); + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + std::string ukey(keys[index] + "0" /* non-existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + + // search in block for this key + bool may_exist = iter->SeekForGet(ikey.Encode().ToString()); + if (!may_exist) { + ASSERT_TRUE(iter->Valid()); + } + if (!iter->Valid()) { + ASSERT_TRUE(may_exist); + } + + delete iter; + } +} + +// helper routine for DataBlockHashIndex.BlockBoundary +void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, + std::string& v2, InternalKey& seek_ikey, + GetContext& get_context, Options& options) { + unique_ptr file_writer; + unique_ptr file_reader; + unique_ptr table_reader; + int level_ = -1; + + std::vector keys; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + + EnvOptions soptions; + + soptions.use_mmap_reads = ioptions.allow_mmap_reads; + file_writer.reset( + test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + unique_ptr builder; + std::vector> + int_tbl_prop_collector_factories; + std::string column_family_name; + builder.reset(ioptions.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, internal_comparator, + &int_tbl_prop_collector_factories, + options.compression, CompressionOptions(), + nullptr /* compression_dict */, + false /* skip_filters */, column_family_name, level_), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + file_writer.get())); + + builder->Add(ik1.Encode().ToString(), v1); + builder->Add(ik2.Encode().ToString(), v2); + EXPECT_TRUE(builder->status().ok()); + + Status s = builder->Finish(); + file_writer->Flush(); + EXPECT_TRUE(s.ok()) << s.ToString(); + + EXPECT_EQ(static_cast(file_writer->writable_file()) + ->contents() + .size(), + builder->FileSize()); + + // Open the table + file_reader.reset(test::GetRandomAccessFileReader(new test::StringSource( + static_cast(file_writer->writable_file())->contents(), + 0 /*uniq_id*/, ioptions.allow_mmap_reads))); + const bool kSkipFilters = true; + const bool kImmortal = true; + ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, + internal_comparator, !kSkipFilters, !kImmortal, + level_), + std::move(file_reader), + static_cast(file_writer->writable_file()) + ->contents() + .size(), + &table_reader); + // Search using Get() + ReadOptions ro; + + ASSERT_OK(table_reader->Get(ro, seek_ikey.Encode().ToString(), &get_context, + moptions.prefix_extractor.get())); +} + +TEST(DataBlockHashIndex, BlockBoundary) { + BlockBasedTableOptions table_options; + table_options.data_block_index_type = + BlockBasedTableOptions::kDataBlockBinaryAndHash; + table_options.block_restart_interval = 1; + table_options.block_size = 4096; + + Options options; + options.comparator = BytewiseComparator(); + + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // insert two large k/v pair. Given that the block_size is 4096, one k/v + // pair will take up one block. 
+ // [ k1/v1 ][ k2/v2 ] + // [ Block N ][ Block N+1 ] + + { + // [ "aab"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("aab"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@120 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 120, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v1); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@5 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 5, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + value.Reset(); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru rocksdb-5.15.10/table/format.cc rocksdb-5.17.2/table/format.cc --- rocksdb-5.15.10/table/format.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/format.cc 2018-11-12 19:57:32.000000000 +0000 @@ -66,6 +66,18 @@ } } +Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) { + if (GetVarint64(input, &size_)) { + offset_ = _offset; + return Status::OK(); + } else { + // 
reset in case failure after partially decoding + offset_ = 0; + size_ = 0; + return Status::Corruption("bad block handle"); + } +} + // Return a string that contains the copy of handle. std::string BlockHandle::ToString(bool hex) const { std::string handle_str; diff -Nru rocksdb-5.15.10/table/format.h rocksdb-5.17.2/table/format.h --- rocksdb-5.15.10/table/format.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/format.h 2018-11-12 19:57:32.000000000 +0000 @@ -54,6 +54,7 @@ void EncodeTo(std::string* dst) const; Status DecodeFrom(Slice* input); + Status DecodeSizeFrom(uint64_t offset, Slice* input); // Return a string that contains the copy of handle. std::string ToString(bool hex = true) const; @@ -90,7 +91,7 @@ } inline bool BlockBasedTableSupportedVersion(uint32_t version) { - return version <= 3; + return version <= 4; } // Footer encapsulates the fixed information stored at the tail diff -Nru rocksdb-5.15.10/table/get_context.cc rocksdb-5.17.2/table/get_context.cc --- rocksdb-5.15.10/table/get_context.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/get_context.cc 2018-11-12 19:57:32.000000000 +0000 @@ -91,11 +91,73 @@ } } -void GetContext::RecordCounters(Tickers ticker, size_t val) { - if (ticker == Tickers::TICKER_ENUM_MAX) { - return; +void GetContext::ReportCounters() { + if (get_context_stats_.num_cache_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_HIT, get_context_stats_.num_cache_hit); + } + if (get_context_stats_.num_cache_index_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_HIT, + get_context_stats_.num_cache_index_hit); + } + if (get_context_stats_.num_cache_data_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_HIT, + get_context_stats_.num_cache_data_hit); + } + if (get_context_stats_.num_cache_filter_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_HIT, + get_context_stats_.num_cache_filter_hit); + } + if (get_context_stats_.num_cache_index_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_MISS, + get_context_stats_.num_cache_index_miss); + } + if (get_context_stats_.num_cache_filter_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_MISS, + get_context_stats_.num_cache_filter_miss); + } + if (get_context_stats_.num_cache_data_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_MISS, + get_context_stats_.num_cache_data_miss); + } + if (get_context_stats_.num_cache_bytes_read > 0) { + RecordTick(statistics_, BLOCK_CACHE_BYTES_READ, + get_context_stats_.num_cache_bytes_read); + } + if (get_context_stats_.num_cache_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_MISS, + get_context_stats_.num_cache_miss); + } + if (get_context_stats_.num_cache_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add); + } + if (get_context_stats_.num_cache_bytes_write > 0) { + RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE, + get_context_stats_.num_cache_bytes_write); + } + if (get_context_stats_.num_cache_index_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD, + get_context_stats_.num_cache_index_add); + } + if (get_context_stats_.num_cache_index_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT, + get_context_stats_.num_cache_index_bytes_insert); + } + if (get_context_stats_.num_cache_data_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_ADD, + get_context_stats_.num_cache_data_add); + } + if (get_context_stats_.num_cache_data_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT, + get_context_stats_.num_cache_data_bytes_insert); 
+ } + if (get_context_stats_.num_cache_filter_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD, + get_context_stats_.num_cache_filter_add); + } + if (get_context_stats_.num_cache_filter_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT, + get_context_stats_.num_cache_filter_bytes_insert); } - tickers_value[ticker] += static_cast(val); } bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, diff -Nru rocksdb-5.15.10/table/get_context.h rocksdb-5.17.2/table/get_context.h --- rocksdb-5.15.10/table/get_context.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/get_context.h 2018-11-12 19:57:32.000000000 +0000 @@ -17,6 +17,26 @@ class MergeContext; class PinnedIteratorsManager; +struct GetContextStats { + uint64_t num_cache_hit = 0; + uint64_t num_cache_index_hit = 0; + uint64_t num_cache_data_hit = 0; + uint64_t num_cache_filter_hit = 0; + uint64_t num_cache_index_miss = 0; + uint64_t num_cache_filter_miss = 0; + uint64_t num_cache_data_miss = 0; + uint64_t num_cache_bytes_read = 0; + uint64_t num_cache_miss = 0; + uint64_t num_cache_add = 0; + uint64_t num_cache_bytes_write = 0; + uint64_t num_cache_index_add = 0; + uint64_t num_cache_index_bytes_insert = 0; + uint64_t num_cache_data_add = 0; + uint64_t num_cache_data_bytes_insert = 0; + uint64_t num_cache_filter_add = 0; + uint64_t num_cache_filter_bytes_insert = 0; +}; + class GetContext { public: enum GetState { @@ -27,7 +47,7 @@ kMerge, // saver contains the current merge result (the operands) kBlobIndex, }; - uint64_t tickers_value[Tickers::TICKER_ENUM_MAX] = {0}; + GetContextStats get_context_stats_; GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, @@ -77,7 +97,7 @@ return true; } - void RecordCounters(Tickers ticker, size_t val); + void ReportCounters(); private: const Comparator* ucmp_; diff -Nru rocksdb-5.15.10/table/index_builder.cc rocksdb-5.17.2/table/index_builder.cc --- rocksdb-5.15.10/table/index_builder.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/index_builder.cc 2018-11-12 19:57:32.000000000 +0000 @@ -27,23 +27,26 @@ BlockBasedTableOptions::IndexType index_type, const InternalKeyComparator* comparator, const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt) { IndexBuilder* result = nullptr; switch (index_type) { case BlockBasedTableOptions::kBinarySearch: { - result = new ShortenedIndexBuilder(comparator, - table_opt.index_block_restart_interval, - table_opt.format_version); + result = new ShortenedIndexBuilder( + comparator, table_opt.index_block_restart_interval, + table_opt.format_version, use_value_delta_encoding); } break; case BlockBasedTableOptions::kHashSearch: { result = new HashIndexBuilder(comparator, int_key_slice_transform, table_opt.index_block_restart_interval, - table_opt.format_version); + table_opt.format_version, + use_value_delta_encoding); } break; case BlockBasedTableOptions::kTwoLevelIndexSearch: { - result = PartitionedIndexBuilder::CreateIndexBuilder(comparator, table_opt); + result = PartitionedIndexBuilder::CreateIndexBuilder( + comparator, use_value_delta_encoding, table_opt); } break; default: { @@ -56,18 +59,23 @@ PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder( const InternalKeyComparator* comparator, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt) { - return new PartitionedIndexBuilder(comparator, 
table_opt); + return new PartitionedIndexBuilder(comparator, table_opt, + use_value_delta_encoding); } PartitionedIndexBuilder::PartitionedIndexBuilder( const InternalKeyComparator* comparator, - const BlockBasedTableOptions& table_opt) + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding) : IndexBuilder(comparator), index_block_builder_(table_opt.index_block_restart_interval, - table_opt.format_version), + true /*use_delta_encoding*/, + use_value_delta_encoding), index_block_builder_without_seq_(table_opt.index_block_restart_interval, - table_opt.format_version), + true /*use_delta_encoding*/, + use_value_delta_encoding), sub_index_builder_(nullptr), table_opt_(table_opt), // We start by false. After each partition we revise the value based on @@ -76,7 +84,8 @@ // sub_index_builder. Otherwise, it could be set to true even one of the // sub_index_builders could not safely exclude seq from the keys, then it // wil be enforced on all sub_index_builders on ::Finish. - seperator_is_key_plus_seq_(false) {} + seperator_is_key_plus_seq_(false), + use_value_delta_encoding_(use_value_delta_encoding) {} PartitionedIndexBuilder::~PartitionedIndexBuilder() { delete sub_index_builder_; @@ -86,7 +95,7 @@ assert(sub_index_builder_ == nullptr); sub_index_builder_ = new ShortenedIndexBuilder( comparator_, table_opt_.index_block_restart_interval, - table_opt_.format_version); + table_opt_.format_version, use_value_delta_encoding_); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, // Note: this is sub-optimal since sub_index_builder_ could later reset @@ -162,10 +171,18 @@ Entry& last_entry = entries_.front(); std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); - index_block_builder_.Add(last_entry.key, handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); if (!seperator_is_key_plus_seq_) { index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key), - handle_encoding); + handle_encoding, + &handle_delta_encoding_slice); } entries_.pop_front(); } @@ -193,7 +210,5 @@ } } -size_t PartitionedIndexBuilder::NumPartitions() const { - return partition_cnt_; -} +size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; } } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/index_builder.h rocksdb-5.17.2/table/index_builder.h --- rocksdb-5.15.10/table/index_builder.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/index_builder.h 2018-11-12 19:57:32.000000000 +0000 @@ -38,6 +38,7 @@ BlockBasedTableOptions::IndexType index_type, const rocksdb::InternalKeyComparator* comparator, const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt); // Index builder will construct a set of blocks which contain: @@ -119,11 +120,16 @@ class ShortenedIndexBuilder : public IndexBuilder { public: explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator, - int index_block_restart_interval, - uint32_t format_version) + const int index_block_restart_interval, + const uint32_t format_version, + const bool use_value_delta_encoding) : 
IndexBuilder(comparator), - index_block_builder_(index_block_restart_interval), - index_block_builder_without_seq_(index_block_restart_interval) { + index_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding) { // Making the default true will disable the feature for old versions seperator_is_key_plus_seq_ = (format_version <= 2); } @@ -147,10 +153,17 @@ std::string handle_encoding; block_handle.EncodeTo(&handle_encoding); - index_block_builder_.Add(sep, handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64(&handle_delta_encoding, + block_handle.size() - last_encoded_handle_.size()); + assert(handle_delta_encoding.size() != 0); + last_encoded_handle_ = block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(sep, handle_encoding, + &handle_delta_encoding_slice); if (!seperator_is_key_plus_seq_) { - index_block_builder_without_seq_.Add(ExtractUserKey(sep), - handle_encoding); + index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding, + &handle_delta_encoding_slice); } } @@ -168,9 +181,7 @@ return Status::OK(); } - virtual size_t IndexSize() const override { - return index_size_; - } + virtual size_t IndexSize() const override { return index_size_; } virtual bool seperator_is_key_plus_seq() override { return seperator_is_key_plus_seq_; @@ -182,6 +193,7 @@ BlockBuilder index_block_builder_; BlockBuilder index_block_builder_without_seq_; bool seperator_is_key_plus_seq_; + BlockHandle last_encoded_handle_; }; // HashIndexBuilder contains a binary-searchable primary index and the @@ -216,10 +228,10 @@ explicit HashIndexBuilder(const InternalKeyComparator* comparator, const SliceTransform* hash_key_extractor, int index_block_restart_interval, - int format_version) + int format_version, bool use_value_delta_encoding) : IndexBuilder(comparator), primary_index_builder_(comparator, index_block_restart_interval, - format_version), + format_version, use_value_delta_encoding), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, @@ -322,10 +334,12 @@ public: static PartitionedIndexBuilder* CreateIndexBuilder( const rocksdb::InternalKeyComparator* comparator, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt); explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, - const BlockBasedTableOptions& table_opt); + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding); virtual ~PartitionedIndexBuilder(); @@ -337,12 +351,8 @@ IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) override; - virtual size_t IndexSize() const override { - return index_size_; - } - size_t TopLevelIndexSize(uint64_t) const { - return top_level_index_size_; - } + virtual size_t IndexSize() const override { return index_size_; } + size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; } size_t NumPartitions() const; inline bool ShouldCutFilterBlock() { @@ -364,6 +374,8 @@ return seperator_is_key_plus_seq_; } + bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } + private: // Set after ::Finish is called size_t top_level_index_size_ = 0; @@ -388,10 +400,12 @@ bool finishing_indexes = false; const BlockBasedTableOptions& table_opt_; bool seperator_is_key_plus_seq_; + bool 
use_value_delta_encoding_; // true if an external entity (such as filter partition builder) request // cutting the next partition bool partition_cut_requested_ = true; // true if it should cut the next filter partition block bool cut_filter_block = false; + BlockHandle last_encoded_handle_; }; } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/internal_iterator.h rocksdb-5.17.2/table/internal_iterator.h --- rocksdb-5.15.10/table/internal_iterator.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/internal_iterator.h 2018-11-12 19:57:32.000000000 +0000 @@ -10,15 +10,17 @@ #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" #include "rocksdb/status.h" +#include "table/format.h" namespace rocksdb { class PinnedIteratorsManager; -class InternalIterator : public Cleanable { +template +class InternalIteratorBase : public Cleanable { public: - InternalIterator() {} - virtual ~InternalIterator() {} + InternalIteratorBase() {} + virtual ~InternalIteratorBase() {} // An iterator is either positioned at a key/value pair, or // not valid. This method returns true iff the iterator is valid. @@ -66,7 +68,7 @@ // the returned slice is valid only until the next modification of // the iterator. // REQUIRES: Valid() - virtual Slice value() const = 0; + virtual TValue value() const = 0; // If an error has occurred, return it. Else return an ok status. // If non-blocking IO is requested and this operation cannot be @@ -117,14 +119,24 @@ private: // No copying allowed - InternalIterator(const InternalIterator&) = delete; - InternalIterator& operator=(const InternalIterator&) = delete; + InternalIteratorBase(const InternalIteratorBase&) = delete; + InternalIteratorBase& operator=(const InternalIteratorBase&) = delete; }; +using InternalIterator = InternalIteratorBase; + // Return an empty iterator (yields nothing). -extern InternalIterator* NewEmptyInternalIterator(); +template +extern InternalIteratorBase* NewEmptyInternalIterator(); // Return an empty iterator with the specified status. -extern InternalIterator* NewErrorInternalIterator(const Status& status); +template +extern InternalIteratorBase* NewErrorInternalIterator( + const Status& status); + +// Return an empty iterator with the specified status, allocated arena. 
+template +extern InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/iterator.cc rocksdb-5.17.2/table/iterator.cc --- rocksdb-5.15.10/table/iterator.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/iterator.cc 2018-11-12 19:57:32.000000000 +0000 @@ -131,7 +131,8 @@ Status status_; }; -class EmptyInternalIterator : public InternalIterator { +template +class EmptyInternalIterator : public InternalIteratorBase { public: explicit EmptyInternalIterator(const Status& s) : status_(s) {} virtual bool Valid() const override { return false; } @@ -145,9 +146,9 @@ assert(false); return Slice(); } - Slice value() const override { + TValue value() const override { assert(false); - return Slice(); + return TValue(); } virtual Status status() const override { return status_; } @@ -164,30 +165,48 @@ return new EmptyIterator(status); } -InternalIterator* NewEmptyInternalIterator() { - return new EmptyInternalIterator(Status::OK()); -} - -InternalIterator* NewEmptyInternalIterator(Arena* arena) { +template +InternalIteratorBase* NewErrorInternalIterator(const Status& status) { + return new EmptyInternalIterator(status); +} +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status); + +template +InternalIteratorBase* NewErrorInternalIterator(const Status& status, + Arena* arena) { if (arena == nullptr) { - return NewEmptyInternalIterator(); + return NewErrorInternalIterator(status); } else { auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); - return new (mem) EmptyInternalIterator(Status::OK()); + return new (mem) EmptyInternalIterator(status); } } - -InternalIterator* NewErrorInternalIterator(const Status& status) { - return new EmptyInternalIterator(status); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); + +template +InternalIteratorBase* NewEmptyInternalIterator() { + return new EmptyInternalIterator(Status::OK()); } +template InternalIteratorBase* NewEmptyInternalIterator(); +template InternalIteratorBase* NewEmptyInternalIterator(); -InternalIterator* NewErrorInternalIterator(const Status& status, Arena* arena) { +template +InternalIteratorBase* NewEmptyInternalIterator(Arena* arena) { if (arena == nullptr) { - return NewErrorInternalIterator(status); + return NewEmptyInternalIterator(); } else { auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); - return new (mem) EmptyInternalIterator(status); + return new (mem) EmptyInternalIterator(Status::OK()); } } +template InternalIteratorBase* NewEmptyInternalIterator( + Arena* arena); +template InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/iterator_wrapper.h rocksdb-5.17.2/table/iterator_wrapper.h --- rocksdb-5.15.10/table/iterator_wrapper.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/iterator_wrapper.h 2018-11-12 19:57:32.000000000 +0000 @@ -19,19 +19,21 @@ // the valid() and key() results for an underlying iterator. // This can help avoid virtual function calls and also gives better // cache locality. 
-class IteratorWrapper { +template +class IteratorWrapperBase { public: - IteratorWrapper() : iter_(nullptr), valid_(false) {} - explicit IteratorWrapper(InternalIterator* _iter) : iter_(nullptr) { + IteratorWrapperBase() : iter_(nullptr), valid_(false) {} + explicit IteratorWrapperBase(InternalIteratorBase* _iter) + : iter_(nullptr) { Set(_iter); } - ~IteratorWrapper() {} - InternalIterator* iter() const { return iter_; } + ~IteratorWrapperBase() {} + InternalIteratorBase* iter() const { return iter_; } // Set the underlying Iterator to _iter and return // previous underlying Iterator. - InternalIterator* Set(InternalIterator* _iter) { - InternalIterator* old_iter = iter_; + InternalIteratorBase* Set(InternalIteratorBase* _iter) { + InternalIteratorBase* old_iter = iter_; iter_ = _iter; if (iter_ == nullptr) { @@ -47,7 +49,7 @@ if (!is_arena_mode) { delete iter_; } else { - iter_->~InternalIterator(); + iter_->~InternalIteratorBase(); } } } @@ -55,7 +57,10 @@ // Iterator interface methods bool Valid() const { return valid_; } Slice key() const { assert(Valid()); return key_; } - Slice value() const { assert(Valid()); return iter_->value(); } + TValue value() const { + assert(Valid()); + return iter_->value(); + } // Methods below require iter() != nullptr Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); iter_->Next(); Update(); } @@ -91,17 +96,16 @@ } } - InternalIterator* iter_; + InternalIteratorBase* iter_; bool valid_; Slice key_; }; +using IteratorWrapper = IteratorWrapperBase; + class Arena; // Return an empty iterator (yields nothing) allocated from arena. -extern InternalIterator* NewEmptyInternalIterator(Arena* arena); - -// Return an empty iterator with the specified status, allocated arena. -extern InternalIterator* NewErrorInternalIterator(const Status& status, - Arena* arena); +template +extern InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/merging_iterator.cc rocksdb-5.17.2/table/merging_iterator.cc --- rocksdb-5.15.10/table/merging_iterator.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/merging_iterator.cc 2018-11-12 19:57:32.000000000 +0000 @@ -387,7 +387,7 @@ Arena* arena, bool prefix_seek_mode) { assert(n >= 0); if (n == 0) { - return NewEmptyInternalIterator(arena); + return NewEmptyInternalIterator(arena); } else if (n == 1) { return list[0]; } else { diff -Nru rocksdb-5.15.10/table/merging_iterator.h rocksdb-5.17.2/table/merging_iterator.h --- rocksdb-5.15.10/table/merging_iterator.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/merging_iterator.h 2018-11-12 19:57:32.000000000 +0000 @@ -15,9 +15,11 @@ namespace rocksdb { class Comparator; -class InternalIterator; class Env; class Arena; +template +class InternalIteratorBase; +using InternalIterator = InternalIteratorBase; // Return an iterator that provided the union of the data in // children[0,n-1]. 
Takes ownership of the child iterators and diff -Nru rocksdb-5.15.10/table/meta_blocks.cc rocksdb-5.17.2/table/meta_blocks.cc --- rocksdb-5.15.10/table/meta_blocks.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/meta_blocks.cc 2018-11-12 19:57:32.000000000 +0000 @@ -76,6 +76,8 @@ Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size); } Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key); + Add(TablePropertiesNames::kIndexValueIsDeltaEncoded, + props.index_value_is_delta_encoded); Add(TablePropertiesNames::kNumEntries, props.num_entries); Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); @@ -218,6 +220,8 @@ &new_table_properties->top_level_index_size}, {TablePropertiesNames::kIndexKeyIsUserKey, &new_table_properties->index_key_is_user_key}, + {TablePropertiesNames::kIndexValueIsDeltaEncoded, + &new_table_properties->index_value_is_delta_encoded}, {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size}, {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size}, {TablePropertiesNames::kRawValueSize, diff -Nru rocksdb-5.15.10/table/meta_blocks.h rocksdb-5.17.2/table/meta_blocks.h --- rocksdb-5.15.10/table/meta_blocks.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/meta_blocks.h 2018-11-12 19:57:32.000000000 +0000 @@ -27,7 +27,6 @@ class Logger; class RandomAccessFile; struct TableProperties; -class InternalIterator; class MetaIndexBuilder { public: diff -Nru rocksdb-5.15.10/table/mock_table.cc rocksdb-5.17.2/table/mock_table.cc --- rocksdb-5.15.10/table/mock_table.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/mock_table.cc 2018-11-12 19:57:32.000000000 +0000 @@ -93,7 +93,7 @@ return s; } - WritableFileWriter file_writer(std::move(file), EnvOptions()); + WritableFileWriter file_writer(std::move(file), fname, EnvOptions()); uint32_t id = GetAndWriteNextID(&file_writer); file_system_.files.insert({id, std::move(file_contents)}); diff -Nru rocksdb-5.15.10/table/partitioned_filter_block.cc rocksdb-5.17.2/table/partitioned_filter_block.cc --- rocksdb-5.15.10/table/partitioned_filter_block.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/partitioned_filter_block.cc 2018-11-12 19:57:32.000000000 +0000 @@ -26,12 +26,17 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, const uint32_t partition_size) : FullFilterBlockBuilder(prefix_extractor, whole_key_filtering, filter_bits_builder), - index_on_filter_block_builder_(index_block_restart_interval), - index_on_filter_block_builder_without_seq_(index_block_restart_interval), + index_on_filter_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_on_filter_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), p_index_builder_(p_index_builder), filters_in_partition_(0), num_added_(0) { @@ -73,10 +78,18 @@ FilterEntry& last_entry = filters.front(); std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); - index_on_filter_block_builder_.Add(last_entry.key, handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + 
&handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); if (!p_index_builder_->seperator_is_key_plus_seq()) { index_on_filter_block_builder_without_seq_.Add( - ExtractUserKey(last_entry.key), handle_encoding); + ExtractUserKey(last_entry.key), handle_encoding, + &handle_delta_encoding_slice); } filters.pop_front(); } else { @@ -109,12 +122,14 @@ const SliceTransform* prefix_extractor, bool _whole_key_filtering, BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/, Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq) + const BlockBasedTable* table, const bool index_key_includes_seq, + const bool index_value_is_full) : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), prefix_extractor_(prefix_extractor), comparator_(comparator), table_(table), - index_key_includes_seq_(index_key_includes_seq) { + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { idx_on_fltr_blk_.reset(new Block(std::move(contents), kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, stats)); @@ -134,15 +149,10 @@ Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); biter.SeekToFirst(); for (; biter.Valid(); biter.Next()) { - auto input = biter.value(); - auto s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - continue; - } + handle = biter.value(); auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, table_->rep_->cache_key_prefix_size, handle, cache_key); @@ -168,7 +178,7 @@ } bool cached = false; auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, &filter_handle, no_io, + GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, &cached, prefix_extractor); if (UNLIKELY(!filter_partition.value)) { return true; @@ -207,7 +217,7 @@ } bool cached = false; auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, &filter_handle, no_io, + GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, &cached, prefix_extractor); if (UNLIKELY(!filter_partition.value)) { return true; @@ -225,29 +235,26 @@ return res; } -Slice PartitionedFilterBlockReader::GetFilterPartitionHandle( +BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( const Slice& entry) { IndexBlockIter iter; Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { - return Slice(); + return BlockHandle(0, 0); } assert(iter.Valid()); - Slice handle_value = iter.value(); - return handle_value; + BlockHandle fltr_blk_handle = iter.value(); + return fltr_blk_handle; } BlockBasedTable::CachableEntry PartitionedFilterBlockReader::GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, Slice* handle_value, const bool no_io, - bool* cached, const SliceTransform* prefix_extractor) { - BlockHandle fltr_blk_handle; - auto s = fltr_blk_handle.DecodeFrom(handle_value); - assert(s.ok()); 
+ FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, + const bool no_io, bool* cached, const SliceTransform* prefix_extractor) { const bool is_a_filter_partition = true; auto block_cache = table_->rep_->table_options.block_cache.get(); if (LIKELY(block_cache != nullptr)) { @@ -299,39 +306,25 @@ // Before read partitions, prefetch them to avoid lots of IOs auto rep = table_->rep_; IndexBlockIter biter; - BlockHandle handle; Statistics* kNullStats = nullptr; idx_on_fltr_blk_->NewIterator( &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_); + index_key_includes_seq_, index_value_is_full_); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); - Slice input = biter.value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read first index partition"); - return; - } + BlockHandle handle = biter.value(); uint64_t prefetch_off = handle.offset(); // Read the last block's offset biter.SeekToLast(); - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read last index partition"); - return; - } + handle = biter.value(); uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; auto& file = table_->rep_->file; prefetch_buffer.reset(new FilePrefetchBuffer()); + Status s; s = prefetch_buffer->Prefetch(file.get(), prefetch_off, static_cast(prefetch_len)); @@ -339,14 +332,7 @@ biter.SeekToFirst(); Cache* block_cache = rep->table_options.block_cache.get(); for (; biter.Valid(); biter.Next()) { - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, "Could not read index partition"); - continue; - } - + handle = biter.value(); const bool no_io = true; const bool is_a_filter_partition = true; auto filter = table_->GetFilter( diff -Nru rocksdb-5.15.10/table/partitioned_filter_block.h rocksdb-5.17.2/table/partitioned_filter_block.h --- rocksdb-5.15.10/table/partitioned_filter_block.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/partitioned_filter_block.h 2018-11-12 19:57:32.000000000 +0000 @@ -26,6 +26,7 @@ explicit PartitionedFilterBlockBuilder( const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, const uint32_t partition_size); @@ -65,6 +66,7 @@ uint32_t filters_in_partition_; // Number of keys added size_t num_added_; + BlockHandle last_encoded_handle_; }; class PartitionedFilterBlockReader : public FilterBlockReader, @@ -74,7 +76,8 @@ const SliceTransform* prefix_extractor, bool whole_key_filtering, BlockContents&& contents, FilterBitsReader* filter_bits_reader, Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq); + const BlockBasedTable* table, const bool index_key_includes_seq, + const bool index_value_is_full); virtual ~PartitionedFilterBlockReader(); virtual bool IsBlockBased() override { return false; } @@ -89,10 +92,11 @@ virtual size_t ApproximateMemoryUsage() const override; private: - Slice GetFilterPartitionHandle(const Slice& entry); + BlockHandle 
GetFilterPartitionHandle(const Slice& entry); BlockBasedTable::CachableEntry GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, Slice* handle, const bool no_io, - bool* cached, const SliceTransform* prefix_extractor = nullptr); + FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, + const bool no_io, bool* cached, + const SliceTransform* prefix_extractor = nullptr); virtual void CacheDependencies( bool bin, const SliceTransform* prefix_extractor) override; @@ -101,6 +105,7 @@ const InternalKeyComparator comparator_; const BlockBasedTable* table_; const bool index_key_includes_seq_; + const bool index_value_is_full_; std::unordered_map> filter_map_; diff -Nru rocksdb-5.15.10/table/partitioned_filter_block_test.cc rocksdb-5.17.2/table/partitioned_filter_block_test.cc --- rocksdb-5.15.10/table/partitioned_filter_block_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/partitioned_filter_block_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -50,7 +50,9 @@ } }; -class PartitionedFilterBlockTest : public testing::Test { +class PartitionedFilterBlockTest + : public testing::Test, + virtual public ::testing::WithParamInterface { public: BlockBasedTableOptions table_options_; InternalKeyComparator icomp = InternalKeyComparator(BytewiseComparator()); @@ -60,6 +62,8 @@ table_options_.no_block_cache = true; // Otherwise BlockBasedTable::Close // will access variable that are not // initialized in our mocked version + table_options_.format_version = GetParam(); + table_options_.index_block_restart_interval = 3; } std::shared_ptr cache_; @@ -100,7 +104,9 @@ } PartitionedIndexBuilder* NewIndexBuilder() { - return PartitionedIndexBuilder::CreateIndexBuilder(&icomp, table_options_); + const bool kValueDeltaEncoded = true; + return PartitionedIndexBuilder::CreateIndexBuilder( + &icomp, !kValueDeltaEncoded, table_options_); } PartitionedFilterBlockBuilder* NewBuilder( @@ -113,11 +119,12 @@ 99) / 100); partition_size = std::max(partition_size, static_cast(1)); + const bool kValueDeltaEncoded = true; return new PartitionedFilterBlockBuilder( prefix_extractor, table_options_.whole_key_filtering, table_options_.filter_policy->GetFilterBitsBuilder(), - table_options_.index_block_restart_interval, p_index_builder, - partition_size); + table_options_.index_block_restart_interval, !kValueDeltaEncoded, + p_index_builder, partition_size); } std::unique_ptr table; @@ -143,7 +150,8 @@ !kSkipFilters, !kImmortal))); auto reader = new PartitionedFilterBlockReader( prefix_extractor, true, BlockContents(slice, false, kNoCompression), - nullptr, nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq()); + nullptr, nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq(), + !pib->get_use_value_delta_encoding()); return reader; } @@ -275,14 +283,19 @@ } }; -TEST_F(PartitionedFilterBlockTest, EmptyBuilder) { +INSTANTIATE_TEST_CASE_P(FormatDef, PartitionedFilterBlockTest, + testing::Values(test::kDefaultFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatLatest, PartitionedFilterBlockTest, + testing::Values(test::kLatestFormatVersion)); + +TEST_P(PartitionedFilterBlockTest, EmptyBuilder) { std::unique_ptr pib(NewIndexBuilder()); std::unique_ptr builder(NewBuilder(pib.get())); const bool empty = true; VerifyReader(builder.get(), pib.get(), empty); } -TEST_F(PartitionedFilterBlockTest, OneBlock) { +TEST_P(PartitionedFilterBlockTest, OneBlock) { uint64_t max_index_size = MaxIndexSize(); for (uint64_t i = 1; i < max_index_size + 1; i++) { table_options_.metadata_block_size = i; @@ -290,7 +303,7 @@ 
} } -TEST_F(PartitionedFilterBlockTest, TwoBlocksPerKey) { +TEST_P(PartitionedFilterBlockTest, TwoBlocksPerKey) { uint64_t max_index_size = MaxIndexSize(); for (uint64_t i = 1; i < max_index_size + 1; i++) { table_options_.metadata_block_size = i; @@ -300,7 +313,7 @@ // This reproduces the bug that a prefix is the same among multiple consecutive // blocks but the bug would add it only to the first block. -TEST_F(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { +TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { // some small number to cause partition cuts table_options_.metadata_block_size = 1; std::unique_ptr prefix_extractor @@ -326,7 +339,7 @@ } } -TEST_F(PartitionedFilterBlockTest, OneBlockPerKey) { +TEST_P(PartitionedFilterBlockTest, OneBlockPerKey) { uint64_t max_index_size = MaxIndexSize(); for (uint64_t i = 1; i < max_index_size + 1; i++) { table_options_.metadata_block_size = i; @@ -334,7 +347,7 @@ } } -TEST_F(PartitionedFilterBlockTest, PartitionCount) { +TEST_P(PartitionedFilterBlockTest, PartitionCount) { int num_keys = sizeof(keys) / sizeof(*keys); table_options_.metadata_block_size = std::max(MaxIndexSize(), MaxFilterSize()); diff -Nru rocksdb-5.15.10/table/plain_table_reader.cc rocksdb-5.17.2/table/plain_table_reader.cc --- rocksdb-5.15.10/table/plain_table_reader.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/plain_table_reader.cc 2018-11-12 19:57:32.000000000 +0000 @@ -277,7 +277,7 @@ Status PlainTableReader::MmapDataIfNeeded() { if (file_info_.is_mmap_mode) { // Get mmapped memory. - return file_info_.file->Read(0, file_size_, &file_info_.file_data, nullptr); + return file_info_.file->Read(0, static_cast(file_size_), &file_info_.file_data, nullptr); } return Status::OK(); } diff -Nru rocksdb-5.15.10/table/plain_table_reader.h rocksdb-5.17.2/table/plain_table_reader.h --- rocksdb-5.15.10/table/plain_table_reader.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/plain_table_reader.h 2018-11-12 19:57:32.000000000 +0000 @@ -38,7 +38,6 @@ class InternalKeyComparator; class PlainTableKeyDecoder; class GetContext; -class InternalIterator; using std::unique_ptr; using std::unordered_map; diff -Nru rocksdb-5.15.10/table/sst_file_writer.cc rocksdb-5.17.2/table/sst_file_writer.cc --- rocksdb-5.15.10/table/sst_file_writer.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/sst_file_writer.cc 2018-11-12 19:57:32.000000000 +0000 @@ -238,7 +238,7 @@ nullptr /* compression_dict */, r->skip_filters, r->column_family_name, unknown_level); r->file_writer.reset( - new WritableFileWriter(std::move(sst_file), r->env_options)); + new WritableFileWriter(std::move(sst_file), file_path, r->env_options)); // TODO(tec) : If table_factory is using compressed block cache, we will // be adding the external sst file blocks into it, which is wasteful. 
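// WritableFileWriter now takes the file name as an extra constructor argument
// (several hunks in this patch add it; tests that have no real path pass
// "" /* don't care */). A sketch of the updated call pattern, assuming the
// usual Env::NewWritableFile() flow:
//
//   unique_ptr<WritableFile> file;
//   Status s = env->NewWritableFile(fname, &file, env_options);
//   if (s.ok()) {
//     unique_ptr<WritableFileWriter> writer(
//         new WritableFileWriter(std::move(file), fname, env_options));
//   }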
diff -Nru rocksdb-5.15.10/table/table_builder.h rocksdb-5.17.2/table/table_builder.h --- rocksdb-5.15.10/table/table_builder.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/table_builder.h 2018-11-12 19:57:32.000000000 +0000 @@ -13,6 +13,7 @@ #include #include #include +#include "db/dbformat.h" #include "db/table_properties_collector.h" #include "options/cf_options.h" #include "rocksdb/options.h" @@ -32,13 +33,25 @@ const InternalKeyComparator& _internal_comparator, bool _skip_filters = false, bool _immortal = false, int _level = -1) + : TableReaderOptions(_ioptions, _prefix_extractor, _env_options, + _internal_comparator, _skip_filters, _immortal, + _level, 0 /* _largest_seqno */) {} + + // @param skip_filters Disables loading/accessing the filter block + TableReaderOptions(const ImmutableCFOptions& _ioptions, + const SliceTransform* _prefix_extractor, + const EnvOptions& _env_options, + const InternalKeyComparator& _internal_comparator, + bool _skip_filters, bool _immortal, int _level, + SequenceNumber _largest_seqno) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), env_options(_env_options), internal_comparator(_internal_comparator), skip_filters(_skip_filters), immortal(_immortal), - level(_level) {} + level(_level), + largest_seqno(_largest_seqno) {} const ImmutableCFOptions& ioptions; const SliceTransform* prefix_extractor; @@ -50,6 +63,8 @@ bool immortal; // what level this table/file is on, -1 for "not set, don't know" int level; + // largest seqno in the table + SequenceNumber largest_seqno; }; struct TableBuilderOptions { diff -Nru rocksdb-5.15.10/table/table_properties.cc rocksdb-5.17.2/table/table_properties.cc --- rocksdb-5.15.10/table/table_properties.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/table_properties.cc 2018-11-12 19:57:32.000000000 +0000 @@ -94,8 +94,9 @@ AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); char index_block_size_str[80]; snprintf(index_block_size_str, sizeof(index_block_size_str), - "index block size (user-key? %d)", - static_cast(index_key_is_user_key)); + "index block size (user-key? %d, delta-value? %d)", + static_cast(index_key_is_user_key), + static_cast(index_value_is_delta_encoded)); AppendProperty(result, index_block_size_str, index_size, prop_delim, kv_delim); if (index_partitions != 0) { @@ -163,6 +164,7 @@ index_partitions += tp.index_partitions; top_level_index_size += tp.top_level_index_size; index_key_is_user_key += tp.index_key_is_user_key; + index_value_is_delta_encoded += tp.index_value_is_delta_encoded; filter_size += tp.filter_size; raw_key_size += tp.raw_key_size; raw_value_size += tp.raw_value_size; @@ -181,6 +183,8 @@ "rocksdb.top-level.index.size"; const std::string TablePropertiesNames::kIndexKeyIsUserKey = "rocksdb.index.key.is.user.key"; +const std::string TablePropertiesNames::kIndexValueIsDeltaEncoded = + "rocksdb.index.value.is.delta.encoded"; const std::string TablePropertiesNames::kFilterSize = "rocksdb.filter.size"; const std::string TablePropertiesNames::kRawKeySize = diff -Nru rocksdb-5.15.10/table/table_properties_internal.h rocksdb-5.17.2/table/table_properties_internal.h --- rocksdb-5.15.10/table/table_properties_internal.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/table_properties_internal.h 2018-11-12 19:57:32.000000000 +0000 @@ -10,7 +10,6 @@ namespace rocksdb { -class InternalIterator; class BlockHandle; // Seek to the properties block. 
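// The new table property "rocksdb.index.value.is.delta.encoded" records
// whether the index block's values are delta encoded, i.e. the format_version
// 4 layout where only a varsigned size delta is stored per entry and the
// offset is reconstructed from the previous handle (see
// BlockHandle::DecodeSizeFrom above). A sketch of reading the property back
// through the existing properties API, assuming an already-open DB:
//
//   TablePropertiesCollection all_props;
//   Status s = db->GetPropertiesOfAllTables(&all_props);
//   if (s.ok()) {
//     for (const auto& file_and_props : all_props) {
//       const TableProperties& tp = *file_and_props.second;
//       // non-zero when this SST's index values are delta encoded
//       printf("%s: %llu\n", file_and_props.first.c_str(),
//              static_cast<unsigned long long>(
//                  tp.index_value_is_delta_encoded));
//     }
//   }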
diff -Nru rocksdb-5.15.10/table/table_reader_bench.cc rocksdb-5.17.2/table/table_reader_bench.cc --- rocksdb-5.15.10/table/table_reader_bench.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/table_reader_bench.cc 2018-11-12 19:57:32.000000000 +0000 @@ -94,7 +94,8 @@ std::vector > int_tbl_prop_collector_factories; - file_writer.reset(new WritableFileWriter(std::move(file), env_options)); + file_writer.reset( + new WritableFileWriter(std::move(file), file_name, env_options)); int unknown_level = -1; tb = opts.table_factory->NewTableBuilder( TableBuilderOptions( diff -Nru rocksdb-5.15.10/table/table_reader.h rocksdb-5.17.2/table/table_reader.h --- rocksdb-5.15.10/table/table_reader.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/table_reader.h 2018-11-12 19:57:32.000000000 +0000 @@ -21,7 +21,6 @@ struct ReadOptions; struct TableProperties; class GetContext; -class InternalIterator; // A Table is a sorted map from strings to strings. Tables are // immutable and persistent. A Table may be safely accessed from diff -Nru rocksdb-5.15.10/table/table_test.cc rocksdb-5.17.2/table/table_test.cc --- rocksdb-5.15.10/table/table_test.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/table_test.cc 2018-11-12 19:57:32.000000000 +0000 @@ -323,7 +323,8 @@ const stl_wrappers::KVMap& kv_map) override { Reset(); soptions.use_mmap_reads = ioptions.allow_mmap_reads; - file_writer_.reset(test::GetWritableFileWriter(new test::StringSink())); + file_writer_.reset(test::GetWritableFileWriter(new test::StringSink(), + "" /* don't care */)); unique_ptr builder; std::vector> int_tbl_prop_collector_factories; @@ -364,7 +365,8 @@ TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, internal_comparator, !kSkipFilters, !kImmortal, level_), - std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); + std::move(file_reader_), TEST_GetSink()->contents().size(), + &table_reader_); } virtual InternalIterator* NewIterator( @@ -394,12 +396,11 @@ return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, *last_internal_key_), - std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); + std::move(file_reader_), TEST_GetSink()->contents().size(), + &table_reader_); } - virtual TableReader* GetTableReader() { - return table_reader_.get(); - } + virtual TableReader* GetTableReader() { return table_reader_.get(); } virtual bool AnywayDeleteIterator() const override { return convert_to_internal_key_; @@ -1075,6 +1076,7 @@ }; class PlainTableTest : public TableTest {}; class TablePropertyTest : public testing::Test {}; +class BBTTailPrefetchTest : public TableTest {}; INSTANTIATE_TEST_CASE_P(FormatDef, BlockBasedTableTest, testing::Values(test::kDefaultFormatVersion)); @@ -2549,7 +2551,7 @@ PlainTableFactory factory(plain_table_options); test::StringSink sink; unique_ptr file_writer( - test::GetWritableFileWriter(new test::StringSink())); + test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); Options options; const ImmutableCFOptions ioptions(options); const MutableCFOptions moptions(options); @@ -2987,9 +2989,13 @@ class IndexBlockRestartIntervalTest : public TableTest, - public ::testing::WithParamInterface { + public ::testing::WithParamInterface> { public: - static std::vector GetRestartValues() { return {-1, 0, 1, 8, 16, 32}; } + static std::vector> GetRestartValues() { + return {{-1, false}, {0, false}, {1, false}, {8, false}, + {16, false}, {32, 
false}, {-1, true}, {0, true}, + {1, true}, {8, true}, {16, true}, {32, true}}; + } }; INSTANTIATE_TEST_CASE_P( @@ -3001,12 +3007,16 @@ const int kKeySize = 100; const int kValSize = 500; - int index_block_restart_interval = GetParam(); + const int index_block_restart_interval = std::get<0>(GetParam()); + const bool value_delta_encoding = std::get<1>(GetParam()); Options options; BlockBasedTableOptions table_options; table_options.block_size = 64; // small block size to get big index block table_options.index_block_restart_interval = index_block_restart_interval; + if (value_delta_encoding) { + table_options.format_version = 4; + } options.table_factory.reset(new BlockBasedTableFactory(table_options)); TableConstructor c(BytewiseComparator()); @@ -3131,10 +3141,18 @@ // rocksdb still works. } -TEST_P(BlockBasedTableTest, TableWithGlobalSeqno) { +/* + * Disable TableWithGlobalSeqno since RocksDB does not store global_seqno in + * the SST file any more. Instead, RocksDB deduces global_seqno from the + * MANIFEST while reading from an SST. Therefore, it's not possible to test the + * functionality of global_seqno in a single, isolated unit test without the + * involvement of Version, VersionSet, etc. + */ +TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); test::StringSink* sink = new test::StringSink(); - unique_ptr file_writer(test::GetWritableFileWriter(sink)); + unique_ptr file_writer( + test::GetWritableFileWriter(sink, "" /* don't care */)); Options options; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); const ImmutableCFOptions ioptions(options); @@ -3315,7 +3333,8 @@ BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - unique_ptr file_writer(test::GetWritableFileWriter(sink)); + unique_ptr file_writer( + test::GetWritableFileWriter(sink, "" /* don't care */)); Options options; options.compression = kNoCompression; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); @@ -3404,7 +3423,8 @@ BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - unique_ptr file_writer(test::GetWritableFileWriter(sink)); + unique_ptr file_writer( + test::GetWritableFileWriter(sink, "" /* don't care */)); Options options; options.compression = kNoCompression; @@ -3594,6 +3614,145 @@ ASSERT_NOK(rocksdb::DB::Open(options, kDBPath, &db)); } +TEST_F(BBTTailPrefetchTest, TestTailPrefetchStats) { + TailPrefetchStats tpstats; + ASSERT_EQ(0, tpstats.GetSuggestedPrefetchSize()); + tpstats.RecordEffectiveSize(size_t{1000}); + tpstats.RecordEffectiveSize(size_t{1005}); + tpstats.RecordEffectiveSize(size_t{1002}); + ASSERT_EQ(1005, tpstats.GetSuggestedPrefetchSize()); + + // One single super large value shouldn't influence much + tpstats.RecordEffectiveSize(size_t{1002000}); + tpstats.RecordEffectiveSize(size_t{999}); + ASSERT_LE(1005, tpstats.GetSuggestedPrefetchSize()); + ASSERT_GT(1200, tpstats.GetSuggestedPrefetchSize()); + + // Only history of 32 is kept + for (int i = 0; i < 32; i++) { + tpstats.RecordEffectiveSize(size_t{100}); + } + ASSERT_EQ(100, tpstats.GetSuggestedPrefetchSize()); + + // 16 large values and 16 small values. The result should be closer + // to the small value as the algorithm. 
+ for (int i = 0; i < 16; i++) { + tpstats.RecordEffectiveSize(size_t{1000}); + } + tpstats.RecordEffectiveSize(size_t{10}); + tpstats.RecordEffectiveSize(size_t{20}); + for (int i = 0; i < 6; i++) { + tpstats.RecordEffectiveSize(size_t{100}); + } + ASSERT_LE(80, tpstats.GetSuggestedPrefetchSize()); + ASSERT_GT(200, tpstats.GetSuggestedPrefetchSize()); +} + +TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) { + TailPrefetchStats tpstats; + FilePrefetchBuffer buffer(nullptr, 0, 0, false, true); + buffer.TryReadFromCache(500, 10, nullptr); + buffer.TryReadFromCache(480, 10, nullptr); + buffer.TryReadFromCache(490, 10, nullptr); + ASSERT_EQ(480, buffer.min_offset_read()); +} + +TEST_P(BlockBasedTableTest, DataBlockHashIndex) { + const int kNumKeys = 500; + const int kKeySize = 8; + const int kValSize = 40; + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.data_block_index_type = + BlockBasedTableOptions::kDataBlockBinaryAndHash; + + Options options; + options.comparator = BytewiseComparator(); + + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + TableConstructor c(options.comparator); + + static Random rnd(1048); + for (int i = 0; i < kNumKeys; i++) { + // padding one "0" to mark existent keys. + std::string random_key(RandomString(&rnd, kKeySize - 1) + "1"); + InternalKey k(random_key, 0, kTypeValue); + c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize)); + } + + std::vector keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + + auto reader = c.GetTableReader(); + + std::unique_ptr seek_iter; + seek_iter.reset( + reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + for (int i = 0; i < 2; ++i) { + ReadOptions ro; + // for every kv, we seek using two method: Get() and Seek() + // Get() will use the SuffixIndexHash in Block. For non-existent key it + // will invalidate the iterator + // Seek() will use the default BinarySeek() in Block. So for non-existent + // key it will land at the closest key that is large than target. 
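Both table-format features exercised by the tests in this hunk, index value delta encoding via format_version 4 and the data-block hash index via kDataBlockBinaryAndHash, are configured through BlockBasedTableOptions. A minimal sketch of how an application might opt into them, assuming these options behave as the tests here suggest and using a hypothetical database path:

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    int main() {
      rocksdb::BlockBasedTableOptions table_options;
      // format_version 4 turns on value delta encoding in index blocks,
      // mirroring IndexBlockRestartIntervalTest above.
      table_options.format_version = 4;
      // Build the binary-and-hash index for data blocks, as in the
      // DataBlockHashIndex test; point lookups may then consult the hash
      // index while iterators keep using binary search.
      table_options.data_block_index_type =
          rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;

      rocksdb::Options options;
      options.create_if_missing = true;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));

      rocksdb::DB* db = nullptr;
      // "/tmp/hash_index_example" is a placeholder path for illustration.
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/hash_index_example", &db);
      if (s.ok()) {
        s = db->Put(rocksdb::WriteOptions(), "key", "value");
        delete db;
      }
      return s.ok() ? 0 : 1;
    }

Tables written with a higher format_version can only be read back by releases that understand it, so the setting is typically changed only after all readers have been upgraded.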
+ + // Search for existent keys + for (auto& kv : kvmap) { + if (i == 0) { + // Search using Seek() + seek_iter->Seek(kv.first); + ASSERT_OK(seek_iter->status()); + ASSERT_TRUE(seek_iter->Valid()); + ASSERT_EQ(seek_iter->key(), kv.first); + ASSERT_EQ(seek_iter->value(), kv.second); + } else { + // Search using Get() + PinnableSlice value; + std::string user_key = ExtractUserKey(kv.first).ToString(); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, nullptr); + ASSERT_OK(reader->Get(ro, kv.first, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, Slice(kv.second)); + value.Reset(); + } + } + + // Search for non-existent keys + for (auto& kv : kvmap) { + std::string user_key = ExtractUserKey(kv.first).ToString(); + user_key.back() = '0'; // make it non-existent key + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + if (i == 0) { // Search using Seek() + seek_iter->Seek(encoded_key); + ASSERT_OK(seek_iter->status()); + if (seek_iter->Valid()) { + ASSERT_TRUE(BytewiseComparator()->Compare( + user_key, ExtractUserKey(seek_iter->key())) < 0); + } + } else { // Search using Get() + PinnableSlice value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, nullptr); + ASSERT_OK(reader->Get(ro, encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + value.Reset(); + } + } + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff -Nru rocksdb-5.15.10/table/two_level_iterator.cc rocksdb-5.17.2/table/two_level_iterator.cc --- rocksdb-5.15.10/table/two_level_iterator.cc 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/two_level_iterator.cc 2018-11-12 19:57:32.000000000 +0000 @@ -19,12 +19,13 @@ namespace { -class TwoLevelIterator : public InternalIterator { +class TwoLevelIndexIterator : public InternalIteratorBase { public: - explicit TwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter); + explicit TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter); - virtual ~TwoLevelIterator() { + virtual ~TwoLevelIndexIterator() { first_level_iter_.DeleteIter(false /* is_arena_mode */); second_level_iter_.DeleteIter(false /* is_arena_mode */); delete state_; @@ -42,7 +43,7 @@ assert(Valid()); return second_level_iter_.key(); } - virtual Slice value() const override { + virtual BlockHandle value() const override { assert(Valid()); return second_level_iter_.value(); } @@ -68,23 +69,24 @@ } void SkipEmptyDataBlocksForward(); void SkipEmptyDataBlocksBackward(); - void SetSecondLevelIterator(InternalIterator* iter); + void SetSecondLevelIterator(InternalIteratorBase* iter); void InitDataBlock(); TwoLevelIteratorState* state_; - IteratorWrapper first_level_iter_; - IteratorWrapper second_level_iter_; // May be nullptr + IteratorWrapperBase first_level_iter_; + IteratorWrapperBase second_level_iter_; // May be nullptr Status status_; // If second_level_iter is non-nullptr, then "data_block_handle_" holds the // "index_value" passed to block_function_ to create the second_level_iter. 
- std::string data_block_handle_; + BlockHandle data_block_handle_; }; -TwoLevelIterator::TwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter) +TwoLevelIndexIterator::TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter) : state_(state), first_level_iter_(first_level_iter) {} -void TwoLevelIterator::Seek(const Slice& target) { +void TwoLevelIndexIterator::Seek(const Slice& target) { first_level_iter_.Seek(target); InitDataBlock(); @@ -94,7 +96,7 @@ SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::SeekForPrev(const Slice& target) { +void TwoLevelIndexIterator::SeekForPrev(const Slice& target) { first_level_iter_.Seek(target); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -112,7 +114,7 @@ } } -void TwoLevelIterator::SeekToFirst() { +void TwoLevelIndexIterator::SeekToFirst() { first_level_iter_.SeekToFirst(); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -121,7 +123,7 @@ SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::SeekToLast() { +void TwoLevelIndexIterator::SeekToLast() { first_level_iter_.SeekToLast(); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -130,19 +132,19 @@ SkipEmptyDataBlocksBackward(); } -void TwoLevelIterator::Next() { +void TwoLevelIndexIterator::Next() { assert(Valid()); second_level_iter_.Next(); SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::Prev() { +void TwoLevelIndexIterator::Prev() { assert(Valid()); second_level_iter_.Prev(); SkipEmptyDataBlocksBackward(); } -void TwoLevelIterator::SkipEmptyDataBlocksForward() { +void TwoLevelIndexIterator::SkipEmptyDataBlocksForward() { while (second_level_iter_.iter() == nullptr || (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { // Move to next block @@ -158,7 +160,7 @@ } } -void TwoLevelIterator::SkipEmptyDataBlocksBackward() { +void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() { while (second_level_iter_.iter() == nullptr || (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { // Move to next block @@ -174,24 +176,26 @@ } } -void TwoLevelIterator::SetSecondLevelIterator(InternalIterator* iter) { - InternalIterator* old_iter = second_level_iter_.Set(iter); +void TwoLevelIndexIterator::SetSecondLevelIterator( + InternalIteratorBase* iter) { + InternalIteratorBase* old_iter = second_level_iter_.Set(iter); delete old_iter; } -void TwoLevelIterator::InitDataBlock() { +void TwoLevelIndexIterator::InitDataBlock() { if (!first_level_iter_.Valid()) { SetSecondLevelIterator(nullptr); } else { - Slice handle = first_level_iter_.value(); + BlockHandle handle = first_level_iter_.value(); if (second_level_iter_.iter() != nullptr && !second_level_iter_.status().IsIncomplete() && - handle.compare(data_block_handle_) == 0) { + handle.offset() == data_block_handle_.offset()) { // second_level_iter is already constructed with this iterator, so // no need to change anything } else { - InternalIterator* iter = state_->NewSecondaryIterator(handle); - data_block_handle_.assign(handle.data(), handle.size()); + InternalIteratorBase* iter = + state_->NewSecondaryIterator(handle); + data_block_handle_ = handle; SetSecondLevelIterator(iter); } } @@ -199,8 +203,9 @@ } // namespace -InternalIterator* NewTwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter) { - return new TwoLevelIterator(state, first_level_iter); +InternalIteratorBase* NewTwoLevelIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter) { + 
return new TwoLevelIndexIterator(state, first_level_iter); } } // namespace rocksdb diff -Nru rocksdb-5.15.10/table/two_level_iterator.h rocksdb-5.17.2/table/two_level_iterator.h --- rocksdb-5.15.10/table/two_level_iterator.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/table/two_level_iterator.h 2018-11-12 19:57:32.000000000 +0000 @@ -22,7 +22,8 @@ TwoLevelIteratorState() {} virtual ~TwoLevelIteratorState() {} - virtual InternalIterator* NewSecondaryIterator(const Slice& handle) = 0; + virtual InternalIteratorBase* NewSecondaryIterator( + const BlockHandle& handle) = 0; }; @@ -36,7 +37,8 @@ // Uses a supplied function to convert an index_iter value into // an iterator over the contents of the corresponding block. // Note: this function expects first_level_iter was not created using the arena -extern InternalIterator* NewTwoLevelIterator( - TwoLevelIteratorState* state, InternalIterator* first_level_iter); +extern InternalIteratorBase* NewTwoLevelIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter); } // namespace rocksdb diff -Nru rocksdb-5.15.10/TARGETS rocksdb-5.17.2/TARGETS --- rocksdb-5.15.10/TARGETS 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/TARGETS 2018-11-12 19:57:32.000000000 +0000 @@ -1,3 +1,5 @@ +load("@fbcode_macros//build_defs:auto_headers.bzl", "AutoHeaders") + REPO_PATH = package_name() + "/" BUCK_BINS = "buck-out/gen/" + REPO_PATH @@ -171,6 +173,8 @@ "table/cuckoo_table_builder.cc", "table/cuckoo_table_factory.cc", "table/cuckoo_table_reader.cc", + "table/data_block_footer.cc", + "table/data_block_hash_index.cc", "table/flush_block_policy.cc", "table/format.cc", "table/full_filter_block.cc", @@ -218,18 +222,19 @@ "util/slice.cc", "util/sst_file_manager_impl.cc", "util/status.cc", - "util/status_message.cc", "util/string_util.cc", "util/sync_point.cc", "util/sync_point_impl.cc", "util/thread_local.cc", "util/threadpool_imp.cc", + "util/trace_replay.cc", "util/transaction_test_util.cc", "util/xxhash.cc", "utilities/backupable/backupable_db.cc", "utilities/blob_db/blob_compaction_filter.cc", "utilities/blob_db/blob_db.cc", "utilities/blob_db/blob_db_impl.cc", + "utilities/blob_db/blob_db_impl_filesnapshot.cc", "utilities/blob_db/blob_dump_tool.cc", "utilities/blob_db/blob_file.cc", "utilities/blob_db/blob_log_format.cc", @@ -269,6 +274,7 @@ "utilities/simulator_cache/sim_cache.cc", "utilities/spatialdb/spatial_db.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", + "utilities/trace/file_trace_reader_writer.cc", "utilities/transactions/optimistic_transaction.cc", "utilities/transactions/optimistic_transaction_db_impl.cc", "utilities/transactions/pessimistic_transaction.cc", @@ -286,7 +292,7 @@ "utilities/write_batch_with_index/write_batch_with_index.cc", "utilities/write_batch_with_index/write_batch_with_index_internal.cc", ], - headers = AutoHeaders.RECURSIVE_GLOB, + auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, preprocessor_flags = rocksdb_preprocessor_flags, @@ -299,6 +305,7 @@ srcs = [ "db/db_test_util.cc", "table/mock_table.cc", + "tools/trace_analyzer_tool.cc", "util/fault_injection_test_env.cc", "util/testharness.cc", "util/testutil.cc", @@ -307,7 +314,7 @@ "utilities/col_buf_encoder.cc", "utilities/column_aware_encoding_util.cc", ], - headers = AutoHeaders.RECURSIVE_GLOB, + auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = 
rocksdb_compiler_flags, preprocessor_flags = rocksdb_preprocessor_flags, @@ -319,9 +326,10 @@ name = "rocksdb_tools_lib", srcs = [ "tools/db_bench_tool.cc", + "tools/trace_analyzer_tool.cc", "util/testutil.cc", ], - headers = AutoHeaders.RECURSIVE_GLOB, + auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, preprocessor_flags = rocksdb_preprocessor_flags, @@ -332,7 +340,7 @@ cpp_library( name = "env_basic_test_lib", srcs = ["env/env_basic_test.cc"], - headers = AutoHeaders.RECURSIVE_GLOB, + auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, preprocessor_flags = rocksdb_preprocessor_flags, @@ -498,6 +506,11 @@ "serial", ], [ + "data_block_hash_index_test", + "table/data_block_hash_index_test.cc", + "serial", + ], + [ "date_tiered_test", "utilities/date_tiered/date_tiered_test.cc", "serial", @@ -923,6 +936,11 @@ "serial", ], [ + "repeatable_thread_test", + "util/repeatable_thread_test.cc", + "serial", + ], + [ "sim_cache_test", "utilities/simulator_cache/sim_cache_test.cc", "serial", @@ -983,6 +1001,11 @@ "serial", ], [ + "trace_analyzer_test", + "tools/trace_analyzer_test.cc", + "serial", + ], + [ "transaction_test", "utilities/transactions/transaction_test.cc", "parallel", @@ -1064,20 +1087,19 @@ ttype = "gtest" if test_cfg[2] == "parallel" else "simple" test_bin = test_name + "_bin" - cpp_binary ( - name = test_bin, - srcs = [test_cc], - deps = [":rocksdb_test_lib"], - preprocessor_flags = rocksdb_preprocessor_flags, - arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, - compiler_flags = rocksdb_compiler_flags, - external_deps = rocksdb_external_deps, + cpp_binary( + name = test_bin, + srcs = [test_cc], + arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, + compiler_flags = rocksdb_compiler_flags, + preprocessor_flags = rocksdb_preprocessor_flags, + deps = [":rocksdb_test_lib"], + external_deps = rocksdb_external_deps, ) custom_unittest( - name = test_name, - type = ttype, - deps = [":" + test_bin], - command = [TEST_RUNNER, BUCK_BINS + test_bin] + name = test_name, + command = [TEST_RUNNER, BUCK_BINS + test_bin], + type = ttype, + deps = [":" + test_bin], ) - diff -Nru rocksdb-5.15.10/third-party/fbson/FbsonDocument.h rocksdb-5.17.2/third-party/fbson/FbsonDocument.h --- rocksdb-5.15.10/third-party/fbson/FbsonDocument.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/third-party/fbson/FbsonDocument.h 2018-11-12 19:57:32.000000000 +0000 @@ -55,8 +55,7 @@ * @author Tian Xia */ -#ifndef FBSON_FBSONDOCUMENT_H -#define FBSON_FBSONDOCUMENT_H +#pragma once #include #include @@ -889,5 +888,3 @@ #pragma pack(pop) } // namespace fbson - -#endif // FBSON_FBSONDOCUMENT_H diff -Nru rocksdb-5.15.10/third-party/fbson/FbsonJsonParser.h rocksdb-5.17.2/third-party/fbson/FbsonJsonParser.h --- rocksdb-5.15.10/third-party/fbson/FbsonJsonParser.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/third-party/fbson/FbsonJsonParser.h 2018-11-12 19:57:32.000000000 +0000 @@ -47,8 +47,7 @@ * @author Tian Xia */ -#ifndef FBSON_FBSONPARSER_H -#define FBSON_FBSONPARSER_H +#pragma once #include #include @@ -741,5 +740,3 @@ typedef FbsonJsonParserT FbsonJsonParser; } // namespace fbson - -#endif // FBSON_FBSONPARSER_H diff -Nru rocksdb-5.15.10/third-party/fbson/FbsonStream.h rocksdb-5.17.2/third-party/fbson/FbsonStream.h --- rocksdb-5.15.10/third-party/fbson/FbsonStream.h 2018-09-13 17:25:20.000000000 +0000 +++ 
rocksdb-5.17.2/third-party/fbson/FbsonStream.h 2018-11-12 19:57:32.000000000 +0000 @@ -18,8 +18,7 @@ * @author Tian Xia */ -#ifndef FBSON_FBSONSTREAM_H -#define FBSON_FBSONSTREAM_H +#pragma once #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS @@ -178,5 +177,3 @@ }; } // namespace fbson - -#endif // FBSON_FBSONSTREAM_H diff -Nru rocksdb-5.15.10/third-party/fbson/FbsonUtil.h rocksdb-5.17.2/third-party/fbson/FbsonUtil.h --- rocksdb-5.15.10/third-party/fbson/FbsonUtil.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/third-party/fbson/FbsonUtil.h 2018-11-12 19:57:32.000000000 +0000 @@ -9,8 +9,7 @@ * @author Tian Xia */ -#ifndef FBSON_FBSONUTIL_H -#define FBSON_FBSONUTIL_H +#pragma once #include #include "FbsonDocument.h" @@ -159,5 +158,3 @@ }; } // namespace fbson - -#endif // FBSON_FBSONUTIL_H diff -Nru rocksdb-5.15.10/third-party/fbson/FbsonWriter.h rocksdb-5.17.2/third-party/fbson/FbsonWriter.h --- rocksdb-5.15.10/third-party/fbson/FbsonWriter.h 2018-09-13 17:25:20.000000000 +0000 +++ rocksdb-5.17.2/third-party/fbson/FbsonWriter.h 2018-11-12 19:57:32.000000000 +0000 @@ -25,8 +25,7 @@ * @author Tian Xia */ -#ifndef FBSON_FBSONWRITER_H -#define FBSON_FBSONWRITER_H +#pragma once #include #include "FbsonDocument.h" @@ -433,5 +432,3 @@ typedef FbsonWriterT FbsonWriter; } // namespace fbson - -#endif // FBSON_FBSONWRITER_H diff -Nru rocksdb-5.15.10/tools/advisor/advisor/bench_runner.py rocksdb-5.17.2/tools/advisor/advisor/bench_runner.py --- rocksdb-5.15.10/tools/advisor/advisor/bench_runner.py 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/tools/advisor/advisor/bench_runner.py 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,39 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +from abc import ABC, abstractmethod +import re + + +class BenchmarkRunner(ABC): + @staticmethod + @abstractmethod + def is_metric_better(new_metric, old_metric): + pass + + @abstractmethod + def run_experiment(self): + # should return a list of DataSource objects + pass + + @staticmethod + def get_info_log_file_name(log_dir, db_path): + # Example: DB Path = /dev/shm and OPTIONS file has option + # db_log_dir=/tmp/rocks/, then the name of the log file will be + # 'dev_shm_LOG' and its location will be /tmp/rocks. If db_log_dir is + # not specified in the OPTIONS file, then the location of the log file + # will be /dev/shm and the name of the file will be 'LOG' + file_name = '' + if log_dir: + # refer GetInfoLogPrefix() in rocksdb/util/filename.cc + # example db_path: /dev/shm/dbbench + file_name = db_path[1:] # to ignore the leading '/' character + to_be_replaced = re.compile('[^0-9a-zA-Z\-_\.]') + for character in to_be_replaced.findall(db_path): + file_name = file_name.replace(character, '_') + if not file_name.endswith('_'): + file_name += '_' + file_name += 'LOG' + return file_name diff -Nru rocksdb-5.15.10/tools/advisor/advisor/config_optimizer_example.py rocksdb-5.17.2/tools/advisor/advisor/config_optimizer_example.py --- rocksdb-5.15.10/tools/advisor/advisor/config_optimizer_example.py 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/tools/advisor/advisor/config_optimizer_example.py 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,134 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +import argparse +from advisor.db_config_optimizer import ConfigOptimizer +from advisor.db_log_parser import NO_COL_FAMILY +from advisor.db_options_parser import DatabaseOptions +from advisor.rule_parser import RulesSpec + + +CONFIG_OPT_NUM_ITER = 10 + + +def main(args): + # initialise the RulesSpec parser + rule_spec_parser = RulesSpec(args.rules_spec) + # initialise the benchmark runner + bench_runner_module = __import__( + args.benchrunner_module, fromlist=[args.benchrunner_class] + ) + bench_runner_class = getattr(bench_runner_module, args.benchrunner_class) + ods_args = {} + if args.ods_client and args.ods_entity: + ods_args['client_script'] = args.ods_client + ods_args['entity'] = args.ods_entity + if args.ods_key_prefix: + ods_args['key_prefix'] = args.ods_key_prefix + db_bench_runner = bench_runner_class(args.benchrunner_pos_args, ods_args) + # initialise the database configuration + db_options = DatabaseOptions(args.rocksdb_options, args.misc_options) + # set the frequency at which stats are dumped in the LOG file and the + # location of the LOG file. + db_log_dump_settings = { + "DBOptions.stats_dump_period_sec": { + NO_COL_FAMILY: args.stats_dump_period_sec + } + } + db_options.update_options(db_log_dump_settings) + # initialise the configuration optimizer + config_optimizer = ConfigOptimizer( + db_bench_runner, + db_options, + rule_spec_parser, + args.base_db_path + ) + # run the optimiser to improve the database configuration for given + # benchmarks, with the help of expert-specified rules + final_db_options = config_optimizer.run() + # generate the final rocksdb options file + print( + 'Final configuration in: ' + + final_db_options.generate_options_config('final') + ) + print( + 'Final miscellaneous options: ' + + repr(final_db_options.get_misc_options()) + ) + + +if __name__ == '__main__': + ''' + An example run of this tool from the command-line would look like: + python3 -m advisor.config_optimizer_example + --base_db_path=/tmp/rocksdbtest-155919/dbbench + --rocksdb_options=temp/OPTIONS_boot.tmp --misc_options bloom_bits=2 + --rules_spec=advisor/rules.ini --stats_dump_period_sec=20 + --benchrunner_module=advisor.db_bench_runner + --benchrunner_class=DBBenchRunner --benchrunner_pos_args ./../../db_bench + readwhilewriting use_existing_db=true duration=90 + ''' + parser = argparse.ArgumentParser(description='This script is used for\ + searching for a better database configuration') + parser.add_argument( + '--rocksdb_options', required=True, type=str, + help='path of the starting Rocksdb OPTIONS file' + ) + # these are options that are column-family agnostic and are not yet + # supported by the Rocksdb Options file: eg. bloom_bits=2 + parser.add_argument( + '--misc_options', nargs='*', + help='whitespace-separated list of options that are not supported ' + + 'by the Rocksdb OPTIONS file, given in the ' + + '= format eg. 
"bloom_bits=2 ' + + 'rate_limiter_bytes_per_sec=128000000"') + parser.add_argument( + '--base_db_path', required=True, type=str, + help='path for the Rocksdb database' + ) + parser.add_argument( + '--rules_spec', required=True, type=str, + help='path of the file containing the expert-specified Rules' + ) + parser.add_argument( + '--stats_dump_period_sec', required=True, type=int, + help='the frequency (in seconds) at which STATISTICS are printed to ' + + 'the Rocksdb LOG file' + ) + # ODS arguments + parser.add_argument( + '--ods_client', type=str, help='the ODS client binary' + ) + parser.add_argument( + '--ods_entity', type=str, + help='the servers for which the ODS stats need to be fetched' + ) + parser.add_argument( + '--ods_key_prefix', type=str, + help='the prefix that needs to be attached to the keys of time ' + + 'series to be fetched from ODS' + ) + # benchrunner_module example: advisor.db_benchmark_client + parser.add_argument( + '--benchrunner_module', required=True, type=str, + help='the module containing the BenchmarkRunner class to be used by ' + + 'the Optimizer, example: advisor.db_bench_runner' + ) + # benchrunner_class example: DBBenchRunner + parser.add_argument( + '--benchrunner_class', required=True, type=str, + help='the name of the BenchmarkRunner class to be used by the ' + + 'Optimizer, should be present in the module provided in the ' + + 'benchrunner_module argument, example: DBBenchRunner' + ) + parser.add_argument( + '--benchrunner_pos_args', nargs='*', + help='whitespace-separated positional arguments that are passed on ' + + 'to the constructor of the BenchmarkRunner class provided in the ' + + 'benchrunner_class argument, example: "use_existing_db=true ' + + 'duration=900"' + ) + args = parser.parse_args() + main(args) diff -Nru rocksdb-5.15.10/tools/advisor/advisor/db_bench_runner.py rocksdb-5.17.2/tools/advisor/advisor/db_bench_runner.py --- rocksdb-5.15.10/tools/advisor/advisor/db_bench_runner.py 1970-01-01 00:00:00.000000000 +0000 +++ rocksdb-5.17.2/tools/advisor/advisor/db_bench_runner.py 2018-11-12 19:57:32.000000000 +0000 @@ -0,0 +1,245 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +from advisor.bench_runner import BenchmarkRunner +from advisor.db_log_parser import DataSource, DatabaseLogs, NO_COL_FAMILY +from advisor.db_stats_fetcher import ( + LogStatsParser, OdsStatsFetcher, DatabasePerfContext +) +import shutil +import subprocess +import time + + +''' +NOTE: This is not thread-safe, because the output file is simply overwritten. 
+''' + + +class DBBenchRunner(BenchmarkRunner): + OUTPUT_FILE = "temp/dbbench_out.tmp" + ERROR_FILE = "temp/dbbench_err.tmp" + DB_PATH = "DB path" + THROUGHPUT = "ops/sec" + PERF_CON = " PERF_CONTEXT:" + + @staticmethod + def is_metric_better(new_metric, old_metric): + # for db_bench 'throughput' is the metric returned by run_experiment + return new_metric >= old_metric + + @staticmethod + def get_opt_args_str(misc_options_dict): + # given a dictionary of options and their values, return a string + # that can be appended as command-line arguments + optional_args_str = "" + for option_name, option_value in misc_options_dict.items(): + if option_value: + optional_args_str += ( + " --" + option_name + "=" + str(option_value) + ) + return optional_args_str + + def __init__(self, positional_args, ods_args=None): + # parse positional_args list appropriately + self.db_bench_binary = positional_args[0] + self.benchmark = positional_args[1] + self.db_bench_args = None + if len(positional_args) > 2: + # options list with each option given as "