diff -Nru libthrust-1.15.0/CHANGELOG.md libthrust-1.16.0/CHANGELOG.md --- libthrust-1.15.0/CHANGELOG.md 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/CHANGELOG.md 2022-02-09 20:04:17.000000000 +0000 @@ -1,4 +1,99 @@ -# Thrust 1.15.0 +# Thrust 1.16.0 + +## Summary + +Thrust 1.16.0 provides a new “nosync” hint for the CUDA backend, as well as +numerous bug fixes and stability improvements. + +### New `thrust::cuda::par_nosync` Execution Policy + +Most of Thrust’s parallel algorithms are fully synchronous and will block the +calling CPU thread until all work is completed. This design avoids many pitfalls +associated with asynchronous GPU programming, resulting in simpler and +less error-prone usage for new CUDA developers. Unfortunately, this improvement +in user experience comes at a performance cost that often frustrates more +experienced CUDA programmers. + +Prior to this release, the only synchronous-to-asynchronous migration path for +existing Thrust codebases involved significant refactoring, replacing calls +to `thrust` algorithms with a limited set of `future`-based `thrust::async` +algorithms or lower-level CUB kernels. The new `thrust::cuda::par_nosync` +execution policy provides a less-invasive entry point for asynchronous +computation. + +`par_nosync` is a hint to the Thrust execution engine that any non-essential +internal synchronizations should be skipped and that an explicit synchronization +will be performed by the caller before accessing results. + +While some Thrust algorithms require internal synchronization to safely compute +their results, many do not. For example, multiple `thrust::for_each` invocations +can be launched without waiting for earlier calls to complete: + +```cpp +// Queue three `for_each` kernels: +thrust::for_each(thrust::cuda::par_nosync, vec1.begin(), vec1.end(), Op{}); +thrust::for_each(thrust::cuda::par_nosync, vec2.begin(), vec2.end(), Op{}); +thrust::for_each(thrust::cuda::par_nosync, vec3.begin(), vec3.end(), Op{}); + +// Do other work while kernels execute: +do_something(); + +// Must explicitly synchronize before accessing `for_each` results: +cudaDeviceSynchronize(); +``` + +Thanks to @fkallen for this contribution. + +## Deprecation Notices + +### CUDA Dynamic Parallelism Support + +**A future version of Thrust will remove support for CUDA Dynamic Parallelism +(CDP).** + +This will only affect calls to Thrust algorithms made from CUDA device-side code +that currently launches a kernel; such calls will execute sequentially +on the calling GPU thread instead of launching a device-wide kernel. + +## Breaking Changes + +- Thrust 1.14.0 included a change that aliased the `cub` namespace + to `thrust::cub`. This has caused issues with ambiguous namespaces for + projects that declare `using namespace thrust;` from the global namespace. We + recommend against this practice. +- NVIDIA/thrust#1572: Removed several unnecessary header includes. Downstream + projects may need to update their includes if they were relying on this + behavior. + +## New Features + +- NVIDIA/thrust#1568: Add `thrust::cuda::par_nosync` policy. Thanks to @fkallen + for this contribution. + +## Enhancements + +- NVIDIA/thrust#1511: Use CUB’s new `DeviceMergeSort` API and remove Thrust’s + internal implementation. +- NVIDIA/thrust#1566: Improved performance of `thrust::shuffle`. Thanks to + @djns99 for this contribution. +- NVIDIA/thrust#1584: Support user-defined `CMAKE_INSTALL_INCLUDEDIR` values in + Thrust’s CMake install rules.
Thanks to @robertmaynard for this contribution. + +## Bug Fixes + +- NVIDIA/thrust#1496: Fix some issues affecting `icc` builds. +- NVIDIA/thrust#1552: Fix some collisions with the `min`/`max` macros defined + in `windows.h`. +- NVIDIA/thrust#1582: Fix issue with function type alias on 32-bit MSVC builds. +- NVIDIA/thrust#1591: Workaround issue affecting compilation with `nvc++`. +- NVIDIA/thrust#1597: Fix some collisions with the `small` macro defined + in `windows.h`. +- NVIDIA/thrust#1599, NVIDIA/thrust#1603: Fix some issues with version handling + in Thrust’s CMake packages. +- NVIDIA/thrust#1614: Clarify that scan algorithm results are non-deterministic + for pseudo-associative operators (e.g. floating-point addition). + +# Thrust 1.15.0 (NVIDIA HPC SDK 22.1, CUDA Toolkit 11.6) ## Summary @@ -51,6 +146,13 @@ `thrust::iterator_traits` is better integrated with `std::iterator_traits`. See below for more details and references. +## Breaking Changes + +- Thrust 1.14.0 included a change that aliased the `cub` namespace + to `thrust::cub`. This has caused issues with ambiguous namespaces for + projects that declare `using namespace thrust;` from the global namespace. We + recommend against this practice. + ## New Features - NVIDIA/thrust#1464: Add preprocessor hooks that allow `thrust::` to be wrapped diff -Nru libthrust-1.15.0/ci/axis/cpu.yml libthrust-1.16.0/ci/axis/cpu.yml --- libthrust-1.15.0/ci/axis/cpu.yml 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/ci/axis/cpu.yml 2022-02-09 20:04:17.000000000 +0000 @@ -8,8 +8,8 @@ - nvhpc SDK_VER: - - 11.3.1-devel - - 21.5-devel-cuda11.3 + - 11.5.1-devel + - 22.1-devel-cuda11.5 OS_TYPE: - ubuntu @@ -31,7 +31,8 @@ - 9 - 10 - 11 - - 21.5 + - 12 + - 22.1 - latest exclude: @@ -46,9 +47,9 @@ SDK_TYPE: cuda # Excludes by `SDK_VER`. - SDK_TYPE: cuda - SDK_VER: 21.5-devel-cuda11.3 + SDK_VER: 22.1-devel-cuda11.5 - SDK_TYPE: nvhpc - SDK_VER: 11.3.1-devel + SDK_VER: 11.5.1-devel # Excludes by `CXX_VER`. - CXX_TYPE: nvcxx CXX_VER: 5 @@ -65,11 +66,13 @@ - CXX_TYPE: nvcxx CXX_VER: 11 - CXX_TYPE: nvcxx + CXX_VER: 12 + - CXX_TYPE: nvcxx CXX_VER: latest - CXX_TYPE: gcc - CXX_VER: 11 + CXX_VER: 12 - CXX_TYPE: gcc - CXX_VER: 21.5 + CXX_VER: 22.1 - CXX_TYPE: gcc CXX_VER: latest - CXX_TYPE: clang @@ -77,7 +80,7 @@ - CXX_TYPE: clang CXX_VER: 6 - CXX_TYPE: clang - CXX_VER: 21.5 + CXX_VER: 22.1 - CXX_TYPE: clang CXX_VER: latest - CXX_TYPE: icc @@ -95,5 +98,7 @@ - CXX_TYPE: icc CXX_VER: 11 - CXX_TYPE: icc - CXX_VER: 21.5 + CXX_VER: 12 + - CXX_TYPE: icc + CXX_VER: 22.1 diff -Nru libthrust-1.15.0/ci/axis/gpu.yml libthrust-1.16.0/ci/axis/gpu.yml --- libthrust-1.15.0/ci/axis/gpu.yml 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/ci/axis/gpu.yml 2022-02-09 20:04:17.000000000 +0000 @@ -7,7 +7,7 @@ - cuda SDK_VER: - - 11.3.1-devel + - 11.5.1-devel OS_TYPE: - ubuntu @@ -19,5 +19,4 @@ - gcc CXX_VER: - - 7 - + - 9 diff -Nru libthrust-1.15.0/ci/common/build.bash libthrust-1.16.0/ci/common/build.bash --- libthrust-1.15.0/ci/common/build.bash 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/ci/common/build.bash 2022-02-09 20:04:17.000000000 +0000 @@ -1,6 +1,6 @@ #! /usr/bin/env bash -# Copyright (c) 2018-2020 NVIDIA Corporation +# Copyright (c) 2018-2022 NVIDIA Corporation # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # Released under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
@@ -9,7 +9,7 @@ # Thrust and CUB build script for gpuCI ################################################################################ -set -e +set -e # Stop on errors. # append variable value # Appends ${value} to ${variable}, adding a space before ${value} if @@ -32,6 +32,13 @@ printf "%s\n\n" "${*}" } +# echo_and_run name args... +# Echo ${args[@]}, then execute ${args[@]} +function echo_and_run { + echo "${1}: ${@:2}" + ${@:2} +} + # echo_and_run_timed name args... # Echo ${args[@]}, then execute ${args[@]} and report how long it took, # including ${name} in the output of the time. @@ -58,19 +65,27 @@ ################################################################################ # Get the variables the Docker container set up for us: ${CXX}, ${CUDACXX}, etc. +set +e # Don't stop on errors from /etc/cccl.bashrc. source /etc/cccl.bashrc +set -e # Stop on errors. -# Set path and build parallel level +# Set path. export PATH=/usr/local/cuda/bin:${PATH} # Set home to the job's workspace. export HOME=${WORKSPACE} +# Per-process memory util logs: +MEMMON_LOG=${WORKSPACE}/build/memmon_log + # Switch to the build directory. cd ${WORKSPACE} mkdir -p build cd build +# Remove any old .ninja_log file so the PrintNinjaBuildTimes step is accurate: +rm -f .ninja_log + if [[ -z "${CMAKE_BUILD_TYPE}" ]]; then CMAKE_BUILD_TYPE="Release" fi @@ -109,8 +124,29 @@ append CMAKE_BUILD_FLAGS "-k0" fi +DETERMINE_PARALLELISM_FLAGS="" + +# Used to limit the default number of build threads. Any build/link +# steps that exceed this limit will cause this script to report a +# failure. Tune this using the memmon logs printed after each run. +# +# Build steps that take more memory than this limit should +# be split into multiple steps/translation units. Any temporary +# increases to this threshold should be reverted ASAP. The goal +# is to decrease this as much as possible and not increase it. +if [[ -z "${MIN_MEMORY_PER_THREAD}" ]]; then + if [[ "${CXX_TYPE}" == "nvcxx" ]]; then + MIN_MEMORY_PER_THREAD=3.0 # GiB + elif [[ "${CXX_TYPE}" == "icc" ]]; then + MIN_MEMORY_PER_THREAD=2.5 # GiB + else + MIN_MEMORY_PER_THREAD=2.0 # GiB + fi +fi +append DETERMINE_PARALLELISM_FLAGS "--min-memory-per-thread ${MIN_MEMORY_PER_THREAD}" + if [[ -n "${PARALLEL_LEVEL}" ]]; then - DETERMINE_PARALLELISM_FLAGS="-j ${PARALLEL_LEVEL}" + append DETERMINE_PARALLELISM_FLAGS "-j ${PARALLEL_LEVEL}" fi # COVERAGE_PLAN options: @@ -148,6 +184,9 @@ append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_WORKLOAD=LARGE" ;; Thorough) + # Build the legacy bench.cu. We'll probably want to remove this when we + # switch to the new, heavier thrust_benchmarks project. + append CMAKE_FLAGS "-DTHRUST_ENABLE_BENCHMARKS=ON" append CMAKE_FLAGS "-DTHRUST_ENABLE_MULTICONFIG=ON" append CMAKE_FLAGS "-DTHRUST_IGNORE_DEPRECATED_CPP_11=ON" append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_DIALECT_ALL=ON" @@ -175,9 +214,6 @@ append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA=ON" append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_WORKLOAD=SMALL" append CMAKE_FLAGS "-DTHRUST_INCLUDE_CUB_CMAKE=ON" - append CMAKE_FLAGS "-DCUB_ENABLE_THOROUGH_TESTING=OFF" - append CMAKE_FLAGS "-DCUB_ENABLE_BENCHMARK_TESTING=OFF" - append CMAKE_FLAGS "-DCUB_ENABLE_MINIMAL_TESTING=ON" append CMAKE_FLAGS "-DTHRUST_AUTO_DETECT_COMPUTE_ARCHS=ON" if [[ "${BUILD_TYPE}" == "cpu" ]] && [[ "${CXX_TYPE}" == "nvcxx" ]]; then # If no GPU is automatically detected, NVC++ insists that you explicitly @@ -229,7 +265,7 @@ log "Get environment..." -env +env | sort log "Check versions..." 
@@ -244,6 +280,10 @@ echo +cmake --version 2>&1 | sed -Ez '$ s/\n*$/\n/' + +echo + if [[ "${BUILD_TYPE}" == "gpu" ]]; then nvidia-smi 2>&1 | sed -Ez '$ s/\n*$/\n/' fi @@ -254,10 +294,7 @@ log "Configure Thrust and CUB..." -# Clear out any stale CMake configs: -rm -rf CMakeCache.txt CMakeFiles/ - -echo_and_run_timed "Configure" cmake .. ${CMAKE_FLAGS} +echo_and_run_timed "Configure" cmake .. --log-level=VERBOSE ${CMAKE_FLAGS} configure_status=$? log "Build Thrust and CUB..." @@ -265,8 +302,22 @@ # ${PARALLEL_LEVEL} needs to be passed after we run # determine_build_parallelism.bash, so it can't be part of ${CMAKE_BUILD_FLAGS}. set +e # Don't stop on build failures. + +# Monitor memory usage. Thresholds in GiB: +python3 ${WORKSPACE}/ci/common/memmon.py \ + --log-threshold 0.0 \ + --fail-threshold ${MIN_MEMORY_PER_THREAD} \ + --log-file ${MEMMON_LOG} \ + & +memmon_pid=$! + echo_and_run_timed "Build" cmake --build . ${CMAKE_BUILD_FLAGS} -j ${PARALLEL_LEVEL} build_status=$? + +# Stop memmon: +kill -s SIGINT ${memmon_pid} + +# Re-enable exit on failure: set -e ################################################################################ @@ -275,21 +326,68 @@ log "Test Thrust and CUB..." -echo_and_run_timed "Test" ctest ${CTEST_FLAGS} +( + # Make sure test_status captures ctest, not tee: + # https://stackoverflow.com/a/999259/11130318 + set -o pipefail + echo_and_run_timed "Test" ctest ${CTEST_FLAGS} | tee ctest_log +) test_status=$? ################################################################################ +# COMPILE TIME INFO: Print the 20 longest running build steps (ninja only) +################################################################################ + +if [[ -f ".ninja_log" ]]; then + log "Checking slowest build steps:" + echo_and_run "CompileTimeInfo" cmake -P ../cmake/PrintNinjaBuildTimes.cmake | head -n 23 +fi + +################################################################################ +# RUNTIME INFO: Print the 20 longest running test steps +################################################################################ + +if [[ -f "ctest_log" ]]; then + log "Checking slowest test steps:" + echo_and_run "TestTimeInfo" cmake -DLOGFILE=ctest_log -P ../cmake/PrintCTestRunTimes.cmake | head -n 20 +fi + +################################################################################ +# MEMORY_USAGE +################################################################################ + +memmon_status=0 +if [[ -f "${MEMMON_LOG}" ]]; then + log "Checking memmon logfile: ${MEMMON_LOG}" + + if [[ -n "$(grep -E "^FAIL" ${MEMMON_LOG})" ]]; then + log "error: Some build steps exceeded MIN_MEMORY_PER_THREAD (${MIN_MEMORY_PER_THREAD} GiB):" + grep -E "^FAIL" ${MEMMON_LOG} + memmon_status=1 + else + log "Top memory usage per build step (all less than limit of ${MIN_MEMORY_PER_THREAD} GiB):" + if [[ -s ${MEMMON_LOG} ]]; then + # Not empty: + head -n5 ${MEMMON_LOG} + else + echo "None detected above logging threshold." + fi + fi +fi + +################################################################################ # SUMMARY - Print status of each step and exit with failure if needed. 
################################################################################ log "Summary:" -log "- Configure Error Code: ${configure_status}" -log "- Build Error Code: ${build_status}" -log "- Test Error Code: ${test_status}" - +echo "- Configure Error Code: ${configure_status}" +echo "- Build Error Code: ${build_status}" +echo "- Build Memory Check: ${memmon_status}" +echo "- Test Error Code: ${test_status}" if [[ "${configure_status}" != "0" ]] || \ [[ "${build_status}" != "0" ]] || \ + [[ "${memmon_status}" != "0" ]] || \ [[ "${test_status}" != "0" ]]; then exit 1 fi diff -Nru libthrust-1.15.0/ci/common/memmon_config/procps/toprc libthrust-1.16.0/ci/common/memmon_config/procps/toprc --- libthrust-1.15.0/ci/common/memmon_config/procps/toprc 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/ci/common/memmon_config/procps/toprc 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,16 @@ +top's Config File (Linux processes with windows) +Id:i, Mode_altscr=0, Mode_irixps=1, Delay_time=3.0, Curwin=0 +Def fieldscur=%(34;@D7:9&')*+,-./012568<>?ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij + winflags=193972, sortindx=18, maxtasks=0, graph_cpus=0, graph_mems=0 + summclr=1, msgsclr=1, headclr=3, taskclr=1 +Job fieldscur=(Ļ@<)*+,-./012568>?ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij + winflags=193844, sortindx=0, maxtasks=0, graph_cpus=0, graph_mems=0 + summclr=6, msgsclr=6, headclr=7, taskclr=6 +Mem fieldscur=?@ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij + winflags=193844, sortindx=3, maxtasks=0, graph_cpus=0, graph_mems=0 + summclr=3, msgsclr=3, headclr=2, taskclr=3 +Fixed_widest=0, Summ_mscale=1, Task_mscale=0, Zero_suppress=0 + diff -Nru libthrust-1.15.0/ci/common/memmon.py libthrust-1.16.0/ci/common/memmon.py --- libthrust-1.15.0/ci/common/memmon.py 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/ci/common/memmon.py 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,99 @@ +#! /usr/bin/env python + +# Copyright (c) 2022 NVIDIA Corporation +# Reply-To: Allison Vacanti +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# Released under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. + +help_text = """%(prog)s [reference.json compare.json | reference_dir/ compare_dir/] + +This script: + +1. Runs `top -bco RES`, continuously extracting the memory usage of each process. +2. If a process uses more than `log_threshold` GiB and exceeds any other recorded + entry for the process, it is stored in `entries`. +3. 
When this script receives SIGINT, it writes two files: + * `log_file` will contain all recorded max-memory-per-process entries + * `fail_file` will contain all entries that exceed `fail_threshold` +""" + +import argparse +import os +import re +import signal +import sys + +from subprocess import Popen, PIPE, STDOUT + +parser = argparse.ArgumentParser(prog='memmon.py', usage=help_text) +parser.add_argument('--log-threshold', type=float, dest='log_threshold', + default=0.5, + help='Logging threshold in GiB.') +parser.add_argument('--fail-threshold', type=float, dest='fail_threshold', + default=2, + help='Failure threshold in GiB.') +parser.add_argument('--log-file', type=str, dest='log_file', default='memmon_log', + help='Output file for log entries.') +args, unused = parser.parse_known_args() + +entries = {} + +def signal_handler(sig, frame): + # Sort by mem: + sortentries = sorted(entries.items(), key=lambda x:x[1], reverse=True) + + lf = open(args.log_file, "w") + + for com, mem in sortentries: + status="PASS" + if mem >= args.fail_threshold: + status="FAIL" + line = "%4s | %3.1f GiB | %s\n"%(status, mem, com) + lf.write(line) + + lf.close() + sys.exit(0) + +signal.signal(signal.SIGINT, signal_handler) + +# Find the toprc config file and configure top's env. +# This config: +# - Hides all columns except for RES and COMMAND +# - Sorts by RES +# - Enables long command strings (-c) +script_dir = os.path.dirname(os.path.realpath(__file__)) +config_dir = os.path.join(script_dir, 'memmon_config') + +proc = Popen(["top", "-b", "-w", "512"], + stdin=PIPE, stdout=PIPE, stderr=STDOUT, + env={"XDG_CONFIG_HOME": config_dir}) + +regex = re.compile("^\\s*([0-9.]+[kmgtp]?)\\s+(.+)\\s*$") + +# Convert a memory string from top into floating point GiB +def parse_mem(mem_str): + if mem_str[-1] == "k": + return float(mem_str[:-1]) / (1024 * 1024) + elif mem_str[-1] == "m": + return float(mem_str[:-1]) / (1024) + elif mem_str[-1] == "g": + return float(mem_str[:-1]) + elif mem_str[-1] == "t": + return float(mem_str[:-1]) * 1024 + elif mem_str[-1] == "p": # please no + return float(mem_str[:-1]) * 1024 * 1024 + # bytes: + return float(mem_str) / (1024 * 1024 * 1024) + +for line in proc.stdout: + line = line.decode() + match = regex.match(line) + if match: + mem = parse_mem(match.group(1)) + if mem < args.log_threshold and mem < args.fail_threshold: + continue + com = match.group(2) + if com in entries and entries[com] > mem: + continue + entries[com] = mem diff -Nru libthrust-1.15.0/ci/cpu/build.bash libthrust-1.16.0/ci/cpu/build.bash --- libthrust-1.15.0/ci/cpu/build.bash 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/ci/cpu/build.bash 2022-02-09 20:04:17.000000000 +0000 @@ -12,4 +12,3 @@ export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} source ${WORKSPACE}/ci/common/build.bash - diff -Nru libthrust-1.15.0/ci/local/build.bash libthrust-1.16.0/ci/local/build.bash --- libthrust-1.15.0/ci/local/build.bash 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/ci/local/build.bash 2022-02-09 20:04:17.000000000 +0000 @@ -60,7 +60,7 @@ # FLAGS - Process command line flags. 
################################################################################ -IMAGE="gpuci/cccl:cuda11.3.1-devel-ubuntu20.04-gcc7" +IMAGE="gpuci/cccl:cuda11.5.1-devel-ubuntu20.04-gcc9" LOCAL_IMAGE=0 @@ -195,8 +195,8 @@ NVIDIA_DOCKER_INSTALLED=$(docker info 2>&1 | grep -i runtime | grep -c nvidia) if [[ "${NVIDIA_DOCKER_INSTALLED}" == 0 ]]; then - echo "NVIDIA Docker not found, please install it: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-docker-ce" - exit -4 + echo "NVIDIA Docker not found, the build may fail." + echo "Please install it if you encounter issues: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-docker-ce" fi if [[ "${LOCAL_IMAGE}" == 0 ]]; then diff -Nru libthrust-1.15.0/cmake/header_test.in libthrust-1.16.0/cmake/header_test.in --- libthrust-1.15.0/cmake/header_test.in 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/cmake/header_test.in 2022-02-09 20:04:17.000000000 +0000 @@ -15,7 +15,7 @@ // Hacky way to build a string, but it works on all tested platforms. #define THRUST_MACRO_CHECK(MACRO, HEADER) \ THRUST_MACRO_CHECK_IMPL(Identifier MACRO should not be used from Thrust \ - headers due to conflicts with HEADER.) + headers due to conflicts with HEADER macros.) // Use raw platform checks instead of the THRUST_HOST_COMPILER macros since we // don't want to #include any headers other than the one being tested. @@ -45,11 +45,17 @@ #define I THRUST_MACRO_CHECK('I', complex.h) // windows.h conflicts -// Disabling for now; we use min/max in many places, but since most -// projects build with NOMINMAX this doesn't seem to be high priority to fix. +#define small THRUST_MACRO_CHECK('small', windows.h) +// We can't enable these checks without breaking some builds -- some standard +// library implementations unconditionally `#undef` these macros, which then +// causes random failures later. +// Leaving these commented out as a warning: Here be dragons. //#define min(...) THRUST_MACRO_CHECK('min', windows.h) //#define max(...) THRUST_MACRO_CHECK('max', windows.h) +// termios.h conflicts (NVIDIA/thrust#1547) +#define B0 THRUST_MACRO_CHECK("B0", termios.h) + #endif // THRUST_IGNORE_MACRO_CHECKS #include diff -Nru libthrust-1.15.0/cmake/PrintCTestRunTimes.cmake libthrust-1.16.0/cmake/PrintCTestRunTimes.cmake --- libthrust-1.15.0/cmake/PrintCTestRunTimes.cmake 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/cmake/PrintCTestRunTimes.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,109 @@ +## This CMake script parses the output of ctest and prints a formatted list +## of individual test runtimes, sorted longest first. 
+## +## ctest > ctest_log +## cmake -DLOGFILE=ctest_log \ +## -P PrintCTestRunTimes.cmake +## +################################################################################ + +cmake_minimum_required(VERSION 3.15) + +# Prepend the string with "0" until the string length equals the specified width +function(pad_string_with_zeros string_var width) + set(local_string "${${string_var}}") + string(LENGTH "${local_string}" size) + while(size LESS width) + string(PREPEND local_string "0") + string(LENGTH "${local_string}" size) + endwhile() + set(${string_var} "${local_string}" PARENT_SCOPE) +endfunction() + +################################################################################ + +if (NOT LOGFILE) + message(FATAL_ERROR "Missing -DLOGFILE= argument.") +endif() + +# Check if logfile exists +if (NOT EXISTS "${LOGFILE}") + message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').") +endif() + +string(JOIN "" regex + "^[ ]*[0-9]+/[0-9]+[ ]+Test[ ]+#" + "([0-9]+)" # Test ID + ":[ ]+" + "(.+)" # Test Name + "[ ]+\\.+[ ]+" + "(.+[^ ])" # Result + "[ ]+" + "([0-9]+)" # Seconds + "\\.[0-9]+[ ]+sec[ ]*$" +) + +message(DEBUG "Regex: ${regex}") + +# Read the logfile and generate a map / keylist +set(keys) +file(STRINGS "${LOGFILE}" lines) +foreach(line ${lines}) + + # Parse each build time + string(REGEX MATCH "${regex}" _DUMMY "${line}") + + if (CMAKE_MATCH_COUNT EQUAL 4) + set(test_id "${CMAKE_MATCH_1}") + set(test_name "${CMAKE_MATCH_2}") + set(test_result "${CMAKE_MATCH_3}") + set(tmp "${CMAKE_MATCH_4}") # floor(runtime_seconds) + + # Compute human readable time + math(EXPR days "${tmp} / (60 * 60 * 24)") + math(EXPR tmp "${tmp} - (${days} * 60 * 60 * 24)") + math(EXPR hours "${tmp} / (60 * 60)") + math(EXPR tmp "${tmp} - (${hours} * 60 * 60)") + math(EXPR minutes "${tmp} / (60)") + math(EXPR tmp "${tmp} - (${minutes} * 60)") + math(EXPR seconds "${tmp}") + + # Format time components + pad_string_with_zeros(days 3) + pad_string_with_zeros(hours 2) + pad_string_with_zeros(minutes 2) + pad_string_with_zeros(seconds 2) + + # Construct table entry + # Later values in the file for the same command overwrite earlier entries + string(MAKE_C_IDENTIFIER "${test_id}" key) + string(JOIN " | " ENTRY_${key} + "${days}d ${hours}h ${minutes}m ${seconds}s" + "${test_result}" + "${test_id}: ${test_name}" + ) + + # Record the key: + list(APPEND keys "${key}") + endif() +endforeach() + +list(REMOVE_DUPLICATES keys) + +# Build the entry list: +set(entries) +foreach(key ${keys}) + list(APPEND entries "${ENTRY_${key}}") +endforeach() + +if (NOT entries) + message(FATAL_ERROR "LOGFILE contained no test times ('${LOGFILE}').") +endif() + +# Sort in descending order: +list(SORT entries ORDER DESCENDING) + +# Dump table: +foreach(entry ${entries}) + message(STATUS ${entry}) +endforeach() diff -Nru libthrust-1.15.0/cmake/ThrustBuildTargetList.cmake libthrust-1.16.0/cmake/ThrustBuildTargetList.cmake --- libthrust-1.15.0/cmake/ThrustBuildTargetList.cmake 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/cmake/ThrustBuildTargetList.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -313,10 +313,7 @@ add_flag_option(IGNORE_DEPRECATED_CPP_11 "Don't warn about deprecated C++11." OFF) add_flag_option(IGNORE_DEPRECATED_COMPILER "Don't warn about deprecated compilers." OFF) add_flag_option(IGNORE_CUB_VERSION_CHECK "Don't warn about mismatched CUB versions." OFF) - - # By default, suppress deprecation warnings when building our test suite, - ## since we'll need to test deprecated APIs with `-Werror`. 
- add_flag_option(IGNORE_DEPRECATED_API "Don't warn about deprecated Thrust or CUB APIs." ON) + add_flag_option(IGNORE_DEPRECATED_API "Don't warn about deprecated Thrust or CUB APIs." OFF) # Top level meta-target. Makes it easier to just build thrust targets when # building both CUB and Thrust. Add all project files here so IDEs will be diff -Nru libthrust-1.15.0/cmake/ThrustInstallRules.cmake libthrust-1.16.0/cmake/ThrustInstallRules.cmake --- libthrust-1.15.0/cmake/ThrustInstallRules.cmake 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/cmake/ThrustInstallRules.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -5,7 +5,7 @@ set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE) install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust" - TYPE INCLUDE + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" FILES_MATCHING PATTERN "*.h" PATTERN "*.inl" @@ -13,7 +13,15 @@ install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust/cmake/" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/thrust" + PATTERN thrust-header-search EXCLUDE ) +# Need to configure a file to store the infix specified in +# CMAKE_INSTALL_INCLUDEDIR since it can be defined by the user +configure_file("${Thrust_SOURCE_DIR}/thrust/cmake/thrust-header-search.cmake.in" + "${Thrust_BINARY_DIR}/thrust/cmake/thrust-header-search.cmake" + @ONLY) +install(FILES "${Thrust_BINARY_DIR}/thrust/cmake/thrust-header-search.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/thrust") # Depending on how Thrust is configured, CUB's CMake scripts may or may not be # included, so maintain a set of CUB install rules in both projects. By default @@ -22,12 +30,19 @@ option(THRUST_INSTALL_CUB_HEADERS "Include cub headers when installing." ON) if (THRUST_INSTALL_CUB_HEADERS) install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub" - TYPE INCLUDE + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" FILES_MATCHING PATTERN "*.cuh" ) + # Need to configure a file to store THRUST_INSTALL_HEADER_INFIX install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake/" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cub" + PATTERN cub-header-search EXCLUDE ) + configure_file("${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake/cub-header-search.cmake.in" + "${Thrust_BINARY_DIR}/dependencies/cub/cub/cmake/cub-header-search.cmake" + @ONLY) + install(FILES "${Thrust_BINARY_DIR}/dependencies/cub/cub/cmake/cub-header-search.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cub") endif() diff -Nru libthrust-1.15.0/CMakeLists.txt libthrust-1.16.0/CMakeLists.txt --- libthrust-1.15.0/CMakeLists.txt 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/CMakeLists.txt 2022-02-09 20:04:17.000000000 +0000 @@ -49,14 +49,21 @@ option(THRUST_ENABLE_HEADER_TESTING "Test that all public headers compile." "ON") option(THRUST_ENABLE_TESTING "Build Thrust testing suite." "ON") option(THRUST_ENABLE_EXAMPLES "Build Thrust examples." "ON") +option(THRUST_ENABLE_BENCHMARKS "Build Thrust runtime benchmarks." "OFF") option(THRUST_INCLUDE_CUB_CMAKE "Build CUB tests and examples. (Requires CUDA)." "OFF") +# Mark this option as advanced for now. We'll revisit this later once the new +# benchmarks are ready. For now, we just need to expose a way to compile +# bench.cu from CMake for NVIDIA's internal builds. +mark_as_advanced(THRUST_ENABLE_BENCHMARKS) + # Check if we're actually building anything before continuing. If not, no need # to search for deps, etc. This is a common approach for packagers that just # need the install rules. See GH issue NVIDIA/thrust#1211. 
if (NOT (THRUST_ENABLE_HEADER_TESTING OR THRUST_ENABLE_TESTING OR THRUST_ENABLE_EXAMPLES OR + THRUST_ENABLE_BENCHMARKS OR THRUST_INCLUDE_CUB_CMAKE)) return() endif() @@ -118,6 +125,10 @@ add_subdirectory(examples) endif() +if (THRUST_ENABLE_BENCHMARKS) + add_subdirectory(internal/benchmark) +endif() + if (THRUST_INCLUDE_CUB_CMAKE AND THRUST_CUDA_FOUND) set(CUB_IN_THRUST ON) # CUB's path is specified generically to support both GitHub and Perforce diff -Nru libthrust-1.15.0/CODE_OF_CONDUCT.md libthrust-1.16.0/CODE_OF_CONDUCT.md --- libthrust-1.15.0/CODE_OF_CONDUCT.md 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/CODE_OF_CONDUCT.md 2022-02-09 20:04:17.000000000 +0000 @@ -93,4 +93,4 @@ [FAQ]: https://www.contributor-covenant.org/faq [NVIDIA RAPIDS]: https://docs.rapids.ai/resources/conduct/ -[Contributor Covenant]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html +[Contributor Covenant version 1.4]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html diff -Nru libthrust-1.15.0/debian/changelog libthrust-1.16.0/debian/changelog --- libthrust-1.15.0/debian/changelog 2022-06-05 21:19:51.000000000 +0000 +++ libthrust-1.16.0/debian/changelog 2022-06-19 18:27:00.000000000 +0000 @@ -1,3 +1,9 @@ +libthrust (1.16.0-1) unstable; urgency=medium + + * New upstream release 1.16.0. + + -- Andreas Beckmann Sun, 19 Jun 2022 20:27:00 +0200 + libthrust (1.15.0-6) unstable; urgency=medium * Disable cuda tests on !amd64, they always run into timeout. diff -Nru libthrust-1.15.0/debian/rules libthrust-1.16.0/debian/rules --- libthrust-1.15.0/debian/rules 2022-06-05 21:19:51.000000000 +0000 +++ libthrust-1.16.0/debian/rules 2022-06-19 18:27:00.000000000 +0000 @@ -20,6 +20,7 @@ dh_auto_install --destdir=debian/tmp execute_after_dh_auto_install: + $(RM) -v debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/cmake/thrust/thrust-header-search.cmake.in # remove empty directories find debian/tmp -depth -type d -exec sh -c 'd="{}"; rmdir --ignore-fail-on-non-empty "$$d" ; test -d "$$d" || echo "removed $$d"' \; diff -Nru libthrust-1.15.0/debian/tests/upstream-testsuite libthrust-1.16.0/debian/tests/upstream-testsuite --- libthrust-1.15.0/debian/tests/upstream-testsuite 2022-06-05 21:19:51.000000000 +0000 +++ libthrust-1.16.0/debian/tests/upstream-testsuite 2022-06-19 18:27:00.000000000 +0000 @@ -61,6 +61,7 @@ ${host:+-DTHRUST_HOST_SYSTEM_OPTIONS=$host} \ -DTHRUST_DEVICE_SYSTEM_OPTIONS=$device \ -D_THRUST_CMAKE_DIR=/usr/share/cmake/thrust \ + -DTHRUST_INSTALL_CUB_HEADERS=OFF \ -Wno-dev cd $AUTOPKGTEST_TMP diff -Nru libthrust-1.15.0/.gitignore libthrust-1.16.0/.gitignore --- libthrust-1.15.0/.gitignore 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/.gitignore 2022-02-09 20:04:17.000000000 +0000 @@ -2,3 +2,5 @@ .p4config doc/html discrete_voronoi.pgm +*build*/ +.idea/ diff -Nru libthrust-1.15.0/internal/benchmark/bench.cu libthrust-1.16.0/internal/benchmark/bench.cu --- libthrust-1.15.0/internal/benchmark/bench.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/internal/benchmark/bench.cu 2022-02-09 20:04:17.000000000 +0000 @@ -992,7 +992,6 @@ , RegularTrials >::run_experiment(); -#if THRUST_CPP_DIALECT >= 2011 experiment_driver< shuffle_tester , ElementMetaType @@ -1000,7 +999,6 @@ , BaselineTrials , RegularTrials >::run_experiment(); -#endif } /////////////////////////////////////////////////////////////////////////////// diff -Nru libthrust-1.15.0/internal/benchmark/CMakeLists.txt libthrust-1.16.0/internal/benchmark/CMakeLists.txt --- 
libthrust-1.15.0/internal/benchmark/CMakeLists.txt 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/internal/benchmark/CMakeLists.txt 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,29 @@ +if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + # MSVC builds fail at runtime. Benchmarks are linux-only for now. + message(STATUS "Thrust benchmarking is not available on MSVC.") + return() +endif() + +add_custom_target(thrust.all.bench) + +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_host ${thrust_target} HOST) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + + # Skip non cpp.cuda targets: + if (NOT config_host STREQUAL "CPP" OR + NOT config_device STREQUAL "CUDA") + continue() + endif() + + set(bench_target ${config_prefix}.bench) + + add_executable(${bench_target} bench.cu) + target_link_libraries(${bench_target} PRIVATE ${thrust_target}) + target_include_directories(${bench_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") + thrust_clone_target_properties(${bench_target} ${thrust_target}) + + add_dependencies(thrust.all.bench ${bench_target}) + add_dependencies(${config_prefix}.all ${bench_target}) +endforeach() diff -Nru libthrust-1.15.0/internal/build/common_build.mk libthrust-1.16.0/internal/build/common_build.mk --- libthrust-1.15.0/internal/build/common_build.mk 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/internal/build/common_build.mk 2022-02-09 20:04:17.000000000 +0000 @@ -6,10 +6,6 @@ LIBRARIES += m endif -# Disable our THRUST_DEPRECATED and CUB_DEPRECATED macros for internal -# builds, since we need to build and test our deprecated APIs with -Werror. -CUDACC_FLAGS += -DTHRUST_IGNORE_DEPRECATED_API - include $(ROOTDIR)/thrust/internal/build/common_compiler.mk # Add /bigobj to Windows build flag to workaround building Thrust with debug diff -Nru libthrust-1.15.0/README.md libthrust-1.16.0/README.md --- libthrust-1.15.0/README.md 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/README.md 2022-02-09 20:04:17.000000000 +0000 @@ -1,4 +1,4 @@ - + # Thrust: Code at the speed of light @@ -33,10 +33,10 @@ For CMake-based projects, we provide a CMake package for use with `find_package`. See the [CMake README](thrust/cmake/README.md) for more information. Thrust can also be added via `add_subdirectory` or tools like -the [CMake Package Manager](https://github.com/TheLartians/CPM.cmake). +the [CMake Package Manager](https://github.com/cpm-cmake/CPM.cmake). For non-CMake projects, compile with: -- The Thrust include path (`-I/thrust`) +- The Thrust include path (`-I`) - The CUB include path, if using the CUDA device system (`-I/dependencies/cub/`) - By default, the CPP host system and CUDA device system are used. These can be changed using compiler definitions: @@ -114,33 +114,37 @@ ## CI Status - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + ## Supported Compilers @@ -163,7 +167,8 @@ | Thrust Release | Included In | | ----------------- | --------------------------------------- | -| 1.15.0 | TBD | +| 1.16.0 | TBD | +| 1.15.0 | NVIDIA HPC SDK 22.1 & CUDA Toolkit 11.6 | | 1.14.0 | NVIDIA HPC SDK 21.9 | | 1.13.1 | CUDA Toolkit 11.5 | | 1.13.0 | NVIDIA HPC SDK 21.7 | @@ -234,6 +239,8 @@ ``` By default, a serial `CPP` host system, `CUDA` accelerated device system, and -C++14 standard are used. This can be changed in CMake. 
More information on -configuring your Thrust build and creating a pull request can be found in -[CONTRIBUTING.md](CONTRIBUTING.md). +C++14 standard are used. This can be changed during configuration -- see +[CMake Options](CONTRIBUTING.md#cmake-options) for details. + +More information on configuring your Thrust build and creating a pull request +can be found in [CONTRIBUTING.md](CONTRIBUTING.md). diff -Nru libthrust-1.15.0/testing/cmake/check_source_files.cmake libthrust-1.16.0/testing/cmake/check_source_files.cmake --- libthrust-1.15.0/testing/cmake/check_source_files.cmake 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cmake/check_source_files.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -63,10 +63,12 @@ # The wrappers are allowed to include the unwrapped headers thrust/detail/algorithm_wrapper.h thrust/detail/memory_wrapper.h + thrust/detail/numeric_wrapper.h ) set(algorithm_regex "#[ \t]*include[ \t]+") set(memory_regex "#[ \t]*include[ \t]+") +set(numeric_regex "#[ \t]*include[ \t]+") # Validation check for the above regex pattern: count_substrings([=[ @@ -126,6 +128,7 @@ if (NOT ${src} IN_LIST stdpar_header_exclusions) count_substrings("${src_contents}" "${algorithm_regex}" algorithm_count) count_substrings("${src_contents}" "${memory_regex}" memory_count) + count_substrings("${src_contents}" "${numeric_regex}" numeric_count) if (NOT algorithm_count EQUAL 0) message("'${src}' includes the header. Replace with .") @@ -136,6 +139,11 @@ message("'${src}' includes the header. Replace with .") set(found_errors 1) endif() + + if (NOT numeric_count EQUAL 0) + message("'${src}' includes the header. Replace with .") + set(found_errors 1) + endif() endif() endforeach() diff -Nru libthrust-1.15.0/testing/cuda/copy_if.cu libthrust-1.16.0/testing/cuda/copy_if.cu --- libthrust-1.15.0/testing/cuda/copy_if.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/copy_if.cu 2022-02-09 20:04:17.000000000 +0000 @@ -95,7 +95,14 @@ DECLARE_UNITTEST(TestCopyIfDeviceDevice); -void TestCopyIfCudaStreams() +void TestCopyIfDeviceNoSync() +{ + TestCopyIfDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestCopyIfDeviceNoSync); + +template +void TestCopyIfCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; @@ -111,7 +118,7 @@ cudaStream_t s; cudaStreamCreate(&s); - Vector::iterator end = thrust::copy_if(thrust::cuda::par.on(s), + Vector::iterator end = thrust::copy_if(policy.on(s), data.begin(), data.end(), result.begin(), @@ -124,7 +131,16 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestCopyIfCudaStreams); + +void TestCopyIfCudaStreamsSync(){ + TestCopyIfCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestCopyIfCudaStreamsSync); + +void TestCopyIfCudaStreamsNoSync(){ + TestCopyIfCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestCopyIfCudaStreamsNoSync); template @@ -205,7 +221,15 @@ DECLARE_UNITTEST(TestCopyIfStencilDeviceDevice); -void TestCopyIfStencilCudaStreams() +void TestCopyIfStencilDeviceNoSync() +{ + TestCopyIfStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestCopyIfStencilDeviceNoSync); + + +template +void TestCopyIfStencilCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -229,7 +253,7 @@ cudaStream_t s; cudaStreamCreate(&s); - Vector::iterator end = thrust::copy_if(thrust::cuda::par.on(s), + Vector::iterator end = thrust::copy_if(policy.on(s), data.begin(), data.end(), stencil.begin(), @@ -243,5 +267,17 @@ cudaStreamDestroy(s); } 
-DECLARE_UNITTEST(TestCopyIfStencilCudaStreams); + +void TestCopyIfStencilCudaStreamsSync() +{ + TestCopyIfStencilCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestCopyIfStencilCudaStreamsSync); + + +void TestCopyIfStencilCudaStreamsNoSync() +{ + TestCopyIfStencilCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestCopyIfStencilCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/max_element.cu libthrust-1.16.0/testing/cuda/max_element.cu --- libthrust-1.15.0/testing/cuda/max_element.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/max_element.cu 2022-02-09 20:04:17.000000000 +0000 @@ -67,7 +67,15 @@ DECLARE_UNITTEST(TestMaxElementDeviceDevice); -void TestMaxElementCudaStreams() +void TestMaxElementDeviceNoSync() +{ + TestMaxElementDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestMaxElementDeviceNoSync); + + +template +void TestMaxElementCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -83,15 +91,28 @@ cudaStream_t s; cudaStreamCreate(&s); - ASSERT_EQUAL( *thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end()), 5); - ASSERT_EQUAL( thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end()) - data.begin(), 1); + auto streampolicy = policy.on(s); + + ASSERT_EQUAL( *thrust::max_element(streampolicy, data.begin(), data.end()), 5); + ASSERT_EQUAL( thrust::max_element(streampolicy, data.begin(), data.end()) - data.begin(), 1); - ASSERT_EQUAL( *thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater()), 1); - ASSERT_EQUAL( thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater()) - data.begin(), 2); + ASSERT_EQUAL( *thrust::max_element(streampolicy, data.begin(), data.end(), thrust::greater()), 1); + ASSERT_EQUAL( thrust::max_element(streampolicy, data.begin(), data.end(), thrust::greater()) - data.begin(), 2); cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestMaxElementCudaStreams); + +void TestMaxElementCudaStreamsSync(){ + TestMaxElementCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestMaxElementCudaStreamsSync); + + +void TestMaxElementCudaStreamsNoSync(){ + TestMaxElementCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestMaxElementCudaStreamsNoSync); + void TestMaxElementDevicePointer() { diff -Nru libthrust-1.15.0/testing/cuda/partition.cu libthrust-1.16.0/testing/cuda/partition.cu --- libthrust-1.15.0/testing/cuda/partition.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/partition.cu 2022-02-09 20:04:17.000000000 +0000 @@ -65,6 +65,13 @@ DECLARE_UNITTEST(TestPartitionDeviceDevice); +void TestPartitionDeviceNoSync() +{ + TestPartitionDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestPartitionDeviceNoSync); + + template __global__ void partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result) @@ -125,6 +132,13 @@ DECLARE_UNITTEST(TestPartitionStencilDeviceDevice); +void TestPartitionStencilDeviceNoSync() +{ + TestPartitionStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestPartitionStencilDeviceNoSync); + + template __global__ void partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 true_result, Iterator3 false_result, Predicate pred, Iterator4 result) @@ -188,6 +202,13 @@ DECLARE_UNITTEST(TestPartitionCopyDeviceDevice); +void TestPartitionCopyDeviceNoSync() +{ + TestPartitionCopyDevice(thrust::cuda::par_nosync); +} 
+DECLARE_UNITTEST(TestPartitionCopyDeviceNoSync); + + template __global__ void partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 true_result, Iterator4 false_result, Predicate pred, Iterator5 result) @@ -258,6 +279,13 @@ DECLARE_UNITTEST(TestPartitionCopyStencilDeviceDevice); +void TestPartitionCopyStencilDeviceNoSync() +{ + TestPartitionCopyStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestPartitionCopyStencilDeviceNoSync); + + template __global__ void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result, Iterator3 is_supported) @@ -320,6 +348,13 @@ DECLARE_UNITTEST(TestStablePartitionDeviceDevice); +void TestStablePartitionDeviceNoSync() +{ + TestStablePartitionDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestStablePartitionDeviceNoSync); + + template __global__ void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result, Iterator4 is_supported) @@ -389,6 +424,13 @@ DECLARE_UNITTEST(TestStablePartitionStencilDeviceDevice); +void TestStablePartitionStencilDeviceNoSync() +{ + TestStablePartitionStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestStablePartitionStencilDeviceNoSync); + + template __global__ void stable_partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 true_result, Iterator3 false_result, Predicate pred, Iterator4 result) @@ -452,6 +494,13 @@ DECLARE_UNITTEST(TestStablePartitionCopyDeviceDevice); +void TestStablePartitionCopyDeviceNoSync() +{ + TestStablePartitionCopyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestStablePartitionCopyDeviceNoSync); + + template __global__ void stable_partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 true_result, Iterator4 false_result, Predicate pred, Iterator5 result) @@ -522,7 +571,15 @@ DECLARE_UNITTEST(TestStablePartitionCopyStencilDeviceDevice); -void TestPartitionCudaStreams() +void TestStablePartitionCopyStencilDeviceNoSync() +{ + TestStablePartitionCopyStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestStablePartitionCopyStencilDeviceNoSync); + + +template +void TestPartitionCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -537,8 +594,10 @@ cudaStream_t s; cudaStreamCreate(&s); + + auto streampolicy = policy.on(s); - Iterator iter = thrust::partition(thrust::cuda::par.on(s), data.begin(), data.end(), is_even()); + Iterator iter = thrust::partition(streampolicy, data.begin(), data.end(), is_even()); Vector ref(5); ref[0] = 2; @@ -552,5 +611,17 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestPartitionCudaStreams); + +void TestPartitionCudaStreamsSync() +{ + TestPartitionCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestPartitionCudaStreamsSync); + + +void TestPartitionCudaStreamsNoSync() +{ + TestPartitionCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestPartitionCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/reduce_by_key.cu libthrust-1.16.0/testing/cuda/reduce_by_key.cu --- libthrust-1.15.0/testing/cuda/reduce_by_key.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/reduce_by_key.cu 2022-02-09 20:04:17.000000000 +0000 @@ -191,7 +191,15 @@ DECLARE_UNITTEST(TestReduceByKeyDeviceDevice); -void TestReduceByKeyCudaStreams() +void TestReduceByKeyDeviceNoSync() +{ + 
TestReduceByKeyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestReduceByKeyDeviceNoSync); + + +template +void TestReduceByKeyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -210,7 +218,9 @@ cudaStream_t s; cudaStreamCreate(&s); - new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin()); + auto streampolicy = policy.on(s); + + new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin()); ASSERT_EQUAL(new_last.first - output_keys.begin(), 5); ASSERT_EQUAL(new_last.second - output_values.begin(), 5); @@ -229,7 +239,7 @@ // test BinaryPredicate initialize_keys(keys); initialize_values(values); - new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce()); + new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce()); ASSERT_EQUAL(new_last.first - output_keys.begin(), 3); ASSERT_EQUAL(new_last.second - output_values.begin(), 3); @@ -244,7 +254,7 @@ // test BinaryFunction initialize_keys(keys); initialize_values(values); - new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to(), thrust::plus()); + new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to(), thrust::plus()); ASSERT_EQUAL(new_last.first - output_keys.begin(), 5); ASSERT_EQUAL(new_last.second - output_values.begin(), 5); @@ -262,5 +272,17 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestReduceByKeyCudaStreams); + +void TestReduceByKeyCudaStreamsSync() +{ + TestReduceByKeyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestReduceByKeyCudaStreamsSync); + + +void TestReduceByKeyCudaStreamsNoSync() +{ + TestReduceByKeyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestReduceByKeyCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/reduce.cu libthrust-1.16.0/testing/cuda/reduce.cu --- libthrust-1.15.0/testing/cuda/reduce.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/reduce.cu 2022-02-09 20:04:17.000000000 +0000 @@ -53,7 +53,19 @@ VariableUnitTest TestReduceDeviceDeviceInstance; -void TestReduceCudaStreams() +template +struct TestReduceDeviceNoSync +{ + void operator()(const size_t n) + { + TestReduceDevice(thrust::cuda::par_nosync, n); + } +}; +VariableUnitTest TestReduceDeviceNoSyncInstance; + + +template +void TestReduceCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; @@ -63,13 +75,27 @@ cudaStream_t s; cudaStreamCreate(&s); + auto streampolicy = policy.on(s); + // no initializer - ASSERT_EQUAL(thrust::reduce(thrust::cuda::par.on(s), v.begin(), v.end()), 2); + ASSERT_EQUAL(thrust::reduce(streampolicy, v.begin(), v.end()), 2); // with initializer - ASSERT_EQUAL(thrust::reduce(thrust::cuda::par.on(s), v.begin(), v.end(), 10), 12); + ASSERT_EQUAL(thrust::reduce(streampolicy, v.begin(), v.end(), 10), 12); cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestReduceCudaStreams); + +void TestReduceCudaStreamsSync() +{ + TestReduceCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestReduceCudaStreamsSync); + + +void 
TestReduceCudaStreamsNoSync() +{ + TestReduceCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestReduceCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/set_intersection_by_key.cu libthrust-1.16.0/testing/cuda/set_intersection_by_key.cu --- libthrust-1.15.0/testing/cuda/set_intersection_by_key.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/set_intersection_by_key.cu 2022-02-09 20:04:17.000000000 +0000 @@ -73,7 +73,15 @@ DECLARE_UNITTEST(TestSetIntersectionByKeyDeviceDevice); -void TestSetIntersectionByKeyCudaStreams() +void TestSetIntersectionByKeyDeviceNoSync() +{ + TestSetIntersectionByKeyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestSetIntersectionByKeyDeviceNoSync); + + +template +void TestSetIntersectionByKeyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::iterator Iterator; @@ -95,8 +103,10 @@ cudaStream_t s; cudaStreamCreate(&s); + auto streampolicy = policy.on(s); + thrust::pair end = - thrust::set_intersection_by_key(thrust::cuda::par.on(s), + thrust::set_intersection_by_key(streampolicy, a_key.begin(), a_key.end(), b_key.begin(), b_key.end(), a_val.begin(), @@ -111,5 +121,17 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreams); + +void TestSetIntersectionByKeyCudaStreamsSync() +{ + TestSetIntersectionByKeyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreamsSync); + + +void TestSetIntersectionByKeyCudaStreamsNoSync() +{ + TestSetIntersectionByKeyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/set_intersection.cu libthrust-1.16.0/testing/cuda/set_intersection.cu --- libthrust-1.15.0/testing/cuda/set_intersection.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/set_intersection.cu 2022-02-09 20:04:17.000000000 +0000 @@ -59,7 +59,15 @@ DECLARE_UNITTEST(TestSetIntersectionDeviceDevice); -void TestSetIntersectionCudaStreams() +void TestSetIntersectionDeviceNoSync() +{ + TestSetIntersectionDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestSetIntersectionDeviceNoSync); + + +template +void TestSetIntersectionCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::iterator Iterator; @@ -77,7 +85,9 @@ cudaStream_t s; cudaStreamCreate(&s); - Iterator end = thrust::set_intersection(thrust::cuda::par.on(s), + auto streampolicy = policy.on(s); + + Iterator end = thrust::set_intersection(streampolicy, a.begin(), a.end(), b.begin(), b.end(), result.begin()); @@ -88,5 +98,17 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestSetIntersectionCudaStreams); + +void TestSetIntersectionCudaStreamsSync() +{ + TestSetIntersectionCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestSetIntersectionCudaStreamsSync); + + +void TestSetIntersectionCudaStreamsNoSync() +{ + TestSetIntersectionCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestSetIntersectionCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/unique_by_key.cu libthrust-1.16.0/testing/cuda/unique_by_key.cu --- libthrust-1.15.0/testing/cuda/unique_by_key.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/unique_by_key.cu 2022-02-09 20:04:17.000000000 +0000 @@ -134,7 +134,15 @@ DECLARE_UNITTEST(TestUniqueByKeyDeviceDevice); -void TestUniqueByKeyCudaStreams() +void TestUniqueByKeyDeviceNoSync() +{ + TestUniqueByKeyDevice(thrust::cuda::par_nosync); +} 
+DECLARE_UNITTEST(TestUniqueByKeyDeviceNoSync); + + +template +void TestUniqueByKeyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -150,8 +158,10 @@ cudaStream_t s; cudaStreamCreate(&s); + + auto streampolicy = policy.on(s); - new_last = thrust::unique_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin()); + new_last = thrust::unique_by_key(streampolicy, keys.begin(), keys.end(), values.begin()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last.first - keys.begin(), 5); @@ -171,7 +181,7 @@ // test BinaryPredicate initialize_keys(keys); initialize_values(values); - new_last = thrust::unique_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique()); + new_last = thrust::unique_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique()); ASSERT_EQUAL(new_last.first - keys.begin(), 3); ASSERT_EQUAL(new_last.second - values.begin(), 3); @@ -185,7 +195,19 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestUniqueByKeyCudaStreams); + +void TestUniqueByKeyCudaStreamsSync() +{ + TestUniqueByKeyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestUniqueByKeyCudaStreamsSync); + + +void TestUniqueByKeyCudaStreamsNoSync() +{ + TestUniqueByKeyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueByKeyCudaStreamsNoSync); template @@ -282,7 +304,15 @@ DECLARE_UNITTEST(TestUniqueCopyByKeyDeviceDevice); -void TestUniqueCopyByKeyCudaStreams() +void TestUniqueCopyByKeyDeviceNoSync() +{ + TestUniqueCopyByKeyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCopyByKeyDeviceNoSync); + + +template +void TestUniqueCopyByKeyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -302,7 +332,9 @@ cudaStream_t s; cudaStreamCreate(&s); - new_last = thrust::unique_by_key_copy(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin()); + auto streampolicy = policy.on(s); + + new_last = thrust::unique_by_key_copy(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last.first - output_keys.begin(), 5); @@ -322,7 +354,7 @@ // test BinaryPredicate initialize_keys(keys); initialize_values(values); - new_last = thrust::unique_by_key_copy(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique()); + new_last = thrust::unique_by_key_copy(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last.first - output_keys.begin(), 3); @@ -337,5 +369,17 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreams); + +void TestUniqueCopyByKeyCudaStreamsSync() +{ + TestUniqueCopyByKeyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreamsSync); + + +void TestUniqueCopyByKeyCudaStreamsNoSync() +{ + TestUniqueCopyByKeyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/unique.cu libthrust-1.16.0/testing/cuda/unique.cu --- libthrust-1.15.0/testing/cuda/unique.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/unique.cu 2022-02-09 20:04:17.000000000 +0000 @@ -94,7 +94,15 @@ DECLARE_UNITTEST(TestUniqueDeviceDevice); -void 
TestUniqueCudaStreams() +void TestUniqueDeviceNoSync() +{ + TestUniqueDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueDeviceNoSync); + + +template +void TestUniqueCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -116,8 +124,10 @@ cudaStream_t s; cudaStreamCreate(&s); + + auto streampolicy = policy.on(s); - new_last = thrust::unique(thrust::cuda::par.on(s), data.begin(), data.end()); + new_last = thrust::unique(streampolicy, data.begin(), data.end()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last - data.begin(), 7); @@ -129,7 +139,7 @@ ASSERT_EQUAL(data[5], 31); ASSERT_EQUAL(data[6], 37); - new_last = thrust::unique(thrust::cuda::par.on(s), data.begin(), new_last, is_equal_div_10_unique()); + new_last = thrust::unique(streampolicy, data.begin(), new_last, is_equal_div_10_unique()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last - data.begin(), 3); @@ -139,7 +149,19 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestUniqueCudaStreams); + +void TestUniqueCudaStreamsSync() +{ + TestUniqueCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestUniqueCudaStreamsSync); + + +void TestUniqueCudaStreamsNoSync() +{ + TestUniqueCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCudaStreamsNoSync); template @@ -227,7 +249,15 @@ DECLARE_UNITTEST(TestUniqueCopyDeviceDevice); -void TestUniqueCopyCudaStreams() +void TestUniqueCopyDeviceNoSync() +{ + TestUniqueCopyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCopyDeviceNoSync); + + +template +void TestUniqueCopyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -251,8 +281,10 @@ cudaStream_t s; cudaStreamCreate(&s); + + auto streampolicy = policy.on(s); - new_last = thrust::unique_copy(thrust::cuda::par.on(s), data.begin(), data.end(), output.begin()); + new_last = thrust::unique_copy(streampolicy, data.begin(), data.end(), output.begin()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last - output.begin(), 7); @@ -264,7 +296,7 @@ ASSERT_EQUAL(output[5], 31); ASSERT_EQUAL(output[6], 37); - new_last = thrust::unique_copy(thrust::cuda::par.on(s), output.begin(), new_last, data.begin(), is_equal_div_10_unique()); + new_last = thrust::unique_copy(streampolicy, output.begin(), new_last, data.begin(), is_equal_div_10_unique()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last - data.begin(), 3); @@ -274,5 +306,17 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestUniqueCopyCudaStreams); + +void TestUniqueCopyCudaStreamsSync() +{ + TestUniqueCopyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestUniqueCopyCudaStreamsSync); + + +void TestUniqueCopyCudaStreamsNoSync() +{ + TestUniqueCopyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCopyCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/partition.cu libthrust-1.16.0/testing/partition.cu --- libthrust-1.15.0/testing/partition.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/partition.cu 2022-02-09 20:04:17.000000000 +0000 @@ -6,6 +6,12 @@ #include #include +#if defined(THRUST_GCC_VERSION) && \ + THRUST_GCC_VERSION >= 110000 && \ + THRUST_GCC_VERSION < 120000 +#define WAIVE_GCC11_FAILURES +#endif + template struct is_even { @@ -21,6 +27,17 @@ typedef typename Vector::value_type T; typedef typename Vector::iterator Iterator; + // GCC 11 miscompiles and segfaults for certain versions of this test. + // It's not reproducible on other compilers, and the test passes when + // optimizations are disabled. 
It only affects 32-bit value types, and + // impacts all CPU host/device combinations tested. +#ifdef WAIVE_GCC11_FAILURES + if (sizeof(T) == 4) + { + return; + } +#endif + Vector data(5); data[0] = 1; data[1] = 2; @@ -321,6 +338,17 @@ { void operator()(const size_t n) { + // GCC 11 miscompiles and segfaults for certain versions of this test. + // It's not reproducible on other compilers, and the test passes when + // optimizations are disabled. It only affects 32-bit value types, and + // impacts all CPU host/device combinations tested. +#ifdef WAIVE_GCC11_FAILURES + if (n == 0 && sizeof(T) == 4) + { + return; + } +#endif + // setup ranges thrust::host_vector h_data = unittest::random_integers(n); thrust::host_vector h_stencil = unittest::random_integers(n); @@ -689,6 +717,17 @@ { void operator()(const size_t n) { + // GCC 11 miscompiles and segfaults for certain versions of this test. + // It's not reproducible on other compilers, and the test passes when + // optimizations are disabled. It only affects 32-bit value types, and + // impacts all CPU host/device combinations tested. +#ifdef WAIVE_GCC11_FAILURES + if (n == 0 && sizeof(T) == 4) + { + return; + } +#endif + // setup ranges thrust::host_vector h_data = unittest::random_integers(n); thrust::device_vector d_data = h_data; @@ -708,6 +747,17 @@ { void operator()(const size_t n) { + // GCC 11 miscompiles and segfaults for certain versions of this test. + // It's not reproducible on other compilers, and the test passes when + // optimizations are disabled. It only affects 32-bit value types, and + // impacts all CPU host/device combinations tested. +#ifdef WAIVE_GCC11_FAILURES + if (n == 0 && sizeof(T) == 4) + { + return; + } +#endif + // setup ranges thrust::host_vector h_data = unittest::random_integers(n); thrust::host_vector h_stencil = unittest::random_integers(n); diff -Nru libthrust-1.15.0/testing/scan.cu libthrust-1.16.0/testing/scan.cu --- libthrust-1.15.0/testing/scan.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/scan.cu 2022-02-09 20:04:17.000000000 +0000 @@ -26,6 +26,17 @@ void TestScanSimple(void) { typedef typename Vector::value_type T; + + // icc miscompiles the intermediate sum updates for custom_numeric. + // The issue doesn't happen with opts disabled, or on other compilers. + // Printing the intermediate sum each iteration "fixes" the issue, + // so likely a bad optimization. +#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_INTEL + if (std::is_same::value) + { + return; + } +#endif typename Vector::iterator iter; diff -Nru libthrust-1.15.0/testing/sequence.cu libthrust-1.16.0/testing/sequence.cu --- libthrust-1.15.0/testing/sequence.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/sequence.cu 2022-02-09 20:04:17.000000000 +0000 @@ -139,10 +139,21 @@ }; // Vector-Vector addition -__host__ __device__ Vector operator+(const Vector a, const Vector b) { return Vector{a.x + b.x, a.y + b.y}; } +__host__ __device__ Vector operator+(const Vector a, const Vector b) +{ + return Vector{a.x + b.x, a.y + b.y}; +} + // Vector-Scalar Multiplication -__host__ __device__ Vector operator*(const int a, const Vector b) { return Vector{a * b.x, a * b.y}; } -__host__ __device__ Vector operator*(const Vector b, const int a) { return Vector{a * b.x, a * b.y}; } +// Multiplication by std::size_t is required by thrust::sequence. 
+__host__ __device__ Vector operator*(const std::size_t a, const Vector b) +{ + return Vector{static_cast(a) * b.x, static_cast(a) * b.y}; +} +__host__ __device__ Vector operator*(const Vector b, const std::size_t a) +{ + return Vector{static_cast(a) * b.x, static_cast(a) * b.y}; +} void TestSequenceNoSizeTConversion() { diff -Nru libthrust-1.15.0/testing/shuffle.cu libthrust-1.16.0/testing/shuffle.cu --- libthrust-1.15.0/testing/shuffle.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/shuffle.cu 2022-02-09 20:04:17.000000000 +0000 @@ -1,6 +1,5 @@ #include -#if THRUST_CPP_DIALECT >= 2011 #include #include #include @@ -383,7 +382,7 @@ thrust::system::detail::generic::feistel_bijection host_f(m, host_g); thrust::system::detail::generic::feistel_bijection device_f(m, device_g); - if (host_f.nearest_power_of_two() >= std::numeric_limits::max() || m == 0) { + if (static_cast(host_f.nearest_power_of_two()) >= static_cast(std::numeric_limits::max()) || m == 0) { return; } @@ -410,17 +409,17 @@ void TestBijectionLength() { thrust::default_random_engine g(0xD5); - uint64_t m = 3; + uint64_t m = 31; thrust::system::detail::generic::feistel_bijection f(m, g); - ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(4)); + ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(32)); - m = 2; + m = 32; f = thrust::system::detail::generic::feistel_bijection(m, g); - ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(2)); + ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(32)); - m = 0; + m = 1; f = thrust::system::detail::generic::feistel_bijection(m, g); - ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(1)); + ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(16)); } DECLARE_UNITTEST(TestBijectionLength); @@ -515,7 +514,7 @@ thrust::host_vector h_results; Vector sequence(shuffle_size); thrust::sequence(sequence.begin(), sequence.end(), 0); - thrust::default_random_engine g(0xD5); + thrust::default_random_engine g(0xD6); for (auto i = 0ull; i < num_samples; i++) { thrust::shuffle(sequence.begin(), sequence.end(), g); thrust::host_vector tmp(sequence.begin(), sequence.end()); @@ -561,7 +560,7 @@ const uint64_t shuffle_sizes[] = {10, 100, 500}; thrust::default_random_engine g(0xD5); for (auto shuffle_size : shuffle_sizes) { - if(shuffle_size > std::numeric_limits::max()) + if(shuffle_size > (uint64_t)std::numeric_limits::max()) continue; const uint64_t num_samples = shuffle_size == 500 ? 
1000 : 200; @@ -601,4 +600,3 @@ } } DECLARE_INTEGRAL_VECTOR_UNITTEST(TestShuffleEvenDistribution); -#endif diff -Nru libthrust-1.15.0/testing/unittest/system.h libthrust-1.16.0/testing/unittest/system.h --- libthrust-1.15.0/testing/unittest/system.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/unittest/system.h 2022-02-09 20:04:17.000000000 +0000 @@ -12,7 +12,7 @@ namespace unittest { -#if __GNUC__ && !__NVCOMPILER_CUDA__ +#if __GNUC__ && !_NVHPC_CUDA inline std::string demangle(const char* name) { int status = 0; diff -Nru libthrust-1.15.0/thrust/cmake/thrust-config.cmake libthrust-1.16.0/thrust/cmake/thrust-config.cmake --- libthrust-1.15.0/thrust/cmake/thrust-config.cmake 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/cmake/thrust-config.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -497,7 +497,7 @@ macro(_thrust_find_CUDA required) if (NOT TARGET Thrust::CUDA) thrust_debug("Searching for CUB ${required}" internal) - find_package(CUB CONFIG + find_package(CUB ${THRUST_VERSION} CONFIG ${_THRUST_QUIET_FLAG} ${required} NO_DEFAULT_PATH # Only check the explicit HINTS below: diff -Nru libthrust-1.15.0/thrust/cmake/thrust-config-version.cmake libthrust-1.16.0/thrust/cmake/thrust-config-version.cmake --- libthrust-1.15.0/thrust/cmake/thrust-config-version.cmake 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/cmake/thrust-config-version.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -1,12 +1,6 @@ # Parse version information from version.h: -unset(_THRUST_VERSION_INCLUDE_DIR CACHE) # Clear old result to force search -find_path(_THRUST_VERSION_INCLUDE_DIR thrust/version.h - NO_DEFAULT_PATH # Only search explicit paths below: - PATHS - ${CMAKE_CURRENT_LIST_DIR}/../.. # Source tree - ${CMAKE_CURRENT_LIST_DIR}/../../../include # Install tree -) -set_property(CACHE _THRUST_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL) +include("${CMAKE_CURRENT_LIST_DIR}/thrust-header-search.cmake") + file(READ "${_THRUST_VERSION_INCLUDE_DIR}/thrust/version.h" THRUST_VERSION_HEADER) string(REGEX MATCH "#define[ \t]+THRUST_VERSION[ \t]+([0-9]+)" DUMMY "${THRUST_VERSION_HEADER}") set(THRUST_VERSION_FLAT ${CMAKE_MATCH_1}) @@ -26,11 +20,12 @@ set(PACKAGE_VERSION_UNSUITABLE FALSE) if(PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION) - if(PACKAGE_FIND_VERSION_MAJOR STREQUAL THRUST_VERSION_MAJOR) + if(THRUST_VERSION_MAJOR VERSION_EQUAL PACKAGE_FIND_VERSION_MAJOR AND + THRUST_VERSION_MINOR VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MINOR) set(PACKAGE_VERSION_COMPATIBLE TRUE) endif() - if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION) + if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION) set(PACKAGE_VERSION_EXACT TRUE) endif() endif() diff -Nru libthrust-1.15.0/thrust/cmake/thrust-header-search.cmake libthrust-1.16.0/thrust/cmake/thrust-header-search.cmake --- libthrust-1.15.0/thrust/cmake/thrust-header-search.cmake 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/thrust/cmake/thrust-header-search.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,8 @@ +# Parse version information from version.h: +unset(_THRUST_VERSION_INCLUDE_DIR CACHE) # Clear old result to force search +find_path(_THRUST_VERSION_INCLUDE_DIR thrust/version.h + NO_DEFAULT_PATH # Only search explicit paths below: + PATHS + "${CMAKE_CURRENT_LIST_DIR}/../.." 
# Source tree +) +set_property(CACHE _THRUST_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL) diff -Nru libthrust-1.15.0/thrust/cmake/thrust-header-search.cmake.in libthrust-1.16.0/thrust/cmake/thrust-header-search.cmake.in --- libthrust-1.15.0/thrust/cmake/thrust-header-search.cmake.in 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/thrust/cmake/thrust-header-search.cmake.in 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,8 @@ +# Parse version information from version.h: +unset(_THRUST_VERSION_INCLUDE_DIR CACHE) # Clear old result to force search +find_path(_THRUST_VERSION_INCLUDE_DIR thrust/version.h + NO_DEFAULT_PATH # Only search explicit paths below: + PATHS + "${CMAKE_CURRENT_LIST_DIR}/../../../@CMAKE_INSTALL_INCLUDEDIR@" +) +set_property(CACHE _THRUST_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL) diff -Nru libthrust-1.15.0/thrust/detail/allocator/temporary_allocator.inl libthrust-1.16.0/thrust/detail/allocator/temporary_allocator.inl --- libthrust-1.15.0/thrust/detail/allocator/temporary_allocator.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/allocator/temporary_allocator.inl 2022-02-09 20:04:17.000000000 +0000 @@ -20,7 +20,7 @@ #include #include -#if (defined(__NVCOMPILER_CUDA__) || defined(__CUDA_ARCH__)) && \ +#if (defined(_NVHPC_CUDA) || defined(__CUDA_ARCH__)) && \ THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA #include #endif diff -Nru libthrust-1.15.0/thrust/detail/complex/c99math.h libthrust-1.16.0/thrust/detail/complex/c99math.h --- libthrust-1.15.0/thrust/detail/complex/c99math.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/complex/c99math.h 2022-02-09 20:04:17.000000000 +0000 @@ -102,7 +102,7 @@ #else -# if defined(__CUDACC__) && !(defined(__CUDA__) && defined(__clang__)) && !defined(__NVCOMPILER_CUDA__) +# if defined(__CUDACC__) && !(defined(__CUDA__) && defined(__clang__)) && !defined(_NVHPC_CUDA) // NVCC implements at least some signature of these as functions not macros. using ::isinf; using ::isnan; @@ -141,7 +141,7 @@ -#if !defined(__CUDACC__) && !defined(__NVCOMPILER_CUDA__) +#if !defined(__CUDACC__) && !defined(_NVHPC_CUDA) // Simple approximation to log1p as Visual Studio is lacking one inline double log1p(double x){ diff -Nru libthrust-1.15.0/thrust/detail/config/compiler.h libthrust-1.16.0/thrust/detail/config/compiler.h --- libthrust-1.15.0/thrust/detail/config/compiler.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/config/compiler.h 2022-02-09 20:04:17.000000000 +0000 @@ -58,7 +58,7 @@ #endif // THRUST_HOST_COMPILER // figure out which device compiler we're using -#if defined(__CUDACC__) || defined(__NVCOMPILER_CUDA__) +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) #define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_NVCC #elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC #define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_MSVC diff -Nru libthrust-1.15.0/thrust/detail/config/cpp_compatibility.h libthrust-1.16.0/thrust/detail/config/cpp_compatibility.h --- libthrust-1.15.0/thrust/detail/config/cpp_compatibility.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/config/cpp_compatibility.h 2022-02-09 20:04:17.000000000 +0000 @@ -39,7 +39,7 @@ // FIXME: Combine THRUST_INLINE_CONSTANT and // THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT into one macro when NVCC properly // supports `constexpr` globals in host and device code. 
-#if defined(__CUDA_ARCH__) || defined(__NVCOMPILER_CUDA__) +#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) // FIXME: Add this when NVCC supports inline variables. //# if THRUST_CPP_DIALECT >= 2017 //# define THRUST_INLINE_CONSTANT inline constexpr @@ -65,7 +65,7 @@ # endif #endif -#if defined(__NVCOMPILER_CUDA__) +#if defined(_NVHPC_CUDA) # define THRUST_IS_DEVICE_CODE __builtin_is_device_code() # define THRUST_IS_HOST_CODE (!__builtin_is_device_code()) # define THRUST_INCLUDE_DEVICE_CODE 1 diff -Nru libthrust-1.15.0/thrust/detail/config/exec_check_disable.h libthrust-1.16.0/thrust/detail/config/exec_check_disable.h --- libthrust-1.15.0/thrust/detail/config/exec_check_disable.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/config/exec_check_disable.h 2022-02-09 20:04:17.000000000 +0000 @@ -25,7 +25,7 @@ // #pragma nv_exec_check_disable is only recognized by NVCC. Having a macro // expand to a #pragma (rather than _Pragma) only works with NVCC's compilation // model, not with other compilers. -#if defined(__CUDACC__) && !defined(__NVCOMPILER_CUDA__) && \ +#if defined(__CUDACC__) && !defined(_NVHPC_CUDA) && \ !(defined(__CUDA__) && defined(__clang__)) #if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC diff -Nru libthrust-1.15.0/thrust/detail/config/forceinline.h libthrust-1.16.0/thrust/detail/config/forceinline.h --- libthrust-1.15.0/thrust/detail/config/forceinline.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/config/forceinline.h 2022-02-09 20:04:17.000000000 +0000 @@ -22,7 +22,7 @@ #include -#if defined(__CUDACC__) || defined(__NVCOMPILER_CUDA__) +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) #define __thrust_forceinline__ __forceinline__ diff -Nru libthrust-1.15.0/thrust/detail/cstdint.h libthrust-1.16.0/thrust/detail/cstdint.h --- libthrust-1.15.0/thrust/detail/cstdint.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/cstdint.h 2022-02-09 20:04:17.000000000 +0000 @@ -18,7 +18,9 @@ #include -#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) || (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG) +#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) || \ + (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG) || \ + (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_INTEL) #include #endif diff -Nru libthrust-1.15.0/thrust/detail/numeric_wrapper.h libthrust-1.16.0/thrust/detail/numeric_wrapper.h --- libthrust-1.15.0/thrust/detail/numeric_wrapper.h 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/numeric_wrapper.h 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,27 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// When a compiler uses Thrust as part of its implementation of Standard C++ +// algorithms, a cycle of included files may result when Thrust code tries to +// use a standard algorithm. 
Having a macro that is defined only when Thrust +// is including an algorithms-related header gives the compiler a chance to +// detect and break the cycle of includes. + +#define THRUST_INCLUDING_ALGORITHMS_HEADER +#include +#undef THRUST_INCLUDING_ALGORITHMS_HEADER diff -Nru libthrust-1.15.0/thrust/device_allocator.h libthrust-1.16.0/thrust/device_allocator.h --- libthrust-1.15.0/thrust/device_allocator.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/device_allocator.h 2022-02-09 20:04:17.000000000 +0000 @@ -119,7 +119,7 @@ device_allocator() {} /*! Copy constructor has no effect. */ - __host__ + __host__ __device__ device_allocator(const device_allocator& other) : base(other) {} /*! Constructor from other \p device_allocator has no effect. */ diff -Nru libthrust-1.15.0/thrust/random/detail/normal_distribution_base.h libthrust-1.16.0/thrust/random/detail/normal_distribution_base.h --- libthrust-1.15.0/thrust/random/detail/normal_distribution_base.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/random/detail/normal_distribution_base.h 2022-02-09 20:04:17.000000000 +0000 @@ -135,7 +135,7 @@ template struct normal_distribution_base { -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC && !defined(__NVCOMPILER_CUDA__) +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC && !defined(_NVHPC_CUDA) typedef normal_distribution_nvcc type; #else typedef normal_distribution_portable type; diff -Nru libthrust-1.15.0/thrust/scan.h libthrust-1.16.0/thrust/scan.h --- libthrust-1.15.0/thrust/scan.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/scan.h 2022-02-09 20:04:17.000000000 +0000 @@ -44,12 +44,16 @@ * This version of \p inclusive_scan assumes plus as the associative operator. * When the input and output sequences are the same, the scan is performed * in-place. - + * * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary * difference between the two functions is that \c std::partial_sum guarantees * a serial summation order, while \p inclusive_scan requires associativity of * the binary operation to parallelize the prefix sum. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -106,12 +110,16 @@ * This version of \p inclusive_scan assumes plus as the associative operator. * When the input and output sequences are the same, the scan is performed * in-place. - + * * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary * difference between the two functions is that \c std::partial_sum guarantees * a serial summation order, while \p inclusive_scan requires associativity of * the binary operation to parallelize the prefix sum. - * + * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first The beginning of the input sequence. * \param last The end of the input sequence. * \param result The beginning of the output sequence. @@ -154,12 +162,16 @@ * term 'inclusive' means that each result includes the corresponding * input operand in the partial sum. When the input and output sequences * are the same, the scan is performed in-place. 
- * + * * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary * difference between the two functions is that \c std::partial_sum guarantees * a serial summation order, while \p inclusive_scan requires associativity of * the binary operation to parallelize the prefix sum. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -220,6 +232,10 @@ * a serial summation order, while \p inclusive_scan requires associativity of * the binary operation to parallelize the prefix sum. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first The beginning of the input sequence. * \param last The end of the input sequence. * \param result The beginning of the output sequence. @@ -271,6 +287,10 @@ * associative operator and \c 0 as the initial value. When the input and * output sequences are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -326,7 +346,11 @@ * and so on. This version of \p exclusive_scan assumes plus as the * associative operator and \c 0 as the initial value. When the input and * output sequences are the same, the scan is performed in-place. - * + * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first The beginning of the input sequence. * \param last The end of the input sequence. * \param result The beginning of the output sequence. @@ -373,6 +397,10 @@ * operator but requires an initial value \p init. When the input and * output sequences are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -430,6 +458,10 @@ * operator but requires an initial value \p init. When the input and * output sequences are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first The beginning of the input sequence. * \param last The end of the input sequence. * \param result The beginning of the output sequence. @@ -478,6 +510,10 @@ * operator and an initial value \p init. When the input and output * sequences are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. 
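The "not deterministic for pseudo-associative operators" note added to each scan overload above comes down to floating-point rounding: regrouping the same operands can change the result, so a parallel prefix sum that regroups partial sums may vary from run to run. A self-contained illustration (an editor's example, not from the patch):

```cpp
// Why the scan documentation calls floating-point addition "pseudo-associative":
// regrouping identical operands changes the rounded result.
#include <cstdio>

int main()
{
  const float a = 1.0e8f, b = -1.0e8f, c = 1.0f;

  const float grouped_left  = (a + b) + c; // (0) + 1            -> 1
  const float grouped_right = a + (b + c); // b + c rounds to -1.0e8f -> 0

  std::printf("(a+b)+c = %g, a+(b+c) = %g\n", grouped_left, grouped_right);
  return 0;
}
```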
* * \param exec The execution policy to use for parallelization. @@ -545,7 +581,11 @@ * and so on. This version of the function requires both an associative * operator and an initial value \p init. When the input and output * sequences are the same, the scan is performed in-place. - * + * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first The beginning of the input sequence. * \param last The end of the input sequence. * \param result The beginning of the output sequence. @@ -618,6 +658,10 @@ * operator used to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -687,6 +731,10 @@ * operator used to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first1 The beginning of the key sequence. * \param last1 The end of the key sequence. * \param first2 The beginning of the input value sequence. @@ -746,6 +794,10 @@ * operator used to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -822,6 +874,10 @@ * operator used to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first1 The beginning of the key sequence. * \param last1 The end of the key sequence. * \param first2 The beginning of the input value sequence. @@ -888,6 +944,10 @@ * \c binary_op to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -967,6 +1027,10 @@ * belong to the same segment if binary_pred(*i, *(i+1)) is true, and belong to * different segments otherwise. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * This version of \p inclusive_scan_by_key uses the associative operator * \c binary_op to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. 
@@ -1042,6 +1106,10 @@ * belong to the same segment if *i == *(i+1), and belong to * different segments otherwise. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * Refer to the most general form of \p exclusive_scan_by_key for additional details. * * The algorithm's execution is parallelized as determined by \p exec. @@ -1101,6 +1169,10 @@ * belong to the same segment if *i == *(i+1), and belong to * different segments otherwise. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * Refer to the most general form of \p exclusive_scan_by_key for additional details. * * \param first1 The beginning of the key sequence. @@ -1146,6 +1218,10 @@ * This version of \p exclusive_scan_by_key uses the value \c init to * initialize the exclusive scan operation. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -1206,6 +1282,10 @@ * This version of \p exclusive_scan_by_key uses the value \c init to * initialize the exclusive scan operation. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first1 The beginning of the key sequence. * \param last1 The end of the key sequence. * \param first2 The beginning of the input value sequence. @@ -1262,6 +1342,10 @@ * i+1 in the range [first1, last1) belong to the same segment if * binary_pred(*i, *(i+1)) is true, and belong to different segments otherwise. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -1332,6 +1416,10 @@ * i+1 in the range [first1, last1) belong to the same segment if * binary_pred(*i, *(i+1)) is true, and belong to different segments otherwise. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first1 The beginning of the key sequence. * \param last1 The end of the key sequence. * \param first2 The beginning of the input value sequence. @@ -1397,6 +1485,10 @@ * \c binary_op to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -1487,6 +1579,10 @@ * \c binary_op to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. 
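As a compact illustration of the segmented (by-key) scan semantics described in the documentation above, the following sketch uses only the public `thrust::*_scan_by_key` API on host data (an editor's example, not taken from the patch; expected outputs are noted in comments):

```cpp
// Segmented scan semantics: consecutive equal keys form a segment, and each
// segment is scanned independently.
#include <thrust/scan.h>
#include <cstdio>

int main()
{
  const int keys[6] = {1, 1, 2, 2, 2, 3};
  const int vals[6] = {1, 2, 3, 4, 5, 6};
  int inc[6];
  int exc[6];

  thrust::inclusive_scan_by_key(keys, keys + 6, vals, inc); // 1 3 3 7 12 6
  thrust::exclusive_scan_by_key(keys, keys + 6, vals, exc); // 0 1 0 3  7 0

  for (int i = 0; i < 6; ++i)
    std::printf("%d %d\n", inc[i], exc[i]);
  return 0;
}
```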
* + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first1 The beginning of the key sequence. * \param last1 The end of the key sequence. * \param first2 The beginning of the input value sequence. diff -Nru libthrust-1.15.0/thrust/system/cuda/config.h libthrust-1.16.0/thrust/system/cuda/config.h --- libthrust-1.15.0/thrust/system/cuda/config.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/config.h 2022-02-09 20:04:17.000000000 +0000 @@ -32,7 +32,7 @@ // older releases. This header will always pull in version info: #include -#if defined(__CUDACC__) || defined(__NVCOMPILER_CUDA__) +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) # if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) # define __THRUST_HAS_CUDART__ 1 # define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__ diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/adjacent_difference.h libthrust-1.16.0/thrust/system/cuda/detail/adjacent_difference.h --- libthrust-1.15.0/thrust/system/cuda/detail/adjacent_difference.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/adjacent_difference.h 2022-02-09 20:04:17.000000000 +0000 @@ -210,7 +210,6 @@ Size tile_base) { input_type input[ITEMS_PER_THREAD]; - input_type input_prev[ITEMS_PER_THREAD]; output_type output[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -234,7 +233,7 @@ if (IS_FIRST_TILE) { BlockAdjacentDifference(temp_storage.discontinuity) - .FlagHeads(output, input, input_prev, binary_op); + .SubtractLeft(input, output, binary_op); if (threadIdx.x == 0) output[0] = input[0]; } @@ -242,7 +241,7 @@ { input_type tile_prev_input = first_tile_previous[tile_idx]; BlockAdjacentDifference(temp_storage.discontinuity) - .FlagHeads(output, input, input_prev, binary_op, tile_prev_input); + .SubtractLeft(input, output, binary_op, tile_prev_input); } core::sync_threadblock(); @@ -467,7 +466,7 @@ num_items_fixed, stream, debug_sync)); cuda_cub::throw_on_error(status, "adjacent_difference failed on 2nd step"); - status = cuda_cub::synchronize(policy); + status = cuda_cub::synchronize_optional(policy); cuda_cub::throw_on_error(status, "adjacent_difference failed to synchronize"); return result + num_items; diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/binary_search.h libthrust-1.16.0/thrust/system/cuda/detail/binary_search.h --- libthrust-1.15.0/thrust/system/cuda/detail/binary_search.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/binary_search.h 2022-02-09 20:04:17.000000000 +0000 @@ -1,782 +1,19 @@ -/****************************************************************************** - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ -#pragma once - -#if 0 - -#include - -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#if 1 -# define BS_SIMPLE -#endif - -THRUST_NAMESPACE_BEGIN -namespace cuda_cub { - -namespace __binary_search { - - template - struct lbf - { - typedef typename iterator_traits::difference_type result_type; - typedef typename iterator_traits::value_type T; - - template - THRUST_DEVICE_FUNCTION result_type - operator()(It begin, It end, T const& value, CompareOp comp) - { - return system::detail::generic::scalar::lower_bound(begin, - end, - value, - comp) - - begin; - } - }; // struct lbf - - template - struct ubf - { - typedef typename iterator_traits::difference_type result_type; - typedef typename iterator_traits::value_type T; - - template - THRUST_DEVICE_FUNCTION result_type - operator()(It begin, It end, T const& value, CompareOp comp) - { - return system::detail::generic::scalar::upper_bound(begin, - end, - value, - comp) - - begin; - } - }; // struct ubf - - template - struct bsf - { - typedef bool result_type; - typedef typename iterator_traits::value_type T; - - template - THRUST_DEVICE_FUNCTION bool - operator()(It begin, It end, T const& value, CompareOp comp) - { - HaystackIt iter = system::detail::generic::scalar::lower_bound(begin, - end, - value, - comp); - - detail::wrapped_function wrapped_comp(comp); - - return iter != end && !wrapped_comp(value, *iter); - } - }; // struct bsf - - template - THRUST_DEVICE_FUNCTION Size - merge_path(KeysIt1 keys1, - KeysIt2 keys2, - Size keys1_count, - Size keys2_count, - Size diag, - BinaryPred binary_pred) - { - typedef typename iterator_traits::value_type key1_type; - typedef typename iterator_traits::value_type key2_type; - - Size keys1_begin = thrust::max(0, diag - keys2_count); - Size keys1_end = thrust::min(diag, keys1_count); - - while (keys1_begin < keys1_end) - { - Size mid = (keys1_begin + keys1_end) >> 1; - key1_type key1 = keys1[mid]; - key2_type key2 = keys2[diag - 1 - mid]; - bool pred = binary_pred(key2, key1); - if (pred) - { - keys1_end = mid; - } - else - { - keys1_begin = mid + 1; - } - } - return keys1_begin; - } - - template - THRUST_DEVICE_FUNCTION void - serial_merge(It keys_shared, - int keys1_beg, - int keys2_beg, - int keys1_count, - int keys2_count, - T2 (&output)[ITEMS_PER_THREAD], - int (&indices)[ITEMS_PER_THREAD], - CompareOp compare_op) - { - int keys1_end = keys1_beg 
+ keys1_count; - int keys2_end = keys2_beg + keys2_count; - - typedef typename iterator_value::type key_type; - - key_type key1 = keys_shared[keys1_beg]; - key_type key2 = keys_shared[keys2_beg]; - - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - bool p = (keys2_beg < keys2_end) && - ((keys1_beg >= keys1_end) || - compare_op(key2,key1)); - - output[ITEM] = p ? key2 : key1; - indices[ITEM] = p ? keys2_beg++ : keys1_beg++; - - if (p) - { - key2 = keys_shared[keys2_beg]; - } - else - { - key1 = keys_shared[keys1_beg]; - } - } - } - - template - struct PtxPolicy - { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD - }; - - static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; - }; // PtxPolicy - - template - struct Tuning; - - template - struct Tuning - { - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_STORE_TRANSPOSE> - type; - }; - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_STORE_WARP_TRANSPOSE> - type; - }; - - template - struct VectorizedBinarySearchAgent - { - typedef typename iterator_traits::value_type needle_type; - typedef typename iterator_traits::value_type haystack_type; - typedef typename SearchOp::result_type result_type; - - template - struct PtxPlan : Tuning::type - { - typedef Tuning tuning; - - typedef typename core::LoadIterator::type NeedlesLoadIt; - typedef typename core::LoadIterator::type HaystackLoadIt; - - typedef typename core::BlockLoad::type BlockLoadNeedles; - - typedef typename core::BlockStore::type BlockStoreResult; - - union TempStorage - { - typename BlockLoadNeedles::TempStorage load_needles; - typename BlockStoreResult::TempStorage store_result; - -#ifndef BS_SIMPLE - core::uninitialized_array needles_shared; - core::uninitialized_array result_shared; - core::uninitialized_array indices_shared; -#endif - }; // union TempStorage - }; - - typedef typename core::specialize_plan_msvc10_war::type::type ptx_plan; - - typedef typename ptx_plan::NeedlesLoadIt NeedlesLoadIt; - typedef typename ptx_plan::HaystackLoadIt HaystackLoadIt; - typedef typename ptx_plan::BlockLoadNeedles BlockLoadNeedles; - typedef typename ptx_plan::BlockStoreResult BlockStoreResult; - typedef typename ptx_plan::TempStorage TempStorage; - - enum - { - ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD, - BLOCK_THREADS = ptx_plan::BLOCK_THREADS, - ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE - }; - - struct impl - { - TempStorage& storage; - NeedlesLoadIt needles_load_it; - HaystackLoadIt haystack_load_it; - Size needles_count; - Size haystack_size; - OutputIt result; - CompareOp compare_op; - SearchOp search_op; - - THRUST_DEVICE_FUNCTION - void stable_odd_even_sort(needle_type (&needles)[ITEMS_PER_THREAD], - int (&indices)[ITEMS_PER_THREAD]) - { -#pragma unroll - for (int I = 0; I < 
ITEMS_PER_THREAD; ++I) - { -#pragma unroll - for (int J = 1 & I; J < ITEMS_PER_THREAD - 1; J += 2) - { - if (compare_op(needles[J + 1], needles[J])) - { - using thrust::swap; - swap(needles[J], needles[J + 1]); - swap(indices[J], indices[J + 1]); - } - } // inner loop - } // outer loop - } - - THRUST_DEVICE_FUNCTION void - block_mergesort(int tid, - int count, - needle_type (&needles_loc)[ITEMS_PER_THREAD], - int (&indices_loc)[ITEMS_PER_THREAD]) - { - using core::sync_threadblock; - - // stable sort items in a single thread - // - stable_odd_even_sort(needles_loc,indices_loc); - - // each thread has sorted keys_loc - // merge sort keys_loc in shared memory - // -#pragma unroll - for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2) - { - sync_threadblock(); - - // store keys in shmem - // -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM; - storage.needles_shared[idx] = needles_loc[ITEM]; - } - - sync_threadblock(); - - int indices[ITEMS_PER_THREAD]; - - int list = ~(coop - 1) & tid; - int start = ITEMS_PER_THREAD * list; - int size = ITEMS_PER_THREAD * (coop >> 1); - - int diag = min(count, ITEMS_PER_THREAD * ((coop - 1) & tid)); +/* +* Copyright 2021 NVIDIA Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ - int keys1_beg = min(count, start); - int keys1_end = min(count, keys1_beg + size); - int keys2_beg = keys1_end; - int keys2_end = min(count, keys2_beg + size); - - int keys1_count = keys1_end - keys1_beg; - int keys2_count = keys2_end - keys2_beg; - - int partition_diag = merge_path(&storage.needles_shared[keys1_beg], - &storage.needles_shared[keys2_beg], - keys1_count, - keys2_count, - diag, - compare_op); - - int keys1_beg_loc = keys1_beg + partition_diag; - int keys1_end_loc = keys1_end; - int keys2_beg_loc = keys2_beg + diag - partition_diag; - int keys2_end_loc = keys2_end; - int keys1_count_loc = keys1_end_loc - keys1_beg_loc; - int keys2_count_loc = keys2_end_loc - keys2_beg_loc; - serial_merge(&storage.needles_shared[0], - keys1_beg_loc, - keys2_beg_loc, - keys1_count_loc, - keys2_count_loc, - needles_loc, - indices, - compare_op); - - - sync_threadblock(); - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM; - storage.indices_shared[idx] = indices_loc[ITEM]; - } - - sync_threadblock(); - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - indices_loc[ITEM] = storage.indices_shared[indices[ITEM]]; - } - } - } // func block_merge_sort - - template - THRUST_DEVICE_FUNCTION void - consume_tile(int tid, - Size tile_idx, - Size tile_base, - int num_remaining) - { - using core::sync_threadblock; - - needle_type needles_loc[ITEMS_PER_THREAD]; - BlockLoadNeedles(storage.load_needles) - .Load(needles_load_it + tile_base, needles_loc, num_remaining); - -#ifdef BS_SIMPLE - - result_type results_loc[ITEMS_PER_THREAD]; - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - results_loc[ITEM] = search_op(haystack_load_it, - haystack_load_it + haystack_size, - needles_loc[ITEM], - compare_op); - } - - -#else - - if (IS_LAST_TILE) - { - needle_type max_value = needles_loc[0]; -#pragma unroll - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (ITEMS_PER_THREAD * tid + ITEM < num_remaining) - { - max_value = compare_op(max_value, needles_loc[ITEM]) - ? 
needles_loc[ITEM] - : max_value; - } - else - { - needles_loc[ITEM] = max_value; - } - } - } - - sync_threadblock(); - - int indices_loc[ITEMS_PER_THREAD]; - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD*threadIdx.x + ITEM; - indices_loc[ITEM] = idx; - } - - if (IS_LAST_TILE) - { - block_mergesort(tid, - num_remaining, - needles_loc, - indices_loc); - } - else - { - block_mergesort(tid, - ITEMS_PER_TILE, - needles_loc, - indices_loc); - } - - sync_threadblock(); - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = indices_loc[ITEM]; - storage.result_shared[idx] = - search_op(haystack_load_it, - haystack_load_it + haystack_size, - needles_loc[ITEM], - compare_op); - } - - sync_threadblock(); - - result_type results_loc[ITEMS_PER_THREAD]; -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD*threadIdx.x + ITEM; - results_loc[ITEM] = storage.result_shared[idx]; - } - - sync_threadblock(); -#endif - - BlockStoreResult(storage.store_result) - .Store(result + tile_base, results_loc, num_remaining); - } - - THRUST_DEVICE_FUNCTION - impl(TempStorage& storage_, - NeedlesIt needles_it_, - HaystackIt haystack_it_, - Size needles_count_, - Size haystack_size_, - OutputIt result_, - CompareOp compare_op_, - SearchOp search_op_) - : storage(storage_), - needles_load_it(core::make_load_iterator(ptx_plan(), needles_it_)), - haystack_load_it(core::make_load_iterator(ptx_plan(), haystack_it_)), - needles_count(needles_count_), - haystack_size(haystack_size_), - result(result_), - compare_op(compare_op_), - search_op(search_op_) - { - int tid = threadIdx.x; - Size tile_idx = blockIdx.x; - Size num_tiles = gridDim.x; - Size tile_base = tile_idx * ITEMS_PER_TILE; - int items_in_tile = min(needles_count - tile_base, ITEMS_PER_TILE); - if (tile_idx < num_tiles - 1) - { - consume_tile(tid, tile_idx, tile_base, ITEMS_PER_TILE); - } - else - { - consume_tile(tid, tile_idx, tile_base, items_in_tile); - } - } - }; // struct impl - - - THRUST_AGENT_ENTRY(NeedlesIt needles_it, - HaystackIt haystack_it, - Size needles_count, - Size haystack_size, - OutputIt result, - CompareOp compare_op, - SearchOp search_op, - char* shmem) - { - TempStorage& storage = *reinterpret_cast(shmem); - - impl(storage, - needles_it, - haystack_it, - needles_count, - haystack_size, - result, - compare_op, - search_op); - } - }; // struct VectorizedBinarySearchAgent - - template - cudaError_t THRUST_RUNTIME_FUNCTION - doit_pass(void* d_temp_storage, - size_t& temp_storage_size, - NeedlesIt needles_it, - HaystackIt haystack_it, - Size needles_count, - Size haystack_size, - OutputIt result, - CompareOp compare_op, - SearchOp search_op, - cudaStream_t stream, - bool debug_sync) - { - if (needles_count == 0) - return cudaErrorNotSupported; - - cudaError_t status = cudaSuccess; - - using core::AgentPlan; - using core::AgentLauncher; - - - typedef AgentLauncher< - VectorizedBinarySearchAgent > - search_agent; - - AgentPlan search_plan = search_agent::get_plan(stream); - - temp_storage_size = 1; - if (d_temp_storage == NULL) - { - return status; - } - - search_agent sa(search_plan, needles_count, stream, "binary_search::search_agent", debug_sync); - sa.launch(needles_it, - haystack_it, - needles_count, - haystack_size, - result, - compare_op, - search_op); - - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - - return status; - } - - template - OutputIt THRUST_RUNTIME_FUNCTION - doit(execution_policy& policy, - HaystackIt 
haystack_begin, - HaystackIt haystack_end, - NeedlesIt needles_begin, - NeedlesIt needles_end, - OutputIt result, - CompareOp compare_op, - SearchOp search_op) - { - typedef typename iterator_traits::difference_type size_type; - - size_type needles_count = thrust::distance(needles_begin, needles_end); - size_type haystack_size = thrust::distance(haystack_begin, haystack_end); - - if (needles_count == 0) - return result; - - size_t storage_size = 0; - cudaStream_t stream = cuda_cub::stream(policy); - bool debug_sync = THRUST_DEBUG_SYNC_FLAG; - - cudaError status; - status = doit_pass(NULL, - storage_size, - needles_begin, - haystack_begin, - needles_count, - haystack_size, - result, - compare_op, - search_op, - stream, - debug_sync); - cuda_cub::throw_on_error(status, "binary_search: failed on 1st call"); - - // Allocate temporary storage. - thrust::detail::temporary_array - tmp(policy, storage_size); - void *ptr = static_cast(tmp.data().get()); - - status = doit_pass(ptr, - storage_size, - needles_begin, - haystack_begin, - needles_count, - haystack_size, - result, - compare_op, - search_op, - stream, - debug_sync); - cuda_cub::throw_on_error(status, "binary_search: failed on 2nt call"); - - status = cuda_cub::synchronize(policy); - cuda_cub::throw_on_error(status, "binary_search: failed to synchronize"); - - return result + needles_count; - } - - struct less - { - template - THRUST_DEVICE_FUNCTION bool - operator()(const T1& lhs, const T2& rhs) const - { - return lhs < rhs; - } - }; -} // namespace __binary_search - -//------------------------- -// Thrust API entry points -//------------------------- - -__thrust_exec_check_disable__ -template -OutputIt __host__ __device__ -lower_bound(execution_policy& policy, - HaystackIt first, - HaystackIt last, - NeedlesIt values_first, - NeedlesIt values_last, - OutputIt result, - CompareOp compare_op) -{ - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = __binary_search::doit(policy, - first, - last, - values_first, - values_last, - result, - compare_op, - __binary_search::lbf()); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::lower_bound(cvt_to_seq(derived_cast(policy)), - first, - last, - values_first, - values_last, - result); -#endif - } - return ret; -} - - -template -OutputIt __host__ __device__ -lower_bound(execution_policy& policy, - HaystackIt first, - HaystackIt last, - NeedlesIt values_first, - NeedlesIt values_last, - OutputIt result) -{ - return cuda_cub::lower_bound(policy, - first, - last, - values_first, - values_last, - result, - __binary_search::less()); -} - -} // namespace cuda_cub -THRUST_NAMESPACE_END -#endif +#pragma once -#endif +// this system has no special version of this algorithm diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/core/agent_launcher.h libthrust-1.16.0/thrust/system/cuda/detail/core/agent_launcher.h --- libthrust-1.15.0/thrust/system/cuda/detail/core/agent_launcher.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/core/agent_launcher.h 2022-02-09 20:04:17.000000000 +0000 @@ -50,7 +50,7 @@ namespace core { -#if defined(__CUDA_ARCH__) || defined(__NVCOMPILER_CUDA__) +#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) #if 0 template void __global__ diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/core/triple_chevron_launch.h libthrust-1.16.0/thrust/system/cuda/detail/core/triple_chevron_launch.h --- libthrust-1.15.0/thrust/system/cuda/detail/core/triple_chevron_launch.h 2021-10-29 18:01:06.000000000 +0000 +++ 
libthrust-1.16.0/thrust/system/cuda/detail/core/triple_chevron_launch.h 2022-02-09 20:04:17.000000000 +0000 @@ -834,7 +834,7 @@ } -#if defined(__NVCOMPILER_CUDA__) +#if defined(_NVHPC_CUDA) # define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(...) \ (__builtin_is_device_code() ? \ doit_device(__VA_ARGS__) : doit_host(__VA_ARGS__)) diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/core/util.h libthrust-1.16.0/thrust/system/cuda/detail/core/util.h --- libthrust-1.15.0/thrust/system/cuda/detail/core/util.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/core/util.h 2022-02-09 20:04:17.000000000 +0000 @@ -41,7 +41,7 @@ namespace cuda_cub { namespace core { -#ifdef __NVCOMPILER_CUDA__ +#ifdef _NVHPC_CUDA # if (__NVCOMPILER_CUDA_ARCH__ >= 600) # define THRUST_TUNING_ARCH sm60 # elif (__NVCOMPILER_CUDA_ARCH__ >= 520) @@ -358,7 +358,7 @@ // get_agent_plan_impl::get(version), is for host code and for device // code without device-side kernel launches. NVCC and Feta check for // these situations differently. - #ifdef __NVCOMPILER_CUDA__ + #ifdef _NVHPC_CUDA #ifdef __THRUST_HAS_CUDART__ if (CUB_IS_DEVICE_CODE) { return typename get_plan::type(typename Agent::ptx_plan()); diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/extrema.h libthrust-1.16.0/thrust/system/cuda/detail/extrema.h --- libthrust-1.15.0/thrust/system/cuda/detail/extrema.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/extrema.h 2022-02-09 20:04:17.000000000 +0000 @@ -130,8 +130,11 @@ pair_type const &lhs_min = get<0>(lhs); pair_type const &rhs_max = get<1>(rhs); pair_type const &lhs_max = get<1>(lhs); - return thrust::make_tuple(arg_min_t(predicate)(lhs_min, rhs_min), - arg_max_t(predicate)(lhs_max, rhs_max)); + + auto result = thrust::make_tuple(arg_min_t(predicate)(lhs_min, rhs_min), + arg_max_t(predicate)(lhs_max, rhs_max)); + + return result; } struct duplicate_tuple @@ -265,7 +268,7 @@ // if not enough to fill the device with threadblocks // then fill the device with threadblocks - reduce_grid_size = static_cast(min(num_tiles, static_cast(reduce_device_occupancy))); + reduce_grid_size = static_cast((min)(num_tiles, static_cast(reduce_device_occupancy))); typedef AgentLauncher<__reduce::DrainAgent > drain_agent; AgentPlan drain_plan = drain_agent::get_plan(); diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/fill.h libthrust-1.16.0/thrust/system/cuda/detail/fill.h --- libthrust-1.15.0/thrust/system/cuda/detail/fill.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/fill.h 2022-02-09 20:04:17.000000000 +0000 @@ -72,7 +72,7 @@ count); cuda_cub::throw_on_error( - cuda_cub::synchronize(policy) + cuda_cub::synchronize_optional(policy) , "fill_n: failed to synchronize" ); diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/for_each.h libthrust-1.16.0/thrust/system/cuda/detail/for_each.h --- libthrust-1.15.0/thrust/system/cuda/detail/for_each.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/for_each.h 2022-02-09 20:04:17.000000000 +0000 @@ -82,7 +82,7 @@ count); cuda_cub::throw_on_error( - cuda_cub::synchronize(policy) + cuda_cub::synchronize_optional(policy) , "for_each: failed to synchronize" ); diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/get_value.h libthrust-1.16.0/thrust/system/cuda/detail/get_value.h --- libthrust-1.15.0/thrust/system/cuda/detail/get_value.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/get_value.h 2022-02-09 20:04:17.000000000 
+0000 @@ -66,7 +66,7 @@ // because it would result in a compiler warning, either about falling off // the end of a non-void function, or about result_type's default constructor // being a host-only function. - #ifdef __NVCOMPILER_CUDA__ + #ifdef _NVHPC_CUDA if (THRUST_IS_HOST_CODE) { return war_nvbugs_881631::host_path(exec, ptr); } else { diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/merge.h libthrust-1.16.0/thrust/system/cuda/detail/merge.h --- libthrust-1.15.0/thrust/system/cuda/detail/merge.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/merge.h 2022-02-09 20:04:17.000000000 +0000 @@ -170,7 +170,7 @@ Size partition_idx = blockDim.x * blockIdx.x + threadIdx.x; if (partition_idx < num_partitions) { - Size partition_at = thrust::min(partition_idx * items_per_tile, + Size partition_at = (thrust::min)(partition_idx * items_per_tile, keys1_count + keys2_count); Size partition_diag = merge_path(keys1, keys2, @@ -463,7 +463,7 @@ Size partition_end = merge_partitions[tile_idx + 1]; Size diag0 = ITEMS_PER_TILE * tile_idx; - Size diag1 = thrust::min(keys1_count + keys2_count, diag0 + ITEMS_PER_TILE); + Size diag1 = (thrust::min)(keys1_count + keys2_count, diag0 + ITEMS_PER_TILE); // compute bounding box for keys1 & keys2 // @@ -847,7 +847,7 @@ debug_sync); cuda_cub::throw_on_error(status, "merge: failed on 2nd step"); - status = cuda_cub::synchronize(policy); + status = cuda_cub::synchronize_optional(policy); cuda_cub::throw_on_error(status, "merge: failed to synchronize"); return thrust::make_pair(keys_result + count, items_result + count); diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/par.h libthrust-1.16.0/thrust/system/cuda/detail/par.h --- libthrust-1.15.0/thrust/system/cuda/detail/par.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/par.h 2022-02-09 20:04:17.000000000 +0000 @@ -50,7 +50,7 @@ public: __host__ __device__ execute_on_stream_base(cudaStream_t stream_ = default_stream()) - : stream(stream_) {} + : stream(stream_){} THRUST_RUNTIME_FUNCTION Derived @@ -70,6 +70,42 @@ } }; +template +struct execute_on_stream_nosync_base : execution_policy +{ +private: + cudaStream_t stream; + +public: + __host__ __device__ + execute_on_stream_nosync_base(cudaStream_t stream_ = default_stream()) + : stream(stream_){} + + THRUST_RUNTIME_FUNCTION + Derived + on(cudaStream_t const &s) const + { + Derived result = derived_cast(*this); + result.stream = s; + return result; + } + +private: + friend __host__ __device__ + cudaStream_t + get_stream(const execute_on_stream_nosync_base &exec) + { + return exec.stream; + } + + friend __host__ __device__ + bool + must_perform_optional_stream_synchronization(const execute_on_stream_nosync_base &) + { + return false; + } +}; + struct execute_on_stream : execute_on_stream_base { typedef execute_on_stream_base base_t; @@ -77,7 +113,19 @@ __host__ __device__ execute_on_stream() : base_t(){}; __host__ __device__ - execute_on_stream(cudaStream_t stream) : base_t(stream){}; + execute_on_stream(cudaStream_t stream) + : base_t(stream){}; +}; + +struct execute_on_stream_nosync : execute_on_stream_nosync_base +{ + typedef execute_on_stream_nosync_base base_t; + + __host__ __device__ + execute_on_stream_nosync() : base_t(){}; + __host__ __device__ + execute_on_stream_nosync(cudaStream_t stream) + : base_t(stream){}; }; @@ -104,20 +152,105 @@ } }; +struct par_nosync_t : execution_policy, + thrust::detail::allocator_aware_execution_policy< + execute_on_stream_nosync_base> +#if 
THRUST_CPP_DIALECT >= 2011 +, thrust::detail::dependencies_aware_execution_policy< + execute_on_stream_nosync_base> +#endif +{ + typedef execution_policy base_t; + + __host__ __device__ + constexpr par_nosync_t() : base_t() {} + + typedef execute_on_stream_nosync stream_attachment_type; + + THRUST_RUNTIME_FUNCTION + stream_attachment_type + on(cudaStream_t const &stream) const + { + return execute_on_stream_nosync(stream); + } + +private: + //this function is defined to allow non-blocking calls on the default_stream() with thrust::cuda::par_nosync + //without explicitly using thrust::cuda::par_nosync.on(default_stream()) + friend __host__ __device__ + bool + must_perform_optional_stream_synchronization(const par_nosync_t &) + { + return false; + } +}; + THRUST_INLINE_CONSTANT par_t par; + +/*! \p thrust::cuda::par_nosync is a parallel execution policy targeting Thrust's CUDA device backend. + * Similar to \p thrust::cuda::par it allows execution of Thrust algorithms in a specific CUDA stream. + * + * \p thrust::cuda::par_nosync indicates that an algorithm is free to avoid any synchronization of the + * associated stream that is not strictly required for correctness. Additionally, algorithms may return + * before the corresponding kernels are completed, similar to asynchronous kernel launches via <<< >>> syntax. + * The user must take care to perform explicit synchronization if necessary. + * + * The following code snippet demonstrates how to use \p thrust::cuda::par_nosync : + * + * \code + * #include + * #include + * #include + * + * struct IncFunctor{ + * __host__ __device__ + * void operator()(std::size_t& x){ x = x + 1; }; + * }; + * + * int main(){ + * std::size_t N = 1000000; + * thrust::device_vector d_vec(N); + * + * cudaStream_t stream; + * cudaStreamCreate(&stream); + * auto nosync_policy = thrust::cuda::par_nosync.on(stream); + * + * thrust::for_each(nosync_policy, d_vec.begin(), d_vec.end(), IncFunctor{}); + * thrust::for_each(nosync_policy, d_vec.begin(), d_vec.end(), IncFunctor{}); + * thrust::for_each(nosync_policy, d_vec.begin(), d_vec.end(), IncFunctor{}); + * + * //for_each may return before completion. Could do other cpu work in the meantime + * // ... + * + * //Wait for the completion of all for_each kernels + * cudaStreamSynchronize(stream); + * + * std::size_t x = thrust::reduce(nosync_policy, d_vec.begin(), d_vec.end()); + * //Currently, this synchronization is not necessary. reduce will still perform + * //implicit synchronization to transfer the reduced value to the host to return it. 
+ * cudaStreamSynchronize(stream); + * cudaStreamDestroy(stream); + * } + * \endcode + * + */ +THRUST_INLINE_CONSTANT par_nosync_t par_nosync; } // namespace cuda_ namespace system { namespace cuda { using thrust::cuda_cub::par; + using thrust::cuda_cub::par_nosync; namespace detail { using thrust::cuda_cub::par_t; + using thrust::cuda_cub::par_nosync_t; } } // namesapce cuda } // namespace system namespace cuda { using thrust::cuda_cub::par; +using thrust::cuda_cub::par_nosync; } // namespace cuda THRUST_NAMESPACE_END diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/reduce.h libthrust-1.16.0/thrust/system/cuda/detail/reduce.h --- libthrust-1.15.0/thrust/system/cuda/detail/reduce.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/reduce.h 2022-02-09 20:04:17.000000000 +0000 @@ -808,7 +808,7 @@ // if not enough to fill the device with threadblocks // then fill the device with threadblocks - reduce_grid_size = static_cast(min(num_tiles, static_cast(reduce_device_occupancy))); + reduce_grid_size = static_cast((min)(num_tiles, static_cast(reduce_device_occupancy))); typedef AgentLauncher > drain_agent; AgentPlan drain_plan = drain_agent::get_plan(); @@ -984,8 +984,8 @@ // Synchronize the stream and get the value. - cuda_cub::throw_on_error(cuda_cub::synchronize(policy), - "reduce failed to synchronize"); + status = cuda_cub::synchronize(policy); + cuda_cub::throw_on_error(status, "reduce failed to synchronize"); // `tmp.begin()` yields a `normal_iterator`, which dereferences to a // `reference`, which has an `operator&` that returns a `pointer`, which diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/scan_by_key.h libthrust-1.16.0/thrust/system/cuda/detail/scan_by_key.h --- libthrust-1.15.0/thrust/system/cuda/detail/scan_by_key.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/scan_by_key.h 2022-02-09 20:04:17.000000000 +0000 @@ -775,7 +775,7 @@ debug_sync); cuda_cub::throw_on_error(status, "scan_by_key: failed on 2nd step"); - status = cuda_cub::synchronize(policy); + status = cuda_cub::synchronize_optional(policy); cuda_cub::throw_on_error(status, "scan_by_key: failed to synchronize"); return values_result + num_items; diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/scan.h libthrust-1.16.0/thrust/system/cuda/detail/scan.h --- libthrust-1.15.0/thrust/system/cuda/detail/scan.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/scan.h 2022-02-09 20:04:17.000000000 +0000 @@ -115,7 +115,7 @@ THRUST_DEBUG_SYNC_FLAG)); thrust::cuda_cub::throw_on_error(status, "after dispatching inclusive_scan kernel"); - thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize(policy), + thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize_optional(policy), "inclusive_scan failed to synchronize"); } @@ -194,7 +194,7 @@ THRUST_DEBUG_SYNC_FLAG)); thrust::cuda_cub::throw_on_error(status, "after dispatching exclusive_scan kernel"); - thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize(policy), + thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize_optional(policy), "exclusive_scan failed to synchronize"); } diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/sort.h libthrust-1.16.0/thrust/system/cuda/detail/sort.h --- libthrust-1.15.0/thrust/system/cuda/detail/sort.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/sort.h 2022-02-09 20:04:17.000000000 +0000 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ 
-48,1225 +49,112 @@ #include #include + THRUST_NAMESPACE_BEGIN namespace cuda_cub { namespace __merge_sort { - template - THRUST_DEVICE_FUNCTION Size - merge_path(KeysIt1 keys1, - KeysIt2 keys2, - Size keys1_count, - Size keys2_count, - Size diag, - BinaryPred binary_pred) - { - typedef typename iterator_traits::value_type key1_type; - typedef typename iterator_traits::value_type key2_type; - - Size keys1_begin = thrust::max(0, diag - keys2_count); - Size keys1_end = thrust::min(diag, keys1_count); - - while (keys1_begin < keys1_end) - { - Size mid = (keys1_begin + keys1_end) >> 1; - key1_type key1 = keys1[mid]; - key2_type key2 = keys2[diag - 1 - mid]; - bool pred = binary_pred(key2, key1); - if (pred) - { - keys1_end = mid; - } - else - { - keys1_begin = mid + 1; - } - } - return keys1_begin; - } - - template - THRUST_DEVICE_FUNCTION void - serial_merge(It keys_shared, - int keys1_beg, - int keys2_beg, - int keys1_count, - int keys2_count, - T2 (&output)[ITEMS_PER_THREAD], - int (&indices)[ITEMS_PER_THREAD], - CompareOp compare_op) - { - int keys1_end = keys1_beg + keys1_count; - int keys2_end = keys2_beg + keys2_count; - - typedef typename iterator_value::type key_type; - - key_type key1 = keys_shared[keys1_beg]; - key_type key2 = keys_shared[keys2_beg]; - - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - bool p = (keys2_beg < keys2_end) && - ((keys1_beg >= keys1_end) || - compare_op(key2,key1)); - - output[ITEM] = p ? key2 : key1; - indices[ITEM] = p ? keys2_beg++ : keys1_beg++; - - if (p) - { - key2 = keys_shared[keys2_beg]; - } - else - { - key1 = keys_shared[keys1_beg]; - } - } - } - - template - struct PtxPolicy - { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD, - }; - - static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; - }; // PtxPolicy - - - template - struct Tuning; - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 11, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<256, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_STORE_WARP_TRANSPOSE> - type; - }; - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 15, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<512, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_STORE_WARP_TRANSPOSE> - type; - }; - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 17, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<256, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_DEFAULT, - cub::BLOCK_STORE_WARP_TRANSPOSE> - type; - }; - - template - struct Tuning - { - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - 
cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_DEFAULT, - cub::BLOCK_STORE_WARP_TRANSPOSE> - type; - }; - template - struct BlockSortAgent - { - typedef typename iterator_traits::value_type key_type; - typedef typename iterator_traits::value_type item_type; - - template - struct PtxPlan : Tuning::type - { - typedef Tuning tuning; - - typedef typename core::LoadIterator::type KeysLoadIt; - typedef typename core::LoadIterator::type ItemsLoadIt; - - typedef typename core::BlockLoad::type BlockLoadKeys; - typedef typename core::BlockLoad::type BlockLoadItems; - - typedef typename core::BlockStore::type BlockStoreKeysIt; - typedef typename core::BlockStore::type BlockStoreItemsIt; - typedef typename core::BlockStore::type BlockStoreKeysRaw; - typedef typename core::BlockStore::type BlockStoreItemsRaw; - - union TempStorage - { - typename BlockLoadKeys::TempStorage load_keys; - typename BlockLoadItems::TempStorage load_items; - typename BlockStoreKeysIt::TempStorage store_keys_it; - typename BlockStoreItemsIt::TempStorage store_items_it; - typename BlockStoreKeysRaw::TempStorage store_keys_raw; - typename BlockStoreItemsRaw::TempStorage store_items_raw; - - core::uninitialized_array keys_shared; - core::uninitialized_array items_shared; - }; // union TempStorage - }; // struct PtxPlan - - typedef typename core::specialize_plan_msvc10_war::type::type ptx_plan; - - typedef typename ptx_plan::KeysLoadIt KeysLoadIt; - typedef typename ptx_plan::ItemsLoadIt ItemsLoadIt; - typedef typename ptx_plan::BlockLoadKeys BlockLoadKeys; - typedef typename ptx_plan::BlockLoadItems BlockLoadItems; - typedef typename ptx_plan::BlockStoreKeysIt BlockStoreKeysIt; - typedef typename ptx_plan::BlockStoreItemsIt BlockStoreItemsIt; - typedef typename ptx_plan::BlockStoreKeysRaw BlockStoreKeysRaw; - typedef typename ptx_plan::BlockStoreItemsRaw BlockStoreItemsRaw; - typedef typename ptx_plan::TempStorage TempStorage; - - enum - { - ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD, - BLOCK_THREADS = ptx_plan::BLOCK_THREADS, - ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE - }; - - struct impl - { - //--------------------------------------------------------------------- - // Per thread data - //--------------------------------------------------------------------- - - bool ping; - TempStorage& storage; - KeysLoadIt keys_in; - ItemsLoadIt items_in; - Size keys_count; - KeysIt keys_out_it; - ItemsIt items_out_it; - key_type* keys_out_raw; - item_type* items_out_raw; - CompareOp compare_op; - - //--------------------------------------------------------------------- - // Serial stable sort network - //--------------------------------------------------------------------- - - THRUST_DEVICE_FUNCTION - void stable_odd_even_sort(key_type (&keys)[ITEMS_PER_THREAD], - item_type (&items)[ITEMS_PER_THREAD]) - { -#pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { -#pragma unroll - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) - { - if (compare_op(keys[j + 1], keys[j])) - { - using thrust::swap; - swap(keys[j], keys[j + 1]); - if (SORT_ITEMS::value) - { - swap(items[j], items[j + 1]); - } - } - } // inner loop - } // outer loop - } - - //--------------------------------------------------------------------- - // Parallel thread block merge sort - //--------------------------------------------------------------------- - - template - THRUST_DEVICE_FUNCTION void - block_mergesort(int tid, - int count, - key_type (&keys_loc)[ITEMS_PER_THREAD], - item_type (&items_loc)[ITEMS_PER_THREAD]) - { - using core::uninitialized_array; - using 
core::sync_threadblock; - - // if first element of thread is in input range, stable sort items - // - if (!IS_LAST_TILE || ITEMS_PER_THREAD * tid < count) - { - stable_odd_even_sort(keys_loc, items_loc); - } - - // each thread has sorted keys_loc - // merge sort keys_loc in shared memory - // -#pragma unroll - for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2) - { - sync_threadblock(); - - // store keys in shmem - // -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM; - storage.keys_shared[idx] = keys_loc[ITEM]; - } - - sync_threadblock(); - - int indices[ITEMS_PER_THREAD]; - - int list = ~(coop - 1) & tid; - int start = ITEMS_PER_THREAD * list; - int size = ITEMS_PER_THREAD * (coop >> 1); - - int diag = min(count, - ITEMS_PER_THREAD * ((coop - 1) & tid)); - - int keys1_beg = min(count, start); - int keys1_end = min(count, keys1_beg + size); - int keys2_beg = keys1_end; - int keys2_end = min(count, keys2_beg + size); - - int keys1_count = keys1_end - keys1_beg; - int keys2_count = keys2_end - keys2_beg; - - int partition_diag = merge_path(&storage.keys_shared[keys1_beg], - &storage.keys_shared[keys2_beg], - keys1_count, - keys2_count, - diag, - compare_op); - - int keys1_beg_loc = keys1_beg + partition_diag; - int keys1_end_loc = keys1_end; - int keys2_beg_loc = keys2_beg + diag - partition_diag; - int keys2_end_loc = keys2_end; - int keys1_count_loc = keys1_end_loc - keys1_beg_loc; - int keys2_count_loc = keys2_end_loc - keys2_beg_loc; - serial_merge(&storage.keys_shared[0], - keys1_beg_loc, - keys2_beg_loc, - keys1_count_loc, - keys2_count_loc, - keys_loc, - indices, - compare_op); - - - if (SORT_ITEMS::value) - { - sync_threadblock(); - - // store keys in shmem - // -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM; - storage.items_shared[idx] = items_loc[ITEM]; - } - - sync_threadblock(); - - // gather items from shmem - // -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - items_loc[ITEM] = storage.items_shared[indices[ITEM]]; - } - } - } - } // func block_merge_sort - - //--------------------------------------------------------------------- - // Tile processing - //--------------------------------------------------------------------- - - template - THRUST_DEVICE_FUNCTION void - consume_tile(int tid, - Size /*tile_idx*/, - Size tile_base, - int num_remaining) - { - using core::uninitialized_array; - using core::sync_threadblock; - - item_type items_loc[ITEMS_PER_THREAD]; - if (SORT_ITEMS::value) - { - BlockLoadItems(storage.load_items) - .Load(items_in + tile_base, - items_loc, - num_remaining, - *(items_in + tile_base)); - - sync_threadblock(); - } - - key_type keys_loc[ITEMS_PER_THREAD]; - if (IS_LAST_TILE) - { - BlockLoadKeys(storage.load_keys) - .Load(keys_in + tile_base, - keys_loc, - num_remaining, - *(keys_in + tile_base)); - } - else - { - BlockLoadKeys(storage.load_keys) - .Load(keys_in + tile_base, keys_loc); - } - - if (IS_LAST_TILE) - { - // if last tile, find valid max_key - // and fill the remainig keys with it - // - key_type max_key = keys_loc[0]; -#pragma unroll - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (ITEMS_PER_THREAD * tid + ITEM < num_remaining) - { - max_key = compare_op(max_key, keys_loc[ITEM]) - ? 
keys_loc[ITEM] - : max_key; - } - else - { - keys_loc[ITEM] = max_key; - } - } - } - - sync_threadblock(); - - if (IS_LAST_TILE) - { - block_mergesort(tid, - num_remaining, - keys_loc, - items_loc); - } - else - { - block_mergesort(tid, - ITEMS_PER_TILE, - keys_loc, - items_loc); - } - - sync_threadblock(); - - if (ping) - { - if (IS_LAST_TILE) - { - BlockStoreKeysIt(storage.store_keys_it) - .Store(keys_out_it + tile_base, keys_loc, num_remaining); - } - else - { - BlockStoreKeysIt(storage.store_keys_it) - .Store(keys_out_it + tile_base, keys_loc); - } - - if (SORT_ITEMS::value) - { - sync_threadblock(); - - BlockStoreItemsIt(storage.store_items_it) - .Store(items_out_it + tile_base, items_loc, num_remaining); - } - } - else - { - if (IS_LAST_TILE) - { - BlockStoreKeysRaw(storage.store_keys_raw) - .Store(keys_out_raw + tile_base, keys_loc, num_remaining); - } - else - { - BlockStoreKeysRaw(storage.store_keys_raw) - .Store(keys_out_raw + tile_base, keys_loc); - } - - if (SORT_ITEMS::value) - { - sync_threadblock(); - - BlockStoreItemsRaw(storage.store_items_raw) - .Store(items_out_raw + tile_base, items_loc, num_remaining); - } - } - } - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - THRUST_DEVICE_FUNCTION - impl(bool ping_, - TempStorage& storage_, - KeysLoadIt keys_in_, - ItemsLoadIt items_in_, - Size keys_count_, - KeysIt keys_out_it_, - ItemsIt items_out_it_, - key_type* keys_out_raw_, - item_type* items_out_raw_, - CompareOp compare_op_) - : ping(ping_), - storage(storage_), - keys_in(keys_in_), - items_in(items_in_), - keys_count(keys_count_), - keys_out_it(keys_out_it_), - items_out_it(items_out_it_), - keys_out_raw(keys_out_raw_), - items_out_raw(items_out_raw_), - compare_op(compare_op_) - { - int tid = threadIdx.x; - Size tile_idx = blockIdx.x; - Size num_tiles = gridDim.x; - Size tile_base = tile_idx * ITEMS_PER_TILE; - int items_in_tile = min(keys_count - tile_base, ITEMS_PER_TILE); - if (tile_idx < num_tiles - 1) - { - consume_tile(tid, tile_idx, tile_base, ITEMS_PER_TILE); - } - else - { - consume_tile(tid, tile_idx, tile_base, items_in_tile); - } - } - }; // struct impl - - //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- - - THRUST_AGENT_ENTRY(bool ping, - KeysIt keys_inout, - ItemsIt items_inout, - Size keys_count, - key_type* keys_out, - item_type* items_out, - CompareOp compare_op, - char* shmem) - { - TempStorage& storage = *reinterpret_cast(shmem); - - impl(ping, - storage, - core::make_load_iterator(ptx_plan(), keys_inout), - core::make_load_iterator(ptx_plan(), items_inout), - keys_count, - keys_inout, - items_inout, - keys_out, - items_out, - compare_op); - } - }; // struct BlockSortAgent - - template - struct PartitionAgent + THRUST_RUNTIME_FUNCTION cudaError_t + doit_step(void* d_temp_storage, + size_t& temp_storage_bytes, + KeysIt keys, + ItemsIt , + Size keys_count, + CompareOp compare_op, + cudaStream_t stream, + bool debug_sync, + thrust::detail::integral_constant /* sort_keys */) { - typedef typename iterator_traits::value_type key_type; - template - struct PtxPlan : PtxPolicy<256> {}; - - typedef core::specialize_plan ptx_plan; - - //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- - - 
THRUST_AGENT_ENTRY(bool ping, - KeysIt keys_ping, - key_type* keys_pong, - Size keys_count, - Size num_partitions, - Size* merge_partitions, - CompareOp compare_op, - Size coop, - int items_per_tile, - char* /*shmem*/) - { - Size partition_idx = blockDim.x * blockIdx.x + threadIdx.x; - if (partition_idx < num_partitions) - { - Size list = ~(coop - 1) & partition_idx; - Size start = items_per_tile * list; - Size size = items_per_tile * (coop >> 1); - - Size keys1_beg = min(keys_count, start); - Size keys1_end = min(keys_count, start + size); - Size keys2_beg = keys1_end; - Size keys2_end = min(keys_count, keys2_beg + size); - - - Size partition_at = min(keys2_end - keys1_beg, - items_per_tile * ((coop - 1) & partition_idx)); - - Size partition_diag = ping ? merge_path(keys_ping + keys1_beg, - keys_ping + keys2_beg, - keys1_end - keys1_beg, - keys2_end - keys2_beg, - partition_at, - compare_op) - : merge_path(keys_pong + keys1_beg, - keys_pong + keys2_beg, - keys1_end - keys1_beg, - keys2_end - keys2_beg, - partition_at, - compare_op); - + using ItemsInputIt = cub::NullType *; + ItemsInputIt items = nullptr; - merge_partitions[partition_idx] = keys1_beg + partition_diag; - } - } - }; // struct PartitionAgent + using DispatchMergeSortT = cub::DispatchMergeSort; + + + return DispatchMergeSortT::Dispatch(d_temp_storage, + temp_storage_bytes, + keys, + items, + keys, + items, + keys_count, + compare_op, + stream, + debug_sync); + } template - struct MergeAgent + class CompareOp> + THRUST_RUNTIME_FUNCTION cudaError_t + doit_step(void *d_temp_storage, + size_t &temp_storage_bytes, + KeysIt keys, + ItemsIt items, + Size keys_count, + CompareOp compare_op, + cudaStream_t stream, + bool debug_sync, + thrust::detail::integral_constant /* sort_items */) { - typedef typename iterator_traits::value_type key_type; - typedef typename iterator_traits::value_type item_type; - - typedef KeysIt KeysOutputPongIt; - typedef ItemsIt ItemsOutputPongIt; - typedef key_type* KeysOutputPingIt; - typedef item_type* ItemsOutputPingIt; - - template - struct PtxPlan : Tuning::type - { - typedef Tuning tuning; - - typedef typename core::LoadIterator::type KeysLoadPingIt; - typedef typename core::LoadIterator::type ItemsLoadPingIt; - typedef typename core::LoadIterator::type KeysLoadPongIt; - typedef typename core::LoadIterator::type ItemsLoadPongIt; - - typedef typename core::BlockLoad::type BlockLoadKeysPing; - typedef typename core::BlockLoad::type BlockLoadItemsPing; - typedef typename core::BlockLoad::type BlockLoadKeysPong; - typedef typename core::BlockLoad::type BlockLoadItemsPong; - - typedef typename core::BlockStore::type BlockStoreKeysPong; - typedef typename core::BlockStore::type BlockStoreItemsPong; - typedef typename core::BlockStore::type BlockStoreKeysPing; - typedef typename core::BlockStore::type BlockStoreItemsPing; - - // gather required temporary storage in a union - // - union TempStorage - { - typename BlockLoadKeysPing::TempStorage load_keys_ping; - typename BlockLoadItemsPing::TempStorage load_items_ping; - typename BlockLoadKeysPong::TempStorage load_keys_pong; - typename BlockLoadItemsPong::TempStorage load_items_pong; - - typename BlockStoreKeysPing::TempStorage store_keys_ping; - typename BlockStoreItemsPing::TempStorage store_items_ping; - typename BlockStoreKeysPong::TempStorage store_keys_pong; - typename BlockStoreItemsPong::TempStorage store_items_pong; - - core::uninitialized_array keys_shared; - core::uninitialized_array items_shared; - }; // union TempStorage - }; // struct PtxPlan - - 
typedef typename core::specialize_plan_msvc10_war::type::type ptx_plan; - - typedef typename ptx_plan::KeysLoadPingIt KeysLoadPingIt; - typedef typename ptx_plan::ItemsLoadPingIt ItemsLoadPingIt; - typedef typename ptx_plan::KeysLoadPongIt KeysLoadPongIt; - typedef typename ptx_plan::ItemsLoadPongIt ItemsLoadPongIt; - - typedef typename ptx_plan::BlockLoadKeysPing BlockLoadKeysPing; - typedef typename ptx_plan::BlockLoadItemsPing BlockLoadItemsPing; - typedef typename ptx_plan::BlockLoadKeysPong BlockLoadKeysPong; - typedef typename ptx_plan::BlockLoadItemsPong BlockLoadItemsPong; - - typedef typename ptx_plan::BlockStoreKeysPing BlockStoreKeysPing; - typedef typename ptx_plan::BlockStoreItemsPing BlockStoreItemsPing; - typedef typename ptx_plan::BlockStoreKeysPong BlockStoreKeysPong; - typedef typename ptx_plan::BlockStoreItemsPong BlockStoreItemsPong; - - typedef typename ptx_plan::TempStorage TempStorage; - - enum - { - ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD, - BLOCK_THREADS = ptx_plan::BLOCK_THREADS, - ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE - }; - - struct impl - { - //--------------------------------------------------------------------- - // Per thread data - //--------------------------------------------------------------------- - - bool ping; - TempStorage& storage; - - KeysLoadPingIt keys_in_ping; - ItemsLoadPingIt items_in_ping; - KeysLoadPongIt keys_in_pong; - ItemsLoadPongIt items_in_pong; - - Size keys_count; - - KeysOutputPongIt keys_out_pong; - ItemsOutputPongIt items_out_pong; - KeysOutputPingIt keys_out_ping; - ItemsOutputPingIt items_out_ping; - - CompareOp compare_op; - Size* merge_partitions; - Size coop; - - //--------------------------------------------------------------------- - // Utility functions - //--------------------------------------------------------------------- - - template - THRUST_DEVICE_FUNCTION void - gmem_to_reg(T (&output)[ITEMS_PER_THREAD], - It1 input1, - It2 input2, - int count1, - int count2) - { - if (IS_FULL_TILE) - { -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = BLOCK_THREADS * ITEM + threadIdx.x; - output[ITEM] = (idx < count1) ? input1[idx] : input2[idx - count1]; - } - } - else - { -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = BLOCK_THREADS * ITEM + threadIdx.x; - if (idx < count1 + count2) - { - output[ITEM] = (idx < count1) ? 
input1[idx] : input2[idx - count1]; - } - } - } - } - - template - THRUST_DEVICE_FUNCTION void - reg_to_shared(It output, - T (&input)[ITEMS_PER_THREAD]) - { -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = BLOCK_THREADS * ITEM + threadIdx.x; - output[idx] = input[ITEM]; - } - } - - //--------------------------------------------------------------------- - // Tile processing - //--------------------------------------------------------------------- - - template - THRUST_DEVICE_FUNCTION void - consume_tile(int tid, - Size tile_idx, - Size tile_base, - int count) - { - using core::sync_threadblock; - using core::uninitialized_array; - - Size partition_beg = merge_partitions[tile_idx + 0]; - Size partition_end = merge_partitions[tile_idx + 1]; - - Size list = ~(coop - 1) & tile_idx; - Size start = ITEMS_PER_TILE * list; - Size size = ITEMS_PER_TILE * (coop >> 1); - - Size diag = ITEMS_PER_TILE * tile_idx - start; - - Size keys1_beg = partition_beg; - Size keys1_end = partition_end; - Size keys2_beg = min(keys_count, 2 * start + size + diag - partition_beg); - Size keys2_end = min(keys_count, 2 * start + size + diag + ITEMS_PER_TILE - partition_end); - - if (coop - 1 == ((coop - 1) & tile_idx)) - { - keys1_end = min(keys_count, start + size); - keys2_end = min(keys_count, start + size * 2); - } - - // number of keys per tile - // - int num_keys1 = static_cast(keys1_end - keys1_beg); - int num_keys2 = static_cast(keys2_end - keys2_beg); - - // load keys1 & keys2 - key_type keys_loc[ITEMS_PER_THREAD]; - if (ping) - { - gmem_to_reg(keys_loc, - keys_in_ping + keys1_beg, - keys_in_ping + keys2_beg, - num_keys1, - num_keys2); - } - else - { - gmem_to_reg(keys_loc, - keys_in_pong + keys1_beg, - keys_in_pong + keys2_beg, - num_keys1, - num_keys2); - } - reg_to_shared(&storage.keys_shared[0], keys_loc); - - // preload items into registers already - // - item_type items_loc[ITEMS_PER_THREAD]; - if (MERGE_ITEMS::value) - { - if (ping) - { - gmem_to_reg(items_loc, - items_in_ping + keys1_beg, - items_in_ping + keys2_beg, - num_keys1, - num_keys2); - } - else - { - gmem_to_reg(items_loc, - items_in_pong + keys1_beg, - items_in_pong + keys2_beg, - num_keys1, - num_keys2); - } - } - - sync_threadblock(); - - // use binary search in shared memory - // to find merge path for each of thread - // we can use int type here, because the number of - // items in shared memory is limited - // - int diag0_loc = min(num_keys1 + num_keys2, - ITEMS_PER_THREAD * tid); - - int keys1_beg_loc = merge_path(&storage.keys_shared[0], - &storage.keys_shared[num_keys1], - num_keys1, - num_keys2, - diag0_loc, - compare_op); - int keys1_end_loc = num_keys1; - int keys2_beg_loc = diag0_loc - keys1_beg_loc; - int keys2_end_loc = num_keys2; - - int num_keys1_loc = keys1_end_loc - keys1_beg_loc; - int num_keys2_loc = keys2_end_loc - keys2_beg_loc; - - // perform serial merge - // - int indices[ITEMS_PER_THREAD]; - - serial_merge(&storage.keys_shared[0], - keys1_beg_loc, - keys2_beg_loc + num_keys1, - num_keys1_loc, - num_keys2_loc, - keys_loc, - indices, - compare_op); - - sync_threadblock(); - - // write keys - // - if (ping) - { - if (IS_FULL_TILE) - { - BlockStoreKeysPing(storage.store_keys_ping) - .Store(keys_out_ping + tile_base, keys_loc); - } - else - { - BlockStoreKeysPing(storage.store_keys_ping) - .Store(keys_out_ping + tile_base, keys_loc, num_keys1 + num_keys2); - } - } - else - { - if (IS_FULL_TILE) - { - BlockStoreKeysPong(storage.store_keys_pong) - .Store(keys_out_pong + tile_base, 
keys_loc); - } - else - { - BlockStoreKeysPong(storage.store_keys_pong) - .Store(keys_out_pong + tile_base, keys_loc, num_keys1 + num_keys2); - } - } - - // if items are provided, merge them - if (MERGE_ITEMS::value) - { - sync_threadblock(); - - reg_to_shared(&storage.items_shared[0], items_loc); - - sync_threadblock(); - - // gather items from shared mem - // -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - items_loc[ITEM] = storage.items_shared[indices[ITEM]]; - } - - sync_threadblock(); - - // write from reg to gmem - // - if (ping) - { - if (IS_FULL_TILE) - { - BlockStoreItemsPing(storage.store_items_ping) - .Store(items_out_ping + tile_base, items_loc); - } - else - { - BlockStoreItemsPing(storage.store_items_ping) - .Store(items_out_ping + tile_base, items_loc, count); - } - } - else - { - if (IS_FULL_TILE) - { - BlockStoreItemsPong(storage.store_items_pong) - .Store(items_out_pong + tile_base, items_loc); - } - else - { - BlockStoreItemsPong(storage.store_items_pong) - .Store(items_out_pong + tile_base, items_loc, count); - } - } - } - } + using DispatchMergeSortT = + cub::DispatchMergeSort; - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - THRUST_DEVICE_FUNCTION - impl(bool ping_, - TempStorage& storage_, - KeysLoadPingIt keys_in_ping_, - ItemsLoadPingIt items_in_ping_, - KeysLoadPongIt keys_in_pong_, - ItemsLoadPongIt items_in_pong_, - Size keys_count_, - KeysOutputPingIt keys_out_ping_, - ItemsOutputPingIt items_out_ping_, - KeysOutputPongIt keys_out_pong_, - ItemsOutputPongIt items_out_pong_, - CompareOp compare_op_, - Size* merge_partitions_, - Size coop_) - : ping(ping_), - storage(storage_), - keys_in_ping(keys_in_ping_), - items_in_ping(items_in_ping_), - keys_in_pong(keys_in_pong_), - items_in_pong(items_in_pong_), - keys_count(keys_count_), - keys_out_pong(keys_out_pong_), - items_out_pong(items_out_pong_), - keys_out_ping(keys_out_ping_), - items_out_ping(items_out_ping_), - compare_op(compare_op_), - merge_partitions(merge_partitions_), - coop(coop_) - { - // XXX with 8.5 chaging type to Size (or long long) results in error! 
- int tile_idx = blockIdx.x; - Size num_tiles = gridDim.x; - Size tile_base = Size(tile_idx) * ITEMS_PER_TILE; - int tid = threadIdx.x; - int items_in_tile = static_cast(min((Size)ITEMS_PER_TILE, - keys_count - tile_base)); - if (tile_idx < num_tiles - 1) - { - consume_tile(tid, - tile_idx, - tile_base, - ITEMS_PER_TILE); - } - else - { - consume_tile(tid, - tile_idx, - tile_base, - items_in_tile); - } - } - }; // struct impl - - //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- - - THRUST_AGENT_ENTRY(bool ping, - KeysIt keys_ping, - ItemsIt items_ping, - Size keys_count, - key_type* keys_pong, - item_type* items_pong, - CompareOp compare_op, - Size* merge_partitions, - Size coop, - char* shmem) - { - TempStorage& storage = *reinterpret_cast(shmem); - - impl(ping, - storage, - core::make_load_iterator(ptx_plan(), keys_ping), - core::make_load_iterator(ptx_plan(), items_ping), - core::make_load_iterator(ptx_plan(), keys_pong), - core::make_load_iterator(ptx_plan(), items_pong), - keys_count, - keys_pong, - items_pong, - keys_ping, - items_ping, - compare_op, - merge_partitions, - coop); - } - }; // struct MergeAgent; - - ///////////////////////// + return DispatchMergeSortT::Dispatch(d_temp_storage, + temp_storage_bytes, + keys, + items, + keys, + items, + keys_count, + compare_op, + stream, + debug_sync); + } template THRUST_RUNTIME_FUNCTION cudaError_t - doit_step(void* d_temp_storage, - size_t& temp_storage_bytes, - KeysIt keys, - ItemsIt items, - Size keys_count, - CompareOp compare_op, + doit_step(void *d_temp_storage, + size_t &temp_storage_bytes, + KeysIt keys, + ItemsIt items, + Size keys_count, + CompareOp compare_op, cudaStream_t stream, - bool debug_sync) + bool debug_sync) { - using core::AgentPlan; - using core::get_agent_plan; - - typedef typename iterator_traits::value_type key_type; - typedef typename iterator_traits::value_type item_type; - - typedef core::AgentLauncher< - BlockSortAgent > - block_sort_agent; - - typedef core::AgentLauncher > - partition_agent; - - typedef core::AgentLauncher< - MergeAgent > - merge_agent; - - cudaError_t status = cudaSuccess; - if (keys_count == 0) - return status; - - typename core::get_plan::type partition_plan = - partition_agent::get_plan(); - - typename core::get_plan::type merge_plan = - merge_agent::get_plan(stream); - - AgentPlan block_sort_plan = merge_plan; - - int tile_size = merge_plan.items_per_tile; - Size num_tiles = (keys_count + tile_size - 1) / tile_size; - - size_t temp_storage1 = (1 + num_tiles) * sizeof(Size); - size_t temp_storage2 = keys_count * sizeof(key_type); - size_t temp_storage3 = keys_count * sizeof(item_type) * SORT_ITEMS::value; - size_t temp_storage4 = core::vshmem_size(max(block_sort_plan.shared_memory_size, - merge_plan.shared_memory_size), - num_tiles); - - void* allocations[4] = {NULL, NULL, NULL, NULL}; - size_t allocation_sizes[4] = {temp_storage1, temp_storage2, temp_storage3, temp_storage4}; - - status = core::alias_storage(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes); - CUDA_CUB_RET_IF_FAIL(status); - - if (d_temp_storage == NULL) - { - return status; - }; - - int num_passes = static_cast(thrust::detail::log2_ri(num_tiles)); - bool ping = !(1 & num_passes); - - Size* merge_partitions = (Size*)allocations[0]; - key_type* keys_buffer = (key_type*)allocations[1]; - item_type* items_buffer = (item_type*)allocations[2]; - - char* vshmem_ptr = temp_storage4 > 
0 ? (char*)allocations[3] : NULL; - - - block_sort_agent(block_sort_plan, keys_count, stream, vshmem_ptr, "block_sort_agent", debug_sync) - .launch(ping, keys, items, keys_count, keys_buffer, items_buffer, compare_op); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - - size_t num_partitions = num_tiles + 1; - - partition_agent pa(partition_plan, num_partitions, stream, "partition_agent", debug_sync); - merge_agent ma(merge_plan, keys_count, stream, vshmem_ptr, "merge_agent", debug_sync); - - for (int pass = 0; pass < num_passes; ++pass, ping = !ping) { - Size coop = Size(2) << pass; - - pa.launch(ping, - keys, - keys_buffer, - keys_count, - num_partitions, - merge_partitions, - compare_op, - coop, - merge_plan.items_per_tile); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - - - ma.launch(ping, - keys, - items, - keys_count, - keys_buffer, - items_buffer, - compare_op, - merge_partitions, - coop); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); + return cudaSuccess; } - return status; + thrust::detail::integral_constant sort_items{}; + + return doit_step(d_temp_storage, + temp_storage_bytes, + keys, + items, + keys_count, + compare_op, + stream, + debug_sync, + sort_items); } template +__host__ __device__ +bool +must_perform_optional_stream_synchronization(execution_policy &) +{ + return true; +} + +// Entry point/interface. +template +__host__ __device__ bool +must_perform_optional_synchronization(execution_policy &policy) +{ + return must_perform_optional_stream_synchronization(derived_cast(policy)); +} + + // Fallback implementation of the customization point. __thrust_exec_check_disable__ template @@ -105,6 +124,50 @@ return synchronize_stream(derived_cast(policy)); } +// Fallback implementation of the customization point. +__thrust_exec_check_disable__ +template +__host__ __device__ +cudaError_t +synchronize_stream_optional(execution_policy &policy) +{ + cudaError_t result; + if (THRUST_IS_HOST_CODE) { + #if THRUST_INCLUDE_HOST_CODE + if(must_perform_optional_synchronization(policy)){ + cudaStreamSynchronize(stream(policy)); + result = cudaGetLastError(); + }else{ + result = cudaSuccess; + } + #endif + } else { + #if THRUST_INCLUDE_DEVICE_CODE + #if __THRUST_HAS_CUDART__ + if(must_perform_optional_synchronization(policy)){ + cub::detail::device_synchronize(); + result = cudaGetLastError(); + }else{ + result = cudaSuccess; + } + #else + THRUST_UNUSED_VAR(policy); + result = cudaSuccess; + #endif + #endif + } + return result; +} + +// Entry point/interface. 
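The hunks above route Thrust's internal synchronization through an optional, policy-controlled customization point: algorithms call `synchronize_optional`, which only waits when `must_perform_optional_synchronization` reports that the active policy requires it, and the nosync policies opt out via an overload that returns `false`. The following is a minimal, self-contained sketch of that dispatch pattern in plain C++; the names `policy_base`, `nosync_policy`, and `default_policy` are placeholders for illustration, not Thrust's actual types, and the stream wait is replaced by a print statement:

```cpp
#include <iostream>

// Fallback: by default every policy performs the optional synchronization.
template <class Derived>
struct policy_base {};

template <class Derived>
bool must_perform_optional_synchronization(const policy_base<Derived>&)
{
  return true;
}

// A "nosync" policy opts out with a more specialized overload, analogous to
// the friend function defined by the nosync execution policies in this patch.
struct nosync_policy : policy_base<nosync_policy> {};

bool must_perform_optional_synchronization(const nosync_policy&)
{
  return false;
}

// Analogue of synchronize_optional(): skip the wait when the policy allows it.
// In Thrust the wait would be cudaStreamSynchronize()/device_synchronize().
template <class Policy>
void synchronize_optional(const Policy& policy)
{
  if (must_perform_optional_synchronization(policy))
  {
    std::cout << "synchronizing stream\n";
  }
  else
  {
    std::cout << "skipping optional synchronization\n";
  }
}

struct default_policy : policy_base<default_policy> {};

int main()
{
  synchronize_optional(default_policy{}); // prints "synchronizing stream"
  synchronize_optional(nosync_policy{});  // prints "skipping optional synchronization"
}
```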
+template +__host__ __device__ +cudaError_t +synchronize_optional(Policy &policy) +{ + return synchronize_stream_optional(derived_cast(policy)); +} + template THRUST_HOST_FUNCTION cudaError_t trivial_copy_from_device(Type * dst, diff -Nru libthrust-1.15.0/thrust/system/cuda/execution_policy.h libthrust-1.16.0/thrust/system/cuda/execution_policy.h --- libthrust-1.15.0/thrust/system/cuda/execution_policy.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/execution_policy.h 2022-02-09 20:04:17.000000000 +0000 @@ -26,59 +26,6 @@ ******************************************************************************/ #pragma once -// histogram -// sort (radix-sort, merge-sort) - #include #include #include - -// pass -// ---------------- -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// fail -// ---------------- -// fails with mixed types -#include - -// mixed types are not compiling, commented in testing/scan.cu -#include - -// stubs passed -// ---------------- -#include -#include -#include -#include -#include - -// work in progress - diff -Nru libthrust-1.15.0/thrust/system/cuda/memory_resource.h libthrust-1.16.0/thrust/system/cuda/memory_resource.h --- libthrust-1.15.0/thrust/system/cuda/memory_resource.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/memory_resource.h 2022-02-09 20:04:17.000000000 +0000 @@ -42,8 +42,8 @@ namespace detail { - typedef cudaError_t (*allocation_fn)(void **, std::size_t); - typedef cudaError_t (*deallocation_fn)(void *); + typedef cudaError_t (CUDARTAPI *allocation_fn)(void **, std::size_t); + typedef cudaError_t (CUDARTAPI *deallocation_fn)(void *); template class cuda_memory_resource final : public mr::memory_resource @@ -79,7 +79,7 @@ } }; - inline cudaError_t cudaMallocManaged(void ** ptr, std::size_t bytes) + inline cudaError_t CUDARTAPI cudaMallocManaged(void ** ptr, std::size_t bytes) { return ::cudaMallocManaged(ptr, bytes, cudaMemAttachGlobal); } diff -Nru libthrust-1.15.0/thrust/system/detail/generic/shuffle.inl libthrust-1.16.0/thrust/system/detail/generic/shuffle.inl --- libthrust-1.15.0/thrust/system/detail/generic/shuffle.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/detail/generic/shuffle.inl 2022-02-09 20:04:17.000000000 +0000 @@ -48,7 +48,7 @@ right_side_bits = total_bits - left_side_bits; right_side_mask = (1ull << right_side_bits) - 1; - for (std::uint64_t i = 0; i < num_rounds; i++) { + for (std::uint32_t i = 0; i < num_rounds; i++) { key[i] = g(); } } @@ -56,28 +56,34 @@ __host__ __device__ std::uint64_t nearest_power_of_two() const { return 1ull << (left_side_bits + right_side_bits); } - __host__ __device__ std::uint64_t operator()(const std::uint64_t val) const { - // Extract the right and left sides of the input - auto left = static_cast(val >> right_side_bits); - auto right = static_cast(val & right_side_mask); - round_state state = {left, right}; - for (std::uint64_t i = 0; i < num_rounds; i++) { - state = do_round(state, i); + __host__ __device__ std::uint64_t operator()(const std::uint64_t val) const { + std::uint32_t state[2] = { static_cast( val >> right_side_bits ), static_cast( val & right_side_mask ) }; + for( std::uint32_t i = 0; i < num_rounds; i++ ) + { + std::uint32_t hi, lo; + constexpr std::uint64_t M0 = UINT64_C( 
0xD2B74407B1CE6E93 ); + mulhilo( M0, state[0], hi, lo ); + lo = ( lo << ( right_side_bits - left_side_bits ) ) | state[1] >> left_side_bits; + state[0] = ( ( hi ^ key[i] ) ^ state[1] ) & left_side_mask; + state[1] = lo & right_side_mask; } - - // Check we have the correct number of bits on each side - assert((state.left >> left_side_bits) == 0); - assert((state.right >> right_side_bits) == 0); - // Combine the left and right sides together to get result - return state.left << right_side_bits | state.right; + return static_cast(state[0] << right_side_bits) | static_cast(state[1]); } private: + // Perform 64 bit multiplication and save result in two 32 bit int + static __host__ __device__ void mulhilo( std::uint64_t a, std::uint64_t b, std::uint32_t& hi, std::uint32_t& lo ) + { + std::uint64_t product = a * b; + hi = static_cast( product >> 32 ); + lo = static_cast( product ); + } + // Find the nearest power of two - __host__ __device__ std::uint64_t get_cipher_bits(std::uint64_t m) { - if (m == 0) return 0; + static __host__ __device__ std::uint64_t get_cipher_bits(std::uint64_t m) { + if (m <= 16) return 4; std::uint64_t i = 0; m--; while (m != 0) { @@ -87,45 +93,12 @@ return i; } - // Equivalent to boost::hash_combine - __host__ __device__ - std::size_t hash_combine(std::uint64_t lhs, std::uint64_t rhs) const { - lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2); - return lhs; - } - - // Round function, a 'pseudorandom function' who's output is indistinguishable - // from random for each key value input. This is not cryptographically secure - // but sufficient for generating permutations. - __host__ __device__ std::uint32_t round_function(std::uint64_t value, - const std::uint64_t key_) const { - std::uint64_t hash0 = thrust::random::taus88(static_cast(value))(); - std::uint64_t hash1 = thrust::random::ranlux48(value)(); - return static_cast( - hash_combine(hash_combine(hash0, key_), hash1) & left_side_mask); - } - - __host__ __device__ round_state do_round(const round_state state, - const std::uint64_t round) const { - const std::uint32_t new_left = state.right & left_side_mask; - const std::uint32_t round_function_res = - state.left ^ round_function(state.right, key[round]); - if (right_side_bits != left_side_bits) { - // Upper bit of the old right becomes lower bit of new right if we have - // odd length feistel - const std::uint32_t new_right = - (round_function_res << 1ull) | state.right >> left_side_bits; - return {new_left, new_right}; - } - return {new_left, round_function_res}; - } - - static constexpr std::uint64_t num_rounds = 16; + static constexpr std::uint32_t num_rounds = 24; std::uint64_t right_side_bits; std::uint64_t left_side_bits; std::uint64_t right_side_mask; std::uint64_t left_side_mask; - std::uint64_t key[num_rounds]; + std::uint32_t key[num_rounds]; }; struct key_flag_tuple { diff -Nru libthrust-1.15.0/thrust/system/detail/sequential/stable_merge_sort.inl libthrust-1.16.0/thrust/system/detail/sequential/stable_merge_sort.inl --- libthrust-1.15.0/thrust/system/detail/sequential/stable_merge_sort.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/detail/sequential/stable_merge_sort.inl 2022-02-09 20:04:17.000000000 +0000 @@ -97,7 +97,7 @@ { for(; first < last; first += partition_size) { - RandomAccessIterator partition_last = thrust::min(last, first + partition_size); + RandomAccessIterator partition_last = (thrust::min)(last, first + partition_size); thrust::system::detail::sequential::insertion_sort(first, partition_last, comp); } // end 
for @@ -120,7 +120,7 @@ { for(; keys_first < keys_last; keys_first += partition_size, values_first += partition_size) { - RandomAccessIterator1 keys_partition_last = thrust::min(keys_last, keys_first + partition_size); + RandomAccessIterator1 keys_partition_last = (thrust::min)(keys_last, keys_first + partition_size); thrust::system::detail::sequential::insertion_sort_by_key(keys_first, keys_partition_last, values_first, comp); } // end for @@ -143,8 +143,8 @@ { for(; first < last; first += 2 * partition_size, result += 2 * partition_size) { - RandomAccessIterator1 interval_middle = thrust::min(last, first + partition_size); - RandomAccessIterator1 interval_last = thrust::min(last, interval_middle + partition_size); + RandomAccessIterator1 interval_middle = (thrust::min)(last, first + partition_size); + RandomAccessIterator1 interval_last = (thrust::min)(last, interval_middle + partition_size); thrust::merge(exec, first, interval_middle, @@ -178,8 +178,8 @@ keys_first < keys_last; keys_first += stride, values_first += stride, keys_result += stride, values_result += stride) { - RandomAccessIterator1 keys_interval_middle = thrust::min(keys_last, keys_first + partition_size); - RandomAccessIterator1 keys_interval_last = thrust::min(keys_last, keys_interval_middle + partition_size); + RandomAccessIterator1 keys_interval_middle = (thrust::min)(keys_last, keys_first + partition_size); + RandomAccessIterator1 keys_interval_last = (thrust::min)(keys_last, keys_interval_middle + partition_size); RandomAccessIterator2 values_first2 = values_first + (keys_interval_middle - keys_first); diff -Nru libthrust-1.15.0/thrust/system/omp/detail/for_each.inl libthrust-1.16.0/thrust/system/omp/detail/for_each.inl --- libthrust-1.15.0/thrust/system/omp/detail/for_each.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/omp/detail/for_each.inl 2022-02-09 20:04:17.000000000 +0000 @@ -20,12 +20,12 @@ */ #include -#include -#include #include -#include +#include #include #include +#include +#include THRUST_NAMESPACE_BEGIN namespace system @@ -61,14 +61,11 @@ // create a wrapped function for f thrust::detail::wrapped_function wrapped_f(f); -// do not attempt to compile the body of this function, which depends on #pragma omp, -// without support from the compiler -// XXX implement the body of this function in another file to eliminate this ugliness -#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) // use a signed type for the iteration variable or suffer the consequences of warnings typedef typename thrust::iterator_difference::type DifferenceType; DifferenceType signed_n = n; -#pragma omp parallel for + + THRUST_PRAGMA_OMP(parallel for) for(DifferenceType i = 0; i < signed_n; ++i) @@ -76,7 +73,6 @@ RandomAccessIterator temp = first + i; wrapped_f(*temp); } -#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE return first + n; } // end for_each_n() diff -Nru libthrust-1.15.0/thrust/system/omp/detail/pragma_omp.h libthrust-1.16.0/thrust/system/omp/detail/pragma_omp.h --- libthrust-1.15.0/thrust/system/omp/detail/pragma_omp.h 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/thrust/system/omp/detail/pragma_omp.h 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,56 @@ +/****************************************************************************** +* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
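Several hunks in this patch parenthesize `thrust::min` and `thrust::max` at their call sites. The reason is that `windows.h` defines function-like `min`/`max` macros unless `NOMINMAX` is set, and a function-like macro is only expanded when its name is immediately followed by `(`. A small sketch of the effect, using a stand-in macro rather than the real header:

```cpp
#include <algorithm>

// Stand-in for the function-like macro that windows.h defines by default.
#define min(a, b) (((a) < (b)) ? (a) : (b))

int smaller(int x, int y)
{
  // return std::min(x, y);  // `min(` would be macro-expanded and break the call
  return (std::min)(x, y);   // `min` is followed by `)`, so no macro expansion:
                             // the real std::min function template is called
}

int main()
{
  return smaller(3, 7) == 3 ? 0 : 1;
}
```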
+* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are met: +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* * Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* * Neither the name of the NVIDIA CORPORATION nor the +* names of its contributors may be used to endorse or promote products +* derived from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +******************************************************************************/ + +#pragma once + +#include + +#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC +// MSVC ICEs when using the standard C++11 `_Pragma` operator with OpenMP +// directives. +// WAR this by using the MSVC-extension `__pragma`. See this link for more info: +// https://developercommunity.visualstudio.com/t/Using-C11s-_Pragma-with-OpenMP-dire/1590628 +#define THRUST_PRAGMA_OMP_IMPL(directive) __pragma(directive) +#else // Not MSVC: +#define THRUST_PRAGMA_OMP_IMPL(directive) _Pragma(#directive) +#endif + +// For internal use only -- THRUST_PRAGMA_OMP is used to switch between +// different flavors of openmp pragmas. Pragmas are not emitted when OpenMP is +// not available. 
+// +// Usage: +// Replace: #pragma omp parallel for +// With : THRUST_PRAGMA_OMP(parallel for) +// +#if defined(_NVHPC_STDPAR_OPENMP) && _NVHPC_STDPAR_OPENMP == 1 +#define THRUST_PRAGMA_OMP(directive) THRUST_PRAGMA_OMP_IMPL(omp_stdpar directive) +#elif defined(_OPENMP) +#define THRUST_PRAGMA_OMP(directive) THRUST_PRAGMA_OMP_IMPL(omp directive) +#else +#define THRUST_PRAGMA_OMP(directive) +#endif diff -Nru libthrust-1.15.0/thrust/system/omp/detail/reduce_intervals.inl libthrust-1.16.0/thrust/system/omp/detail/reduce_intervals.inl --- libthrust-1.15.0/thrust/system/omp/detail/reduce_intervals.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/omp/detail/reduce_intervals.inl 2022-02-09 20:04:17.000000000 +0000 @@ -62,9 +62,7 @@ index_type n = static_cast(decomp.size()); -#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) -# pragma omp parallel for -#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE + THRUST_PRAGMA_OMP(parallel for) for(index_type i = 0; i < n; i++) { InputIterator begin = input + decomp[i].begin(); diff -Nru libthrust-1.15.0/thrust/system/omp/detail/sort.inl libthrust-1.16.0/thrust/system/omp/detail/sort.inl --- libthrust-1.15.0/thrust/system/omp/detail/sort.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/omp/detail/sort.inl 2022-02-09 20:04:17.000000000 +0000 @@ -113,13 +113,14 @@ , "OpenMP compiler support is not enabled" ); + // Avoid issues on compilers that don't provide `omp_get_num_threads()`. #if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) typedef typename thrust::iterator_difference::type IndexType; - + if(first == last) return; - #pragma omp parallel + THRUST_PRAGMA_OMP(parallel) { thrust::system::detail::internal::uniform_decomposition decomp(last - first, 1, omp_get_num_threads()); @@ -135,7 +136,7 @@ comp); } - #pragma omp barrier + THRUST_PRAGMA_OMP(barrier) // XXX For some reason, MSVC 2015 yields an error unless we include this meaningless semicolon here ; @@ -166,7 +167,7 @@ nseg = (nseg + 1) / 2; h *= 2; - #pragma omp barrier + THRUST_PRAGMA_OMP(barrier) } } #endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE @@ -195,13 +196,14 @@ , "OpenMP compiler support is not enabled" ); + // Avoid issues on compilers that don't provide `omp_get_num_threads()`. 
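The `THRUST_PRAGMA_OMP` wrapper added above relies on the standard `_Pragma` operator, which accepts a string literal and can therefore be produced from a macro argument by stringization, something the plain `#pragma` directive cannot do. A minimal sketch of the same technique under an assumed macro name (`MY_PRAGMA_OMP`, not the Thrust macro) and without the MSVC and `omp_stdpar` branches:

```cpp
#include <cstddef>

// Emit the pragma only when OpenMP is enabled; otherwise expand to nothing.
#if defined(_OPENMP)
#  define MY_PRAGMA_OMP(directive) _Pragma(#directive)
#else
#  define MY_PRAGMA_OMP(directive)
#endif

void scale(double* data, std::size_t n, double factor)
{
  // Expands to `#pragma omp parallel for` when compiled with OpenMP support,
  // and to nothing otherwise, so the loop still compiles either way.
  MY_PRAGMA_OMP(omp parallel for)
  for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(n); ++i)
  {
    data[i] *= factor;
  }
}

int main()
{
  double values[4] = {1.0, 2.0, 3.0, 4.0};
  scale(values, 4, 2.0);
  return 0;
}
```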
#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) typedef typename thrust::iterator_difference::type IndexType; - + if(keys_first == keys_last) return; - #pragma omp parallel + THRUST_PRAGMA_OMP(parallel) { thrust::system::detail::internal::uniform_decomposition decomp(keys_last - keys_first, 1, omp_get_num_threads()); @@ -218,7 +220,7 @@ comp); } - #pragma omp barrier + THRUST_PRAGMA_OMP(barrier) // XXX For some reason, MSVC 2015 yields an error unless we include this meaningless semicolon here ; @@ -250,7 +252,7 @@ nseg = (nseg + 1) / 2; h *= 2; - #pragma omp barrier + THRUST_PRAGMA_OMP(barrier) } } #endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE diff -Nru libthrust-1.15.0/thrust/system/tbb/detail/reduce_by_key.inl libthrust-1.16.0/thrust/system/tbb/detail/reduce_by_key.inl --- libthrust-1.15.0/thrust/system/tbb/detail/reduce_by_key.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/tbb/detail/reduce_by_key.inl 2022-02-09 20:04:17.000000000 +0000 @@ -198,7 +198,7 @@ const size_type interval_idx = r.begin(); const size_type offset_to_first = interval_size * interval_idx; - const size_type offset_to_last = thrust::min(n, offset_to_first + interval_size); + const size_type offset_to_last = (thrust::min)(n, offset_to_first + interval_size); Iterator1 my_keys_first = keys_first + offset_to_first; Iterator1 my_keys_last = keys_first + offset_to_last; diff -Nru libthrust-1.15.0/thrust/system/tbb/detail/reduce_intervals.h libthrust-1.16.0/thrust/system/tbb/detail/reduce_intervals.h --- libthrust-1.15.0/thrust/system/tbb/detail/reduce_intervals.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/tbb/detail/reduce_intervals.h 2022-02-09 20:04:17.000000000 +0000 @@ -64,7 +64,7 @@ Size interval_idx = r.begin(); Size offset_to_first = interval_size * interval_idx; - Size offset_to_last = thrust::min(n, offset_to_first + interval_size); + Size offset_to_last = (thrust::min)(n, offset_to_first + interval_size); RandomAccessIterator1 my_first = first + offset_to_first; RandomAccessIterator1 my_last = first + offset_to_last; diff -Nru libthrust-1.15.0/thrust/version.h libthrust-1.16.0/thrust/version.h --- libthrust-1.15.0/thrust/version.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/version.h 2022-02-09 20:04:17.000000000 +0000 @@ -47,7 +47,7 @@ * THRUST_VERSION / 100 % 1000 is the minor version. * THRUST_VERSION / 100000 is the major version. */ -#define THRUST_VERSION 101500 +#define THRUST_VERSION 101600 /*! \def THRUST_MAJOR_VERSION * \brief The preprocessor macro \p THRUST_MAJOR_VERSION encodes the