diff -Nru libthrust-1.15.0/CHANGELOG.md libthrust-1.16.0/CHANGELOG.md --- libthrust-1.15.0/CHANGELOG.md 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/CHANGELOG.md 2022-02-09 20:04:17.000000000 +0000 @@ -1,4 +1,99 @@ -# Thrust 1.15.0 +# Thrust 1.16.0 + +## Summary + +Thrust 1.16.0 provides a new “nosync” hint for the CUDA backend, as well as +numerous bug fixes and stability improvements. + +### New `thrust::cuda::par_nosync` Execution Policy + +Most of Thrust’s parallel algorithms are fully synchronous and will block the +calling CPU thread until all work is completed. This design avoids many pitfalls +associated with asynchronous GPU programming, resulting in simpler and +less error-prone usage for new CUDA developers. Unfortunately, this improvement +in user experience comes at a performance cost that often frustrates more +experienced CUDA programmers. + +Prior to this release, the only synchronous-to-asynchronous migration path for +existing Thrust codebases involved significant refactoring, replacing calls +to `thrust` algorithms with a limited set of `future`-based `thrust::async` +algorithms or lower-level CUB kernels. The new `thrust::cuda::par_nosync` +execution policy provides a less-invasive entry point for asynchronous +computation. + +`par_nosync` is a hint to the Thrust execution engine that any non-essential +internal synchronizations should be skipped and that an explicit synchronization +will be performed by the caller before accessing results. + +While some Thrust algorithms require internal synchronization to safely compute +their results, many do not. For example, multiple `thrust::for_each` invocations +can be launched without waiting for earlier calls to complete: + +```cpp +// Queue three `for_each` kernels: +thrust::for_each(thrust::cuda::par_nosync, vec1.begin(), vec1.end(), Op{}); +thrust::for_each(thrust::cuda::par_nosync, vec2.begin(), vec2.end(), Op{}); +thrust::for_each(thrust::cuda::par_nosync, vec3.begin(), vec3.end(), Op{}); + +// Do other work while kernels execute: +do_something(); + +// Must explicitly synchronize before accessing `for_each` results: +cudaDeviceSynchronize(); +``` + +Thanks to @fkallen for this contribution. + +## Deprecation Notices + +### CUDA Dynamic Parallelism Support + +**A future version of Thrust will remove support for CUDA Dynamic Parallelism +(CDP).** + +This will only affect calls to Thrust algorithms made from CUDA device-side code +that currently launches a kernel; such calls will execute sequentially +on the calling GPU thread instead of launching a device-wide kernel. + +## Breaking Changes + +- Thrust 1.14.0 included a change that aliased the `cub` namespace + to `thrust::cub`. This has caused issues with ambiguous namespaces for + projects that declare `using namespace thrust;` from the global namespace. We + recommend against this practice. +- NVIDIA/thrust#1572: Removed several unnecessary header includes. Downstream + projects may need to update their includes if they were relying on this + behavior. + +## New Features + +- NVIDIA/thrust#1568: Add `thrust::cuda::par_nosync` policy. Thanks to @fkallen + for this contribution. + +## Enhancements + +- NVIDIA/thrust#1511: Use CUB’s new `DeviceMergeSort` API and remove Thrust’s + internal implementation. +- NVIDIA/thrust#1566: Improved performance of `thrust::shuffle`. Thanks to + @djns99 for this contribution. +- NVIDIA/thrust#1584: Support user-defined `CMAKE_INSTALL_INCLUDEDIR` values in + Thrust’s CMake install rules.
Thanks to @robertmaynard for this contribution. + +## Bug Fixes + +- NVIDIA/thrust#1496: Fix some issues affecting `icc` builds. +- NVIDIA/thrust#1552: Fix some collisions with the `min`/`max` macros defined + in `windows.h`. +- NVIDIA/thrust#1582: Fix issue with function type alias on 32-bit MSVC builds. +- NVIDIA/thrust#1591: Workaround issue affecting compilation with `nvc++`. +- NVIDIA/thrust#1597: Fix some collisions with the `small` macro defined + in `windows.h`. +- NVIDIA/thrust#1599, NVIDIA/thrust#1603: Fix some issues with version handling + in Thrust’s CMake packages. +- NVIDIA/thrust#1614: Clarify that scan algorithm results are non-deterministic + for pseudo-associative operators (e.g. floating-point addition). + +# Thrust 1.15.0 (NVIDIA HPC SDK 22.1, CUDA Toolkit 11.6) ## Summary @@ -51,6 +146,13 @@ `thrust::iterator_traits` is better integrated with `std::iterator_traits`. See below for more details and references. +## Breaking Changes + +- Thrust 1.14.0 included a change that aliased the `cub` namespace + to `thrust::cub`. This has caused issues with ambiguous namespaces for + projects that declare `using namespace thrust;` from the global namespace. We + recommend against this practice. + ## New Features - NVIDIA/thrust#1464: Add preprocessor hooks that allow `thrust::` to be wrapped diff -Nru libthrust-1.15.0/ci/axis/cpu.yml libthrust-1.16.0/ci/axis/cpu.yml --- libthrust-1.15.0/ci/axis/cpu.yml 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/ci/axis/cpu.yml 2022-02-09 20:04:17.000000000 +0000 @@ -8,8 +8,8 @@ - nvhpc SDK_VER: - - 11.3.1-devel - - 21.5-devel-cuda11.3 + - 11.5.1-devel + - 22.1-devel-cuda11.5 OS_TYPE: - ubuntu @@ -31,7 +31,8 @@ - 9 - 10 - 11 - - 21.5 + - 12 + - 22.1 - latest exclude: @@ -46,9 +47,9 @@ SDK_TYPE: cuda # Excludes by `SDK_VER`. - SDK_TYPE: cuda - SDK_VER: 21.5-devel-cuda11.3 + SDK_VER: 22.1-devel-cuda11.5 - SDK_TYPE: nvhpc - SDK_VER: 11.3.1-devel + SDK_VER: 11.5.1-devel # Excludes by `CXX_VER`. - CXX_TYPE: nvcxx CXX_VER: 5 @@ -65,11 +66,13 @@ - CXX_TYPE: nvcxx CXX_VER: 11 - CXX_TYPE: nvcxx + CXX_VER: 12 + - CXX_TYPE: nvcxx CXX_VER: latest - CXX_TYPE: gcc - CXX_VER: 11 + CXX_VER: 12 - CXX_TYPE: gcc - CXX_VER: 21.5 + CXX_VER: 22.1 - CXX_TYPE: gcc CXX_VER: latest - CXX_TYPE: clang @@ -77,7 +80,7 @@ - CXX_TYPE: clang CXX_VER: 6 - CXX_TYPE: clang - CXX_VER: 21.5 + CXX_VER: 22.1 - CXX_TYPE: clang CXX_VER: latest - CXX_TYPE: icc @@ -95,5 +98,7 @@ - CXX_TYPE: icc CXX_VER: 11 - CXX_TYPE: icc - CXX_VER: 21.5 + CXX_VER: 12 + - CXX_TYPE: icc + CXX_VER: 22.1 diff -Nru libthrust-1.15.0/ci/axis/gpu.yml libthrust-1.16.0/ci/axis/gpu.yml --- libthrust-1.15.0/ci/axis/gpu.yml 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/ci/axis/gpu.yml 2022-02-09 20:04:17.000000000 +0000 @@ -7,7 +7,7 @@ - cuda SDK_VER: - - 11.3.1-devel + - 11.5.1-devel OS_TYPE: - ubuntu @@ -19,5 +19,4 @@ - gcc CXX_VER: - - 7 - + - 9 diff -Nru libthrust-1.15.0/ci/common/build.bash libthrust-1.16.0/ci/common/build.bash --- libthrust-1.15.0/ci/common/build.bash 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/ci/common/build.bash 2022-02-09 20:04:17.000000000 +0000 @@ -1,6 +1,6 @@ #! /usr/bin/env bash -# Copyright (c) 2018-2020 NVIDIA Corporation +# Copyright (c) 2018-2022 NVIDIA Corporation # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # Released under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. 
@@ -9,7 +9,7 @@ # Thrust and CUB build script for gpuCI ################################################################################ -set -e +set -e # Stop on errors. # append variable value # Appends ${value} to ${variable}, adding a space before ${value} if @@ -32,6 +32,13 @@ printf "%s\n\n" "${*}" } +# echo_and_run name args... +# Echo ${args[@]}, then execute ${args[@]} +function echo_and_run { + echo "${1}: ${@:2}" + ${@:2} +} + # echo_and_run_timed name args... # Echo ${args[@]}, then execute ${args[@]} and report how long it took, # including ${name} in the output of the time. @@ -58,19 +65,27 @@ ################################################################################ # Get the variables the Docker container set up for us: ${CXX}, ${CUDACXX}, etc. +set +e # Don't stop on errors from /etc/cccl.bashrc. source /etc/cccl.bashrc +set -e # Stop on errors. -# Set path and build parallel level +# Set path. export PATH=/usr/local/cuda/bin:${PATH} # Set home to the job's workspace. export HOME=${WORKSPACE} +# Per-process memory util logs: +MEMMON_LOG=${WORKSPACE}/build/memmon_log + # Switch to the build directory. cd ${WORKSPACE} mkdir -p build cd build +# Remove any old .ninja_log file so the PrintNinjaBuildTimes step is accurate: +rm -f .ninja_log + if [[ -z "${CMAKE_BUILD_TYPE}" ]]; then CMAKE_BUILD_TYPE="Release" fi @@ -109,8 +124,29 @@ append CMAKE_BUILD_FLAGS "-k0" fi +DETERMINE_PARALLELISM_FLAGS="" + +# Used to limit the default number of build threads. Any build/link +# steps that exceed this limit will cause this script to report a +# failure. Tune this using the memmon logs printed after each run. +# +# Build steps that take more memory than this limit should +# be split into multiple steps/translation units. Any temporary +# increases to this threshold should be reverted ASAP. The goal +# is to decrease this as much as possible and not increase it. +if [[ -z "${MIN_MEMORY_PER_THREAD}" ]]; then + if [[ "${CXX_TYPE}" == "nvcxx" ]]; then + MIN_MEMORY_PER_THREAD=3.0 # GiB + elif [[ "${CXX_TYPE}" == "icc" ]]; then + MIN_MEMORY_PER_THREAD=2.5 # GiB + else + MIN_MEMORY_PER_THREAD=2.0 # GiB + fi +fi +append DETERMINE_PARALLELISM_FLAGS "--min-memory-per-thread ${MIN_MEMORY_PER_THREAD}" + if [[ -n "${PARALLEL_LEVEL}" ]]; then - DETERMINE_PARALLELISM_FLAGS="-j ${PARALLEL_LEVEL}" + append DETERMINE_PARALLELISM_FLAGS "-j ${PARALLEL_LEVEL}" fi # COVERAGE_PLAN options: @@ -148,6 +184,9 @@ append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_WORKLOAD=LARGE" ;; Thorough) + # Build the legacy bench.cu. We'll probably want to remove this when we + # switch to the new, heavier thrust_benchmarks project. + append CMAKE_FLAGS "-DTHRUST_ENABLE_BENCHMARKS=ON" append CMAKE_FLAGS "-DTHRUST_ENABLE_MULTICONFIG=ON" append CMAKE_FLAGS "-DTHRUST_IGNORE_DEPRECATED_CPP_11=ON" append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_DIALECT_ALL=ON" @@ -175,9 +214,6 @@ append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA=ON" append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_WORKLOAD=SMALL" append CMAKE_FLAGS "-DTHRUST_INCLUDE_CUB_CMAKE=ON" - append CMAKE_FLAGS "-DCUB_ENABLE_THOROUGH_TESTING=OFF" - append CMAKE_FLAGS "-DCUB_ENABLE_BENCHMARK_TESTING=OFF" - append CMAKE_FLAGS "-DCUB_ENABLE_MINIMAL_TESTING=ON" append CMAKE_FLAGS "-DTHRUST_AUTO_DETECT_COMPUTE_ARCHS=ON" if [[ "${BUILD_TYPE}" == "cpu" ]] && [[ "${CXX_TYPE}" == "nvcxx" ]]; then # If no GPU is automatically detected, NVC++ insists that you explicitly @@ -229,7 +265,7 @@ log "Get environment..." -env +env | sort log "Check versions..." 
@@ -244,6 +280,10 @@ echo +cmake --version 2>&1 | sed -Ez '$ s/\n*$/\n/' + +echo + if [[ "${BUILD_TYPE}" == "gpu" ]]; then nvidia-smi 2>&1 | sed -Ez '$ s/\n*$/\n/' fi @@ -254,10 +294,7 @@ log "Configure Thrust and CUB..." -# Clear out any stale CMake configs: -rm -rf CMakeCache.txt CMakeFiles/ - -echo_and_run_timed "Configure" cmake .. ${CMAKE_FLAGS} +echo_and_run_timed "Configure" cmake .. --log-level=VERBOSE ${CMAKE_FLAGS} configure_status=$? log "Build Thrust and CUB..." @@ -265,8 +302,22 @@ # ${PARALLEL_LEVEL} needs to be passed after we run # determine_build_parallelism.bash, so it can't be part of ${CMAKE_BUILD_FLAGS}. set +e # Don't stop on build failures. + +# Monitor memory usage. Thresholds in GiB: +python3 ${WORKSPACE}/ci/common/memmon.py \ + --log-threshold 0.0 \ + --fail-threshold ${MIN_MEMORY_PER_THREAD} \ + --log-file ${MEMMON_LOG} \ + & +memmon_pid=$! + echo_and_run_timed "Build" cmake --build . ${CMAKE_BUILD_FLAGS} -j ${PARALLEL_LEVEL} build_status=$? + +# Stop memmon: +kill -s SIGINT ${memmon_pid} + +# Re-enable exit on failure: set -e ################################################################################ @@ -275,21 +326,68 @@ log "Test Thrust and CUB..." -echo_and_run_timed "Test" ctest ${CTEST_FLAGS} +( + # Make sure test_status captures ctest, not tee: + # https://stackoverflow.com/a/999259/11130318 + set -o pipefail + echo_and_run_timed "Test" ctest ${CTEST_FLAGS} | tee ctest_log +) test_status=$? ################################################################################ +# COMPILE TIME INFO: Print the 20 longest running build steps (ninja only) +################################################################################ + +if [[ -f ".ninja_log" ]]; then + log "Checking slowest build steps:" + echo_and_run "CompileTimeInfo" cmake -P ../cmake/PrintNinjaBuildTimes.cmake | head -n 23 +fi + +################################################################################ +# RUNTIME INFO: Print the 20 longest running test steps +################################################################################ + +if [[ -f "ctest_log" ]]; then + log "Checking slowest test steps:" + echo_and_run "TestTimeInfo" cmake -DLOGFILE=ctest_log -P ../cmake/PrintCTestRunTimes.cmake | head -n 20 +fi + +################################################################################ +# MEMORY_USAGE +################################################################################ + +memmon_status=0 +if [[ -f "${MEMMON_LOG}" ]]; then + log "Checking memmon logfile: ${MEMMON_LOG}" + + if [[ -n "$(grep -E "^FAIL" ${MEMMON_LOG})" ]]; then + log "error: Some build steps exceeded MIN_MEMORY_PER_THREAD (${MIN_MEMORY_PER_THREAD} GiB):" + grep -E "^FAIL" ${MEMMON_LOG} + memmon_status=1 + else + log "Top memory usage per build step (all less than limit of ${MIN_MEMORY_PER_THREAD} GiB):" + if [[ -s ${MEMMON_LOG} ]]; then + # Not empty: + head -n5 ${MEMMON_LOG} + else + echo "None detected above logging threshold." + fi + fi +fi + +################################################################################ # SUMMARY - Print status of each step and exit with failure if needed. 
################################################################################ log "Summary:" -log "- Configure Error Code: ${configure_status}" -log "- Build Error Code: ${build_status}" -log "- Test Error Code: ${test_status}" - +echo "- Configure Error Code: ${configure_status}" +echo "- Build Error Code: ${build_status}" +echo "- Build Memory Check: ${memmon_status}" +echo "- Test Error Code: ${test_status}" if [[ "${configure_status}" != "0" ]] || \ [[ "${build_status}" != "0" ]] || \ + [[ "${memmon_status}" != "0" ]] || \ [[ "${test_status}" != "0" ]]; then exit 1 fi diff -Nru libthrust-1.15.0/ci/common/memmon_config/procps/toprc libthrust-1.16.0/ci/common/memmon_config/procps/toprc --- libthrust-1.15.0/ci/common/memmon_config/procps/toprc 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/ci/common/memmon_config/procps/toprc 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,16 @@ +top's Config File (Linux processes with windows) +Id:i, Mode_altscr=0, Mode_irixps=1, Delay_time=3.0, Curwin=0 +Def fieldscur=%(34;@D7:9&')*+,-./012568<>?ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij + winflags=193972, sortindx=18, maxtasks=0, graph_cpus=0, graph_mems=0 + summclr=1, msgsclr=1, headclr=3, taskclr=1 +Job fieldscur=(Ļ@<)*+,-./012568>?ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij + winflags=193844, sortindx=0, maxtasks=0, graph_cpus=0, graph_mems=0 + summclr=6, msgsclr=6, headclr=7, taskclr=6 +Mem fieldscur=?@ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij + winflags=193844, sortindx=3, maxtasks=0, graph_cpus=0, graph_mems=0 + summclr=3, msgsclr=3, headclr=2, taskclr=3 +Fixed_widest=0, Summ_mscale=1, Task_mscale=0, Zero_suppress=0 + diff -Nru libthrust-1.15.0/ci/common/memmon.py libthrust-1.16.0/ci/common/memmon.py --- libthrust-1.15.0/ci/common/memmon.py 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/ci/common/memmon.py 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,99 @@ +#! /usr/bin/env python + +# Copyright (c) 2022 NVIDIA Corporation +# Reply-To: Allison Vacanti +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# Released under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. + +help_text = """%(prog)s [reference.json compare.json | reference_dir/ compare_dir/] + +This script: + +1. Runs `top -bco RES`, continuously extracting the memory usage of each process. +2. If a process uses more than `log_threshold` GiB and exceeds any other recorded + entry for the process, it is stored in `entries`. +3. 
When this script receives SIGINT, it writes two files: + * `log_file` will contain all recorded max-memory-per-process entries + * `fail_file` will contain all entries that exceed `fail_threshold` +""" + +import argparse +import os +import re +import signal +import sys + +from subprocess import Popen, PIPE, STDOUT + +parser = argparse.ArgumentParser(prog='memmon.py', usage=help_text) +parser.add_argument('--log-threshold', type=float, dest='log_threshold', + default=0.5, + help='Logging threshold in GiB.') +parser.add_argument('--fail-threshold', type=float, dest='fail_threshold', + default=2, + help='Failure threshold in GiB.') +parser.add_argument('--log-file', type=str, dest='log_file', default='memmon_log', + help='Output file for log entries.') +args, unused = parser.parse_known_args() + +entries = {} + +def signal_handler(sig, frame): + # Sort by mem: + sortentries = sorted(entries.items(), key=lambda x:x[1], reverse=True) + + lf = open(args.log_file, "w") + + for com, mem in sortentries: + status="PASS" + if mem >= args.fail_threshold: + status="FAIL" + line = "%4s | %3.1f GiB | %s\n"%(status, mem, com) + lf.write(line) + + lf.close() + sys.exit(0) + +signal.signal(signal.SIGINT, signal_handler) + +# Find the toprc config file and configure top's env. +# This config: +# - Hides all columns except for RES and COMMAND +# - Sorts by RES +# - Enables long command strings (-c) +script_dir = os.path.dirname(os.path.realpath(__file__)) +config_dir = os.path.join(script_dir, 'memmon_config') + +proc = Popen(["top", "-b", "-w", "512"], + stdin=PIPE, stdout=PIPE, stderr=STDOUT, + env={"XDG_CONFIG_HOME": config_dir}) + +regex = re.compile("^\\s*([0-9.]+[kmgtp]?)\\s+(.+)\\s*$") + +# Convert a memory string from top into floating point GiB +def parse_mem(mem_str): + if mem_str[-1] == "k": + return float(mem_str[:-1]) / (1024 * 1024) + elif mem_str[-1] == "m": + return float(mem_str[:-1]) / (1024) + elif mem_str[-1] == "g": + return float(mem_str[:-1]) + elif mem_str[-1] == "t": + return float(mem_str[:-1]) * 1024 + elif mem_str[-1] == "p": # please no + return float(mem_str[:-1]) * 1024 * 1024 + # bytes: + return float(mem_str) / (1024 * 1024 * 1024) + +for line in proc.stdout: + line = line.decode() + match = regex.match(line) + if match: + mem = parse_mem(match.group(1)) + if mem < args.log_threshold and mem < args.fail_threshold: + continue + com = match.group(2) + if com in entries and entries[com] > mem: + continue + entries[com] = mem diff -Nru libthrust-1.15.0/ci/cpu/build.bash libthrust-1.16.0/ci/cpu/build.bash --- libthrust-1.15.0/ci/cpu/build.bash 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/ci/cpu/build.bash 2022-02-09 20:04:17.000000000 +0000 @@ -12,4 +12,3 @@ export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} source ${WORKSPACE}/ci/common/build.bash - diff -Nru libthrust-1.15.0/ci/local/build.bash libthrust-1.16.0/ci/local/build.bash --- libthrust-1.15.0/ci/local/build.bash 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/ci/local/build.bash 2022-02-09 20:04:17.000000000 +0000 @@ -60,7 +60,7 @@ # FLAGS - Process command line flags. 
################################################################################ -IMAGE="gpuci/cccl:cuda11.3.1-devel-ubuntu20.04-gcc7" +IMAGE="gpuci/cccl:cuda11.5.1-devel-ubuntu20.04-gcc9" LOCAL_IMAGE=0 @@ -195,8 +195,8 @@ NVIDIA_DOCKER_INSTALLED=$(docker info 2>&1 | grep -i runtime | grep -c nvidia) if [[ "${NVIDIA_DOCKER_INSTALLED}" == 0 ]]; then - echo "NVIDIA Docker not found, please install it: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-docker-ce" - exit -4 + echo "NVIDIA Docker not found, the build may fail." + echo "Please install it if you encounter issues: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-docker-ce" fi if [[ "${LOCAL_IMAGE}" == 0 ]]; then diff -Nru libthrust-1.15.0/cmake/header_test.in libthrust-1.16.0/cmake/header_test.in --- libthrust-1.15.0/cmake/header_test.in 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/cmake/header_test.in 2022-02-09 20:04:17.000000000 +0000 @@ -15,7 +15,7 @@ // Hacky way to build a string, but it works on all tested platforms. #define THRUST_MACRO_CHECK(MACRO, HEADER) \ THRUST_MACRO_CHECK_IMPL(Identifier MACRO should not be used from Thrust \ - headers due to conflicts with HEADER.) + headers due to conflicts with HEADER macros.) // Use raw platform checks instead of the THRUST_HOST_COMPILER macros since we // don't want to #include any headers other than the one being tested. @@ -45,11 +45,17 @@ #define I THRUST_MACRO_CHECK('I', complex.h) // windows.h conflicts -// Disabling for now; we use min/max in many places, but since most -// projects build with NOMINMAX this doesn't seem to be high priority to fix. +#define small THRUST_MACRO_CHECK('small', windows.h) +// We can't enable these checks without breaking some builds -- some standard +// library implementations unconditionally `#undef` these macros, which then +// causes random failures later. +// Leaving these commented out as a warning: Here be dragons. //#define min(...) THRUST_MACRO_CHECK('min', windows.h) //#define max(...) THRUST_MACRO_CHECK('max', windows.h) +// termios.h conflicts (NVIDIA/thrust#1547) +#define B0 THRUST_MACRO_CHECK("B0", termios.h) + #endif // THRUST_IGNORE_MACRO_CHECKS #include diff -Nru libthrust-1.15.0/cmake/PrintCTestRunTimes.cmake libthrust-1.16.0/cmake/PrintCTestRunTimes.cmake --- libthrust-1.15.0/cmake/PrintCTestRunTimes.cmake 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/cmake/PrintCTestRunTimes.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,109 @@ +## This CMake script parses the output of ctest and prints a formatted list +## of individual test runtimes, sorted longest first. 
+## +## ctest > ctest_log +## cmake -DLOGFILE=ctest_log \ +## -P PrintCTestRunTimes.cmake +## +################################################################################ + +cmake_minimum_required(VERSION 3.15) + +# Prepend the string with "0" until the string length equals the specified width +function(pad_string_with_zeros string_var width) + set(local_string "${${string_var}}") + string(LENGTH "${local_string}" size) + while(size LESS width) + string(PREPEND local_string "0") + string(LENGTH "${local_string}" size) + endwhile() + set(${string_var} "${local_string}" PARENT_SCOPE) +endfunction() + +################################################################################ + +if (NOT LOGFILE) + message(FATAL_ERROR "Missing -DLOGFILE= argument.") +endif() + +# Check if logfile exists +if (NOT EXISTS "${LOGFILE}") + message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').") +endif() + +string(JOIN "" regex + "^[ ]*[0-9]+/[0-9]+[ ]+Test[ ]+#" + "([0-9]+)" # Test ID + ":[ ]+" + "(.+)" # Test Name + "[ ]+\\.+[ ]+" + "(.+[^ ])" # Result + "[ ]+" + "([0-9]+)" # Seconds + "\\.[0-9]+[ ]+sec[ ]*$" +) + +message(DEBUG "Regex: ${regex}") + +# Read the logfile and generate a map / keylist +set(keys) +file(STRINGS "${LOGFILE}" lines) +foreach(line ${lines}) + + # Parse each build time + string(REGEX MATCH "${regex}" _DUMMY "${line}") + + if (CMAKE_MATCH_COUNT EQUAL 4) + set(test_id "${CMAKE_MATCH_1}") + set(test_name "${CMAKE_MATCH_2}") + set(test_result "${CMAKE_MATCH_3}") + set(tmp "${CMAKE_MATCH_4}") # floor(runtime_seconds) + + # Compute human readable time + math(EXPR days "${tmp} / (60 * 60 * 24)") + math(EXPR tmp "${tmp} - (${days} * 60 * 60 * 24)") + math(EXPR hours "${tmp} / (60 * 60)") + math(EXPR tmp "${tmp} - (${hours} * 60 * 60)") + math(EXPR minutes "${tmp} / (60)") + math(EXPR tmp "${tmp} - (${minutes} * 60)") + math(EXPR seconds "${tmp}") + + # Format time components + pad_string_with_zeros(days 3) + pad_string_with_zeros(hours 2) + pad_string_with_zeros(minutes 2) + pad_string_with_zeros(seconds 2) + + # Construct table entry + # Later values in the file for the same command overwrite earlier entries + string(MAKE_C_IDENTIFIER "${test_id}" key) + string(JOIN " | " ENTRY_${key} + "${days}d ${hours}h ${minutes}m ${seconds}s" + "${test_result}" + "${test_id}: ${test_name}" + ) + + # Record the key: + list(APPEND keys "${key}") + endif() +endforeach() + +list(REMOVE_DUPLICATES keys) + +# Build the entry list: +set(entries) +foreach(key ${keys}) + list(APPEND entries "${ENTRY_${key}}") +endforeach() + +if (NOT entries) + message(FATAL_ERROR "LOGFILE contained no test times ('${LOGFILE}').") +endif() + +# Sort in descending order: +list(SORT entries ORDER DESCENDING) + +# Dump table: +foreach(entry ${entries}) + message(STATUS ${entry}) +endforeach() diff -Nru libthrust-1.15.0/cmake/ThrustBuildTargetList.cmake libthrust-1.16.0/cmake/ThrustBuildTargetList.cmake --- libthrust-1.15.0/cmake/ThrustBuildTargetList.cmake 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/cmake/ThrustBuildTargetList.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -313,10 +313,7 @@ add_flag_option(IGNORE_DEPRECATED_CPP_11 "Don't warn about deprecated C++11." OFF) add_flag_option(IGNORE_DEPRECATED_COMPILER "Don't warn about deprecated compilers." OFF) add_flag_option(IGNORE_CUB_VERSION_CHECK "Don't warn about mismatched CUB versions." OFF) - - # By default, suppress deprecation warnings when building our test suite, - ## since we'll need to test deprecated APIs with `-Werror`. 
- add_flag_option(IGNORE_DEPRECATED_API "Don't warn about deprecated Thrust or CUB APIs." ON) + add_flag_option(IGNORE_DEPRECATED_API "Don't warn about deprecated Thrust or CUB APIs." OFF) # Top level meta-target. Makes it easier to just build thrust targets when # building both CUB and Thrust. Add all project files here so IDEs will be diff -Nru libthrust-1.15.0/cmake/ThrustInstallRules.cmake libthrust-1.16.0/cmake/ThrustInstallRules.cmake --- libthrust-1.15.0/cmake/ThrustInstallRules.cmake 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/cmake/ThrustInstallRules.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -5,7 +5,7 @@ set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE) install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust" - TYPE INCLUDE + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" FILES_MATCHING PATTERN "*.h" PATTERN "*.inl" @@ -13,7 +13,15 @@ install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust/cmake/" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/thrust" + PATTERN thrust-header-search EXCLUDE ) +# Need to configure a file to store the infix specified in +# CMAKE_INSTALL_INCLUDEDIR since it can be defined by the user +configure_file("${Thrust_SOURCE_DIR}/thrust/cmake/thrust-header-search.cmake.in" + "${Thrust_BINARY_DIR}/thrust/cmake/thrust-header-search.cmake" + @ONLY) +install(FILES "${Thrust_BINARY_DIR}/thrust/cmake/thrust-header-search.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/thrust") # Depending on how Thrust is configured, CUB's CMake scripts may or may not be # included, so maintain a set of CUB install rules in both projects. By default @@ -22,12 +30,19 @@ option(THRUST_INSTALL_CUB_HEADERS "Include cub headers when installing." ON) if (THRUST_INSTALL_CUB_HEADERS) install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub" - TYPE INCLUDE + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" FILES_MATCHING PATTERN "*.cuh" ) + # Need to configure a file to store THRUST_INSTALL_HEADER_INFIX install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake/" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cub" + PATTERN cub-header-search EXCLUDE ) + configure_file("${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake/cub-header-search.cmake.in" + "${Thrust_BINARY_DIR}/dependencies/cub/cub/cmake/cub-header-search.cmake" + @ONLY) + install(FILES "${Thrust_BINARY_DIR}/dependencies/cub/cub/cmake/cub-header-search.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cub") endif() diff -Nru libthrust-1.15.0/CMakeLists.txt libthrust-1.16.0/CMakeLists.txt --- libthrust-1.15.0/CMakeLists.txt 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/CMakeLists.txt 2022-02-09 20:04:17.000000000 +0000 @@ -49,14 +49,21 @@ option(THRUST_ENABLE_HEADER_TESTING "Test that all public headers compile." "ON") option(THRUST_ENABLE_TESTING "Build Thrust testing suite." "ON") option(THRUST_ENABLE_EXAMPLES "Build Thrust examples." "ON") +option(THRUST_ENABLE_BENCHMARKS "Build Thrust runtime benchmarks." "OFF") option(THRUST_INCLUDE_CUB_CMAKE "Build CUB tests and examples. (Requires CUDA)." "OFF") +# Mark this option as advanced for now. We'll revisit this later once the new +# benchmarks are ready. For now, we just need to expose a way to compile +# bench.cu from CMake for NVIDIA's internal builds. +mark_as_advanced(THRUST_ENABLE_BENCHMARKS) + # Check if we're actually building anything before continuing. If not, no need # to search for deps, etc. This is a common approach for packagers that just # need the install rules. See GH issue NVIDIA/thrust#1211. 
if (NOT (THRUST_ENABLE_HEADER_TESTING OR THRUST_ENABLE_TESTING OR THRUST_ENABLE_EXAMPLES OR + THRUST_ENABLE_BENCHMARKS OR THRUST_INCLUDE_CUB_CMAKE)) return() endif() @@ -118,6 +125,10 @@ add_subdirectory(examples) endif() +if (THRUST_ENABLE_BENCHMARKS) + add_subdirectory(internal/benchmark) +endif() + if (THRUST_INCLUDE_CUB_CMAKE AND THRUST_CUDA_FOUND) set(CUB_IN_THRUST ON) # CUB's path is specified generically to support both GitHub and Perforce diff -Nru libthrust-1.15.0/CODE_OF_CONDUCT.md libthrust-1.16.0/CODE_OF_CONDUCT.md --- libthrust-1.15.0/CODE_OF_CONDUCT.md 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/CODE_OF_CONDUCT.md 2022-02-09 20:04:17.000000000 +0000 @@ -93,4 +93,4 @@ [FAQ]: https://www.contributor-covenant.org/faq [NVIDIA RAPIDS]: https://docs.rapids.ai/resources/conduct/ -[Contributor Covenant]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html +[Contributor Covenant version 1.4]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html diff -Nru libthrust-1.15.0/debian/changelog libthrust-1.16.0/debian/changelog --- libthrust-1.15.0/debian/changelog 2022-06-05 21:19:51.000000000 +0000 +++ libthrust-1.16.0/debian/changelog 2022-06-19 18:27:00.000000000 +0000 @@ -1,3 +1,9 @@ +libthrust (1.16.0-1) unstable; urgency=medium + + * New upstream release 1.16.0. + + -- Andreas Beckmann Sun, 19 Jun 2022 20:27:00 +0200 + libthrust (1.15.0-6) unstable; urgency=medium * Disable cuda tests on !amd64, they always run into timeout. diff -Nru libthrust-1.15.0/debian/rules libthrust-1.16.0/debian/rules --- libthrust-1.15.0/debian/rules 2022-06-05 21:19:51.000000000 +0000 +++ libthrust-1.16.0/debian/rules 2022-06-19 18:27:00.000000000 +0000 @@ -20,6 +20,7 @@ dh_auto_install --destdir=debian/tmp execute_after_dh_auto_install: + $(RM) -v debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/cmake/thrust/thrust-header-search.cmake.in # remove empty directories find debian/tmp -depth -type d -exec sh -c 'd="{}"; rmdir --ignore-fail-on-non-empty "$$d" ; test -d "$$d" || echo "removed $$d"' \; diff -Nru libthrust-1.15.0/debian/tests/upstream-testsuite libthrust-1.16.0/debian/tests/upstream-testsuite --- libthrust-1.15.0/debian/tests/upstream-testsuite 2022-06-05 21:19:51.000000000 +0000 +++ libthrust-1.16.0/debian/tests/upstream-testsuite 2022-06-19 18:27:00.000000000 +0000 @@ -61,6 +61,7 @@ ${host:+-DTHRUST_HOST_SYSTEM_OPTIONS=$host} \ -DTHRUST_DEVICE_SYSTEM_OPTIONS=$device \ -D_THRUST_CMAKE_DIR=/usr/share/cmake/thrust \ + -DTHRUST_INSTALL_CUB_HEADERS=OFF \ -Wno-dev cd $AUTOPKGTEST_TMP diff -Nru libthrust-1.15.0/.gitignore libthrust-1.16.0/.gitignore --- libthrust-1.15.0/.gitignore 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/.gitignore 2022-02-09 20:04:17.000000000 +0000 @@ -2,3 +2,5 @@ .p4config doc/html discrete_voronoi.pgm +*build*/ +.idea/ diff -Nru libthrust-1.15.0/internal/benchmark/bench.cu libthrust-1.16.0/internal/benchmark/bench.cu --- libthrust-1.15.0/internal/benchmark/bench.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/internal/benchmark/bench.cu 2022-02-09 20:04:17.000000000 +0000 @@ -992,7 +992,6 @@ , RegularTrials >::run_experiment(); -#if THRUST_CPP_DIALECT >= 2011 experiment_driver< shuffle_tester , ElementMetaType @@ -1000,7 +999,6 @@ , BaselineTrials , RegularTrials >::run_experiment(); -#endif } /////////////////////////////////////////////////////////////////////////////// diff -Nru libthrust-1.15.0/internal/benchmark/CMakeLists.txt libthrust-1.16.0/internal/benchmark/CMakeLists.txt --- 
libthrust-1.15.0/internal/benchmark/CMakeLists.txt 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/internal/benchmark/CMakeLists.txt 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,29 @@ +if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + # MSVC builds fail at runtime. Benchmarks are linux-only for now. + message(STATUS "Thrust benchmarking is not available on MSVC.") + return() +endif() + +add_custom_target(thrust.all.bench) + +foreach(thrust_target IN LISTS THRUST_TARGETS) + thrust_get_target_property(config_host ${thrust_target} HOST) + thrust_get_target_property(config_device ${thrust_target} DEVICE) + thrust_get_target_property(config_prefix ${thrust_target} PREFIX) + + # Skip non cpp.cuda targets: + if (NOT config_host STREQUAL "CPP" OR + NOT config_device STREQUAL "CUDA") + continue() + endif() + + set(bench_target ${config_prefix}.bench) + + add_executable(${bench_target} bench.cu) + target_link_libraries(${bench_target} PRIVATE ${thrust_target}) + target_include_directories(${bench_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") + thrust_clone_target_properties(${bench_target} ${thrust_target}) + + add_dependencies(thrust.all.bench ${bench_target}) + add_dependencies(${config_prefix}.all ${bench_target}) +endforeach() diff -Nru libthrust-1.15.0/internal/build/common_build.mk libthrust-1.16.0/internal/build/common_build.mk --- libthrust-1.15.0/internal/build/common_build.mk 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/internal/build/common_build.mk 2022-02-09 20:04:17.000000000 +0000 @@ -6,10 +6,6 @@ LIBRARIES += m endif -# Disable our THRUST_DEPRECATED and CUB_DEPRECATED macros for internal -# builds, since we need to build and test our deprecated APIs with -Werror. -CUDACC_FLAGS += -DTHRUST_IGNORE_DEPRECATED_API - include $(ROOTDIR)/thrust/internal/build/common_compiler.mk # Add /bigobj to Windows build flag to workaround building Thrust with debug diff -Nru libthrust-1.15.0/README.md libthrust-1.16.0/README.md --- libthrust-1.15.0/README.md 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/README.md 2022-02-09 20:04:17.000000000 +0000 @@ -1,4 +1,4 @@ - + # Thrust: Code at the speed of light @@ -33,10 +33,10 @@ For CMake-based projects, we provide a CMake package for use with `find_package`. See the [CMake README](thrust/cmake/README.md) for more information. Thrust can also be added via `add_subdirectory` or tools like -the [CMake Package Manager](https://github.com/TheLartians/CPM.cmake). +the [CMake Package Manager](https://github.com/cpm-cmake/CPM.cmake). For non-CMake projects, compile with: -- The Thrust include path (`-I/thrust`) +- The Thrust include path (`-I`) - The CUB include path, if using the CUDA device system (`-I/dependencies/cub/`) - By default, the CPP host system and CUDA device system are used. These can be changed using compiler definitions: @@ -114,33 +114,37 @@ ## CI Status - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + ## Supported Compilers @@ -163,7 +167,8 @@ | Thrust Release | Included In | | ----------------- | --------------------------------------- | -| 1.15.0 | TBD | +| 1.16.0 | TBD | +| 1.15.0 | NVIDIA HPC SDK 22.1 & CUDA Toolkit 11.6 | | 1.14.0 | NVIDIA HPC SDK 21.9 | | 1.13.1 | CUDA Toolkit 11.5 | | 1.13.0 | NVIDIA HPC SDK 21.7 | @@ -234,6 +239,8 @@ ``` By default, a serial `CPP` host system, `CUDA` accelerated device system, and -C++14 standard are used. This can be changed in CMake. 
More information on -configuring your Thrust build and creating a pull request can be found in -[CONTRIBUTING.md](CONTRIBUTING.md). +C++14 standard are used. This can be changed during configuration -- see +[CMake Options](CONTRIBUTING.md#cmake-options) for details. + +More information on configuring your Thrust build and creating a pull request +can be found in [CONTRIBUTING.md](CONTRIBUTING.md). diff -Nru libthrust-1.15.0/testing/cmake/check_source_files.cmake libthrust-1.16.0/testing/cmake/check_source_files.cmake --- libthrust-1.15.0/testing/cmake/check_source_files.cmake 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cmake/check_source_files.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -63,10 +63,12 @@ # The wrappers are allowed to include the unwrapped headers thrust/detail/algorithm_wrapper.h thrust/detail/memory_wrapper.h + thrust/detail/numeric_wrapper.h ) set(algorithm_regex "#[ \t]*include[ \t]+") set(memory_regex "#[ \t]*include[ \t]+") +set(numeric_regex "#[ \t]*include[ \t]+") # Validation check for the above regex pattern: count_substrings([=[ @@ -126,6 +128,7 @@ if (NOT ${src} IN_LIST stdpar_header_exclusions) count_substrings("${src_contents}" "${algorithm_regex}" algorithm_count) count_substrings("${src_contents}" "${memory_regex}" memory_count) + count_substrings("${src_contents}" "${numeric_regex}" numeric_count) if (NOT algorithm_count EQUAL 0) message("'${src}' includes the header. Replace with .") @@ -136,6 +139,11 @@ message("'${src}' includes the header. Replace with .") set(found_errors 1) endif() + + if (NOT numeric_count EQUAL 0) + message("'${src}' includes the header. Replace with .") + set(found_errors 1) + endif() endif() endforeach() diff -Nru libthrust-1.15.0/testing/cuda/copy_if.cu libthrust-1.16.0/testing/cuda/copy_if.cu --- libthrust-1.15.0/testing/cuda/copy_if.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/copy_if.cu 2022-02-09 20:04:17.000000000 +0000 @@ -95,7 +95,14 @@ DECLARE_UNITTEST(TestCopyIfDeviceDevice); -void TestCopyIfCudaStreams() +void TestCopyIfDeviceNoSync() +{ + TestCopyIfDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestCopyIfDeviceNoSync); + +template +void TestCopyIfCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; @@ -111,7 +118,7 @@ cudaStream_t s; cudaStreamCreate(&s); - Vector::iterator end = thrust::copy_if(thrust::cuda::par.on(s), + Vector::iterator end = thrust::copy_if(policy.on(s), data.begin(), data.end(), result.begin(), @@ -124,7 +131,16 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestCopyIfCudaStreams); + +void TestCopyIfCudaStreamsSync(){ + TestCopyIfCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestCopyIfCudaStreamsSync); + +void TestCopyIfCudaStreamsNoSync(){ + TestCopyIfCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestCopyIfCudaStreamsNoSync); template @@ -205,7 +221,15 @@ DECLARE_UNITTEST(TestCopyIfStencilDeviceDevice); -void TestCopyIfStencilCudaStreams() +void TestCopyIfStencilDeviceNoSync() +{ + TestCopyIfStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestCopyIfStencilDeviceNoSync); + + +template +void TestCopyIfStencilCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -229,7 +253,7 @@ cudaStream_t s; cudaStreamCreate(&s); - Vector::iterator end = thrust::copy_if(thrust::cuda::par.on(s), + Vector::iterator end = thrust::copy_if(policy.on(s), data.begin(), data.end(), stencil.begin(), @@ -243,5 +267,17 @@ cudaStreamDestroy(s); } 
-DECLARE_UNITTEST(TestCopyIfStencilCudaStreams); + +void TestCopyIfStencilCudaStreamsSync() +{ + TestCopyIfStencilCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestCopyIfStencilCudaStreamsSync); + + +void TestCopyIfStencilCudaStreamsNoSync() +{ + TestCopyIfStencilCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestCopyIfStencilCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/max_element.cu libthrust-1.16.0/testing/cuda/max_element.cu --- libthrust-1.15.0/testing/cuda/max_element.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/max_element.cu 2022-02-09 20:04:17.000000000 +0000 @@ -67,7 +67,15 @@ DECLARE_UNITTEST(TestMaxElementDeviceDevice); -void TestMaxElementCudaStreams() +void TestMaxElementDeviceNoSync() +{ + TestMaxElementDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestMaxElementDeviceNoSync); + + +template +void TestMaxElementCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -83,15 +91,28 @@ cudaStream_t s; cudaStreamCreate(&s); - ASSERT_EQUAL( *thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end()), 5); - ASSERT_EQUAL( thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end()) - data.begin(), 1); + auto streampolicy = policy.on(s); + + ASSERT_EQUAL( *thrust::max_element(streampolicy, data.begin(), data.end()), 5); + ASSERT_EQUAL( thrust::max_element(streampolicy, data.begin(), data.end()) - data.begin(), 1); - ASSERT_EQUAL( *thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater()), 1); - ASSERT_EQUAL( thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater()) - data.begin(), 2); + ASSERT_EQUAL( *thrust::max_element(streampolicy, data.begin(), data.end(), thrust::greater()), 1); + ASSERT_EQUAL( thrust::max_element(streampolicy, data.begin(), data.end(), thrust::greater()) - data.begin(), 2); cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestMaxElementCudaStreams); + +void TestMaxElementCudaStreamsSync(){ + TestMaxElementCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestMaxElementCudaStreamsSync); + + +void TestMaxElementCudaStreamsNoSync(){ + TestMaxElementCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestMaxElementCudaStreamsNoSync); + void TestMaxElementDevicePointer() { diff -Nru libthrust-1.15.0/testing/cuda/partition.cu libthrust-1.16.0/testing/cuda/partition.cu --- libthrust-1.15.0/testing/cuda/partition.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/partition.cu 2022-02-09 20:04:17.000000000 +0000 @@ -65,6 +65,13 @@ DECLARE_UNITTEST(TestPartitionDeviceDevice); +void TestPartitionDeviceNoSync() +{ + TestPartitionDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestPartitionDeviceNoSync); + + template __global__ void partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result) @@ -125,6 +132,13 @@ DECLARE_UNITTEST(TestPartitionStencilDeviceDevice); +void TestPartitionStencilDeviceNoSync() +{ + TestPartitionStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestPartitionStencilDeviceNoSync); + + template __global__ void partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 true_result, Iterator3 false_result, Predicate pred, Iterator4 result) @@ -188,6 +202,13 @@ DECLARE_UNITTEST(TestPartitionCopyDeviceDevice); +void TestPartitionCopyDeviceNoSync() +{ + TestPartitionCopyDevice(thrust::cuda::par_nosync); +} 
+DECLARE_UNITTEST(TestPartitionCopyDeviceNoSync); + + template __global__ void partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 true_result, Iterator4 false_result, Predicate pred, Iterator5 result) @@ -258,6 +279,13 @@ DECLARE_UNITTEST(TestPartitionCopyStencilDeviceDevice); +void TestPartitionCopyStencilDeviceNoSync() +{ + TestPartitionCopyStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestPartitionCopyStencilDeviceNoSync); + + template __global__ void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result, Iterator3 is_supported) @@ -320,6 +348,13 @@ DECLARE_UNITTEST(TestStablePartitionDeviceDevice); +void TestStablePartitionDeviceNoSync() +{ + TestStablePartitionDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestStablePartitionDeviceNoSync); + + template __global__ void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result, Iterator4 is_supported) @@ -389,6 +424,13 @@ DECLARE_UNITTEST(TestStablePartitionStencilDeviceDevice); +void TestStablePartitionStencilDeviceNoSync() +{ + TestStablePartitionStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestStablePartitionStencilDeviceNoSync); + + template __global__ void stable_partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 true_result, Iterator3 false_result, Predicate pred, Iterator4 result) @@ -452,6 +494,13 @@ DECLARE_UNITTEST(TestStablePartitionCopyDeviceDevice); +void TestStablePartitionCopyDeviceNoSync() +{ + TestStablePartitionCopyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestStablePartitionCopyDeviceNoSync); + + template __global__ void stable_partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 true_result, Iterator4 false_result, Predicate pred, Iterator5 result) @@ -522,7 +571,15 @@ DECLARE_UNITTEST(TestStablePartitionCopyStencilDeviceDevice); -void TestPartitionCudaStreams() +void TestStablePartitionCopyStencilDeviceNoSync() +{ + TestStablePartitionCopyStencilDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestStablePartitionCopyStencilDeviceNoSync); + + +template +void TestPartitionCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -537,8 +594,10 @@ cudaStream_t s; cudaStreamCreate(&s); + + auto streampolicy = policy.on(s); - Iterator iter = thrust::partition(thrust::cuda::par.on(s), data.begin(), data.end(), is_even()); + Iterator iter = thrust::partition(streampolicy, data.begin(), data.end(), is_even()); Vector ref(5); ref[0] = 2; @@ -552,5 +611,17 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestPartitionCudaStreams); + +void TestPartitionCudaStreamsSync() +{ + TestPartitionCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestPartitionCudaStreamsSync); + + +void TestPartitionCudaStreamsNoSync() +{ + TestPartitionCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestPartitionCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/reduce_by_key.cu libthrust-1.16.0/testing/cuda/reduce_by_key.cu --- libthrust-1.15.0/testing/cuda/reduce_by_key.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/reduce_by_key.cu 2022-02-09 20:04:17.000000000 +0000 @@ -191,7 +191,15 @@ DECLARE_UNITTEST(TestReduceByKeyDeviceDevice); -void TestReduceByKeyCudaStreams() +void TestReduceByKeyDeviceNoSync() +{ + 
TestReduceByKeyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestReduceByKeyDeviceNoSync); + + +template +void TestReduceByKeyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -210,7 +218,9 @@ cudaStream_t s; cudaStreamCreate(&s); - new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin()); + auto streampolicy = policy.on(s); + + new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin()); ASSERT_EQUAL(new_last.first - output_keys.begin(), 5); ASSERT_EQUAL(new_last.second - output_values.begin(), 5); @@ -229,7 +239,7 @@ // test BinaryPredicate initialize_keys(keys); initialize_values(values); - new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce()); + new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce()); ASSERT_EQUAL(new_last.first - output_keys.begin(), 3); ASSERT_EQUAL(new_last.second - output_values.begin(), 3); @@ -244,7 +254,7 @@ // test BinaryFunction initialize_keys(keys); initialize_values(values); - new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to(), thrust::plus()); + new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to(), thrust::plus()); ASSERT_EQUAL(new_last.first - output_keys.begin(), 5); ASSERT_EQUAL(new_last.second - output_values.begin(), 5); @@ -262,5 +272,17 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestReduceByKeyCudaStreams); + +void TestReduceByKeyCudaStreamsSync() +{ + TestReduceByKeyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestReduceByKeyCudaStreamsSync); + + +void TestReduceByKeyCudaStreamsNoSync() +{ + TestReduceByKeyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestReduceByKeyCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/reduce.cu libthrust-1.16.0/testing/cuda/reduce.cu --- libthrust-1.15.0/testing/cuda/reduce.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/reduce.cu 2022-02-09 20:04:17.000000000 +0000 @@ -53,7 +53,19 @@ VariableUnitTest TestReduceDeviceDeviceInstance; -void TestReduceCudaStreams() +template +struct TestReduceDeviceNoSync +{ + void operator()(const size_t n) + { + TestReduceDevice(thrust::cuda::par_nosync, n); + } +}; +VariableUnitTest TestReduceDeviceNoSyncInstance; + + +template +void TestReduceCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; @@ -63,13 +75,27 @@ cudaStream_t s; cudaStreamCreate(&s); + auto streampolicy = policy.on(s); + // no initializer - ASSERT_EQUAL(thrust::reduce(thrust::cuda::par.on(s), v.begin(), v.end()), 2); + ASSERT_EQUAL(thrust::reduce(streampolicy, v.begin(), v.end()), 2); // with initializer - ASSERT_EQUAL(thrust::reduce(thrust::cuda::par.on(s), v.begin(), v.end(), 10), 12); + ASSERT_EQUAL(thrust::reduce(streampolicy, v.begin(), v.end(), 10), 12); cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestReduceCudaStreams); + +void TestReduceCudaStreamsSync() +{ + TestReduceCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestReduceCudaStreamsSync); + + +void 
TestReduceCudaStreamsNoSync() +{ + TestReduceCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestReduceCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/set_intersection_by_key.cu libthrust-1.16.0/testing/cuda/set_intersection_by_key.cu --- libthrust-1.15.0/testing/cuda/set_intersection_by_key.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/set_intersection_by_key.cu 2022-02-09 20:04:17.000000000 +0000 @@ -73,7 +73,15 @@ DECLARE_UNITTEST(TestSetIntersectionByKeyDeviceDevice); -void TestSetIntersectionByKeyCudaStreams() +void TestSetIntersectionByKeyDeviceNoSync() +{ + TestSetIntersectionByKeyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestSetIntersectionByKeyDeviceNoSync); + + +template +void TestSetIntersectionByKeyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::iterator Iterator; @@ -95,8 +103,10 @@ cudaStream_t s; cudaStreamCreate(&s); + auto streampolicy = policy.on(s); + thrust::pair end = - thrust::set_intersection_by_key(thrust::cuda::par.on(s), + thrust::set_intersection_by_key(streampolicy, a_key.begin(), a_key.end(), b_key.begin(), b_key.end(), a_val.begin(), @@ -111,5 +121,17 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreams); + +void TestSetIntersectionByKeyCudaStreamsSync() +{ + TestSetIntersectionByKeyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreamsSync); + + +void TestSetIntersectionByKeyCudaStreamsNoSync() +{ + TestSetIntersectionByKeyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/set_intersection.cu libthrust-1.16.0/testing/cuda/set_intersection.cu --- libthrust-1.15.0/testing/cuda/set_intersection.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/set_intersection.cu 2022-02-09 20:04:17.000000000 +0000 @@ -59,7 +59,15 @@ DECLARE_UNITTEST(TestSetIntersectionDeviceDevice); -void TestSetIntersectionCudaStreams() +void TestSetIntersectionDeviceNoSync() +{ + TestSetIntersectionDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestSetIntersectionDeviceNoSync); + + +template +void TestSetIntersectionCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::iterator Iterator; @@ -77,7 +85,9 @@ cudaStream_t s; cudaStreamCreate(&s); - Iterator end = thrust::set_intersection(thrust::cuda::par.on(s), + auto streampolicy = policy.on(s); + + Iterator end = thrust::set_intersection(streampolicy, a.begin(), a.end(), b.begin(), b.end(), result.begin()); @@ -88,5 +98,17 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestSetIntersectionCudaStreams); + +void TestSetIntersectionCudaStreamsSync() +{ + TestSetIntersectionCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestSetIntersectionCudaStreamsSync); + + +void TestSetIntersectionCudaStreamsNoSync() +{ + TestSetIntersectionCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestSetIntersectionCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/unique_by_key.cu libthrust-1.16.0/testing/cuda/unique_by_key.cu --- libthrust-1.15.0/testing/cuda/unique_by_key.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/unique_by_key.cu 2022-02-09 20:04:17.000000000 +0000 @@ -134,7 +134,15 @@ DECLARE_UNITTEST(TestUniqueByKeyDeviceDevice); -void TestUniqueByKeyCudaStreams() +void TestUniqueByKeyDeviceNoSync() +{ + TestUniqueByKeyDevice(thrust::cuda::par_nosync); +} 
+DECLARE_UNITTEST(TestUniqueByKeyDeviceNoSync); + + +template +void TestUniqueByKeyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -150,8 +158,10 @@ cudaStream_t s; cudaStreamCreate(&s); + + auto streampolicy = policy.on(s); - new_last = thrust::unique_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin()); + new_last = thrust::unique_by_key(streampolicy, keys.begin(), keys.end(), values.begin()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last.first - keys.begin(), 5); @@ -171,7 +181,7 @@ // test BinaryPredicate initialize_keys(keys); initialize_values(values); - new_last = thrust::unique_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique()); + new_last = thrust::unique_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique()); ASSERT_EQUAL(new_last.first - keys.begin(), 3); ASSERT_EQUAL(new_last.second - values.begin(), 3); @@ -185,7 +195,19 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestUniqueByKeyCudaStreams); + +void TestUniqueByKeyCudaStreamsSync() +{ + TestUniqueByKeyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestUniqueByKeyCudaStreamsSync); + + +void TestUniqueByKeyCudaStreamsNoSync() +{ + TestUniqueByKeyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueByKeyCudaStreamsNoSync); template @@ -282,7 +304,15 @@ DECLARE_UNITTEST(TestUniqueCopyByKeyDeviceDevice); -void TestUniqueCopyByKeyCudaStreams() +void TestUniqueCopyByKeyDeviceNoSync() +{ + TestUniqueCopyByKeyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCopyByKeyDeviceNoSync); + + +template +void TestUniqueCopyByKeyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -302,7 +332,9 @@ cudaStream_t s; cudaStreamCreate(&s); - new_last = thrust::unique_by_key_copy(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin()); + auto streampolicy = policy.on(s); + + new_last = thrust::unique_by_key_copy(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last.first - output_keys.begin(), 5); @@ -322,7 +354,7 @@ // test BinaryPredicate initialize_keys(keys); initialize_values(values); - new_last = thrust::unique_by_key_copy(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique()); + new_last = thrust::unique_by_key_copy(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last.first - output_keys.begin(), 3); @@ -337,5 +369,17 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreams); + +void TestUniqueCopyByKeyCudaStreamsSync() +{ + TestUniqueCopyByKeyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreamsSync); + + +void TestUniqueCopyByKeyCudaStreamsNoSync() +{ + TestUniqueCopyByKeyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/cuda/unique.cu libthrust-1.16.0/testing/cuda/unique.cu --- libthrust-1.15.0/testing/cuda/unique.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/cuda/unique.cu 2022-02-09 20:04:17.000000000 +0000 @@ -94,7 +94,15 @@ DECLARE_UNITTEST(TestUniqueDeviceDevice); -void 
TestUniqueCudaStreams() +void TestUniqueDeviceNoSync() +{ + TestUniqueDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueDeviceNoSync); + + +template +void TestUniqueCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -116,8 +124,10 @@ cudaStream_t s; cudaStreamCreate(&s); + + auto streampolicy = policy.on(s); - new_last = thrust::unique(thrust::cuda::par.on(s), data.begin(), data.end()); + new_last = thrust::unique(streampolicy, data.begin(), data.end()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last - data.begin(), 7); @@ -129,7 +139,7 @@ ASSERT_EQUAL(data[5], 31); ASSERT_EQUAL(data[6], 37); - new_last = thrust::unique(thrust::cuda::par.on(s), data.begin(), new_last, is_equal_div_10_unique()); + new_last = thrust::unique(streampolicy, data.begin(), new_last, is_equal_div_10_unique()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last - data.begin(), 3); @@ -139,7 +149,19 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestUniqueCudaStreams); + +void TestUniqueCudaStreamsSync() +{ + TestUniqueCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestUniqueCudaStreamsSync); + + +void TestUniqueCudaStreamsNoSync() +{ + TestUniqueCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCudaStreamsNoSync); template @@ -227,7 +249,15 @@ DECLARE_UNITTEST(TestUniqueCopyDeviceDevice); -void TestUniqueCopyCudaStreams() +void TestUniqueCopyDeviceNoSync() +{ + TestUniqueCopyDevice(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCopyDeviceNoSync); + + +template +void TestUniqueCopyCudaStreams(ExecutionPolicy policy) { typedef thrust::device_vector Vector; typedef Vector::value_type T; @@ -251,8 +281,10 @@ cudaStream_t s; cudaStreamCreate(&s); + + auto streampolicy = policy.on(s); - new_last = thrust::unique_copy(thrust::cuda::par.on(s), data.begin(), data.end(), output.begin()); + new_last = thrust::unique_copy(streampolicy, data.begin(), data.end(), output.begin()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last - output.begin(), 7); @@ -264,7 +296,7 @@ ASSERT_EQUAL(output[5], 31); ASSERT_EQUAL(output[6], 37); - new_last = thrust::unique_copy(thrust::cuda::par.on(s), output.begin(), new_last, data.begin(), is_equal_div_10_unique()); + new_last = thrust::unique_copy(streampolicy, output.begin(), new_last, data.begin(), is_equal_div_10_unique()); cudaStreamSynchronize(s); ASSERT_EQUAL(new_last - data.begin(), 3); @@ -274,5 +306,17 @@ cudaStreamDestroy(s); } -DECLARE_UNITTEST(TestUniqueCopyCudaStreams); + +void TestUniqueCopyCudaStreamsSync() +{ + TestUniqueCopyCudaStreams(thrust::cuda::par); +} +DECLARE_UNITTEST(TestUniqueCopyCudaStreamsSync); + + +void TestUniqueCopyCudaStreamsNoSync() +{ + TestUniqueCopyCudaStreams(thrust::cuda::par_nosync); +} +DECLARE_UNITTEST(TestUniqueCopyCudaStreamsNoSync); diff -Nru libthrust-1.15.0/testing/partition.cu libthrust-1.16.0/testing/partition.cu --- libthrust-1.15.0/testing/partition.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/partition.cu 2022-02-09 20:04:17.000000000 +0000 @@ -6,6 +6,12 @@ #include #include +#if defined(THRUST_GCC_VERSION) && \ + THRUST_GCC_VERSION >= 110000 && \ + THRUST_GCC_VERSION < 120000 +#define WAIVE_GCC11_FAILURES +#endif + template struct is_even { @@ -21,6 +27,17 @@ typedef typename Vector::value_type T; typedef typename Vector::iterator Iterator; + // GCC 11 miscompiles and segfaults for certain versions of this test. + // It's not reproducible on other compilers, and the test passes when + // optimizations are disabled. 
It only affects 32-bit value types, and + // impacts all CPU host/device combinations tested. +#ifdef WAIVE_GCC11_FAILURES + if (sizeof(T) == 4) + { + return; + } +#endif + Vector data(5); data[0] = 1; data[1] = 2; @@ -321,6 +338,17 @@ { void operator()(const size_t n) { + // GCC 11 miscompiles and segfaults for certain versions of this test. + // It's not reproducible on other compilers, and the test passes when + // optimizations are disabled. It only affects 32-bit value types, and + // impacts all CPU host/device combinations tested. +#ifdef WAIVE_GCC11_FAILURES + if (n == 0 && sizeof(T) == 4) + { + return; + } +#endif + // setup ranges thrust::host_vector h_data = unittest::random_integers(n); thrust::host_vector h_stencil = unittest::random_integers(n); @@ -689,6 +717,17 @@ { void operator()(const size_t n) { + // GCC 11 miscompiles and segfaults for certain versions of this test. + // It's not reproducible on other compilers, and the test passes when + // optimizations are disabled. It only affects 32-bit value types, and + // impacts all CPU host/device combinations tested. +#ifdef WAIVE_GCC11_FAILURES + if (n == 0 && sizeof(T) == 4) + { + return; + } +#endif + // setup ranges thrust::host_vector h_data = unittest::random_integers(n); thrust::device_vector d_data = h_data; @@ -708,6 +747,17 @@ { void operator()(const size_t n) { + // GCC 11 miscompiles and segfaults for certain versions of this test. + // It's not reproducible on other compilers, and the test passes when + // optimizations are disabled. It only affects 32-bit value types, and + // impacts all CPU host/device combinations tested. +#ifdef WAIVE_GCC11_FAILURES + if (n == 0 && sizeof(T) == 4) + { + return; + } +#endif + // setup ranges thrust::host_vector h_data = unittest::random_integers(n); thrust::host_vector h_stencil = unittest::random_integers(n); diff -Nru libthrust-1.15.0/testing/scan.cu libthrust-1.16.0/testing/scan.cu --- libthrust-1.15.0/testing/scan.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/scan.cu 2022-02-09 20:04:17.000000000 +0000 @@ -26,6 +26,17 @@ void TestScanSimple(void) { typedef typename Vector::value_type T; + + // icc miscompiles the intermediate sum updates for custom_numeric. + // The issue doesn't happen with opts disabled, or on other compilers. + // Printing the intermediate sum each iteration "fixes" the issue, + // so likely a bad optimization. +#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_INTEL + if (std::is_same::value) + { + return; + } +#endif typename Vector::iterator iter; diff -Nru libthrust-1.15.0/testing/sequence.cu libthrust-1.16.0/testing/sequence.cu --- libthrust-1.15.0/testing/sequence.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/sequence.cu 2022-02-09 20:04:17.000000000 +0000 @@ -139,10 +139,21 @@ }; // Vector-Vector addition -__host__ __device__ Vector operator+(const Vector a, const Vector b) { return Vector{a.x + b.x, a.y + b.y}; } +__host__ __device__ Vector operator+(const Vector a, const Vector b) +{ + return Vector{a.x + b.x, a.y + b.y}; +} + // Vector-Scalar Multiplication -__host__ __device__ Vector operator*(const int a, const Vector b) { return Vector{a * b.x, a * b.y}; } -__host__ __device__ Vector operator*(const Vector b, const int a) { return Vector{a * b.x, a * b.y}; } +// Multiplication by std::size_t is required by thrust::sequence. 
+__host__ __device__ Vector operator*(const std::size_t a, const Vector b) +{ + return Vector{static_cast(a) * b.x, static_cast(a) * b.y}; +} +__host__ __device__ Vector operator*(const Vector b, const std::size_t a) +{ + return Vector{static_cast(a) * b.x, static_cast(a) * b.y}; +} void TestSequenceNoSizeTConversion() { diff -Nru libthrust-1.15.0/testing/shuffle.cu libthrust-1.16.0/testing/shuffle.cu --- libthrust-1.15.0/testing/shuffle.cu 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/shuffle.cu 2022-02-09 20:04:17.000000000 +0000 @@ -1,6 +1,5 @@ #include -#if THRUST_CPP_DIALECT >= 2011 #include #include #include @@ -383,7 +382,7 @@ thrust::system::detail::generic::feistel_bijection host_f(m, host_g); thrust::system::detail::generic::feistel_bijection device_f(m, device_g); - if (host_f.nearest_power_of_two() >= std::numeric_limits::max() || m == 0) { + if (static_cast(host_f.nearest_power_of_two()) >= static_cast(std::numeric_limits::max()) || m == 0) { return; } @@ -410,17 +409,17 @@ void TestBijectionLength() { thrust::default_random_engine g(0xD5); - uint64_t m = 3; + uint64_t m = 31; thrust::system::detail::generic::feistel_bijection f(m, g); - ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(4)); + ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(32)); - m = 2; + m = 32; f = thrust::system::detail::generic::feistel_bijection(m, g); - ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(2)); + ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(32)); - m = 0; + m = 1; f = thrust::system::detail::generic::feistel_bijection(m, g); - ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(1)); + ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(16)); } DECLARE_UNITTEST(TestBijectionLength); @@ -515,7 +514,7 @@ thrust::host_vector h_results; Vector sequence(shuffle_size); thrust::sequence(sequence.begin(), sequence.end(), 0); - thrust::default_random_engine g(0xD5); + thrust::default_random_engine g(0xD6); for (auto i = 0ull; i < num_samples; i++) { thrust::shuffle(sequence.begin(), sequence.end(), g); thrust::host_vector tmp(sequence.begin(), sequence.end()); @@ -561,7 +560,7 @@ const uint64_t shuffle_sizes[] = {10, 100, 500}; thrust::default_random_engine g(0xD5); for (auto shuffle_size : shuffle_sizes) { - if(shuffle_size > std::numeric_limits::max()) + if(shuffle_size > (uint64_t)std::numeric_limits::max()) continue; const uint64_t num_samples = shuffle_size == 500 ? 
1000 : 200; @@ -601,4 +600,3 @@ } } DECLARE_INTEGRAL_VECTOR_UNITTEST(TestShuffleEvenDistribution); -#endif diff -Nru libthrust-1.15.0/testing/unittest/system.h libthrust-1.16.0/testing/unittest/system.h --- libthrust-1.15.0/testing/unittest/system.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/testing/unittest/system.h 2022-02-09 20:04:17.000000000 +0000 @@ -12,7 +12,7 @@ namespace unittest { -#if __GNUC__ && !__NVCOMPILER_CUDA__ +#if __GNUC__ && !_NVHPC_CUDA inline std::string demangle(const char* name) { int status = 0; diff -Nru libthrust-1.15.0/thrust/cmake/thrust-config.cmake libthrust-1.16.0/thrust/cmake/thrust-config.cmake --- libthrust-1.15.0/thrust/cmake/thrust-config.cmake 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/cmake/thrust-config.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -497,7 +497,7 @@ macro(_thrust_find_CUDA required) if (NOT TARGET Thrust::CUDA) thrust_debug("Searching for CUB ${required}" internal) - find_package(CUB CONFIG + find_package(CUB ${THRUST_VERSION} CONFIG ${_THRUST_QUIET_FLAG} ${required} NO_DEFAULT_PATH # Only check the explicit HINTS below: diff -Nru libthrust-1.15.0/thrust/cmake/thrust-config-version.cmake libthrust-1.16.0/thrust/cmake/thrust-config-version.cmake --- libthrust-1.15.0/thrust/cmake/thrust-config-version.cmake 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/cmake/thrust-config-version.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -1,12 +1,6 @@ # Parse version information from version.h: -unset(_THRUST_VERSION_INCLUDE_DIR CACHE) # Clear old result to force search -find_path(_THRUST_VERSION_INCLUDE_DIR thrust/version.h - NO_DEFAULT_PATH # Only search explicit paths below: - PATHS - ${CMAKE_CURRENT_LIST_DIR}/../.. # Source tree - ${CMAKE_CURRENT_LIST_DIR}/../../../include # Install tree -) -set_property(CACHE _THRUST_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL) +include("${CMAKE_CURRENT_LIST_DIR}/thrust-header-search.cmake") + file(READ "${_THRUST_VERSION_INCLUDE_DIR}/thrust/version.h" THRUST_VERSION_HEADER) string(REGEX MATCH "#define[ \t]+THRUST_VERSION[ \t]+([0-9]+)" DUMMY "${THRUST_VERSION_HEADER}") set(THRUST_VERSION_FLAT ${CMAKE_MATCH_1}) @@ -26,11 +20,12 @@ set(PACKAGE_VERSION_UNSUITABLE FALSE) if(PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION) - if(PACKAGE_FIND_VERSION_MAJOR STREQUAL THRUST_VERSION_MAJOR) + if(THRUST_VERSION_MAJOR VERSION_EQUAL PACKAGE_FIND_VERSION_MAJOR AND + THRUST_VERSION_MINOR VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MINOR) set(PACKAGE_VERSION_COMPATIBLE TRUE) endif() - if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION) + if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION) set(PACKAGE_VERSION_EXACT TRUE) endif() endif() diff -Nru libthrust-1.15.0/thrust/cmake/thrust-header-search.cmake libthrust-1.16.0/thrust/cmake/thrust-header-search.cmake --- libthrust-1.15.0/thrust/cmake/thrust-header-search.cmake 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/thrust/cmake/thrust-header-search.cmake 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,8 @@ +# Parse version information from version.h: +unset(_THRUST_VERSION_INCLUDE_DIR CACHE) # Clear old result to force search +find_path(_THRUST_VERSION_INCLUDE_DIR thrust/version.h + NO_DEFAULT_PATH # Only search explicit paths below: + PATHS + "${CMAKE_CURRENT_LIST_DIR}/../.." 
# Source tree +) +set_property(CACHE _THRUST_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL) diff -Nru libthrust-1.15.0/thrust/cmake/thrust-header-search.cmake.in libthrust-1.16.0/thrust/cmake/thrust-header-search.cmake.in --- libthrust-1.15.0/thrust/cmake/thrust-header-search.cmake.in 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/thrust/cmake/thrust-header-search.cmake.in 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,8 @@ +# Parse version information from version.h: +unset(_THRUST_VERSION_INCLUDE_DIR CACHE) # Clear old result to force search +find_path(_THRUST_VERSION_INCLUDE_DIR thrust/version.h + NO_DEFAULT_PATH # Only search explicit paths below: + PATHS + "${CMAKE_CURRENT_LIST_DIR}/../../../@CMAKE_INSTALL_INCLUDEDIR@" +) +set_property(CACHE _THRUST_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL) diff -Nru libthrust-1.15.0/thrust/detail/allocator/temporary_allocator.inl libthrust-1.16.0/thrust/detail/allocator/temporary_allocator.inl --- libthrust-1.15.0/thrust/detail/allocator/temporary_allocator.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/allocator/temporary_allocator.inl 2022-02-09 20:04:17.000000000 +0000 @@ -20,7 +20,7 @@ #include #include -#if (defined(__NVCOMPILER_CUDA__) || defined(__CUDA_ARCH__)) && \ +#if (defined(_NVHPC_CUDA) || defined(__CUDA_ARCH__)) && \ THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA #include #endif diff -Nru libthrust-1.15.0/thrust/detail/complex/c99math.h libthrust-1.16.0/thrust/detail/complex/c99math.h --- libthrust-1.15.0/thrust/detail/complex/c99math.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/complex/c99math.h 2022-02-09 20:04:17.000000000 +0000 @@ -102,7 +102,7 @@ #else -# if defined(__CUDACC__) && !(defined(__CUDA__) && defined(__clang__)) && !defined(__NVCOMPILER_CUDA__) +# if defined(__CUDACC__) && !(defined(__CUDA__) && defined(__clang__)) && !defined(_NVHPC_CUDA) // NVCC implements at least some signature of these as functions not macros. using ::isinf; using ::isnan; @@ -141,7 +141,7 @@ -#if !defined(__CUDACC__) && !defined(__NVCOMPILER_CUDA__) +#if !defined(__CUDACC__) && !defined(_NVHPC_CUDA) // Simple approximation to log1p as Visual Studio is lacking one inline double log1p(double x){ diff -Nru libthrust-1.15.0/thrust/detail/config/compiler.h libthrust-1.16.0/thrust/detail/config/compiler.h --- libthrust-1.15.0/thrust/detail/config/compiler.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/config/compiler.h 2022-02-09 20:04:17.000000000 +0000 @@ -58,7 +58,7 @@ #endif // THRUST_HOST_COMPILER // figure out which device compiler we're using -#if defined(__CUDACC__) || defined(__NVCOMPILER_CUDA__) +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) #define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_NVCC #elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC #define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_MSVC diff -Nru libthrust-1.15.0/thrust/detail/config/cpp_compatibility.h libthrust-1.16.0/thrust/detail/config/cpp_compatibility.h --- libthrust-1.15.0/thrust/detail/config/cpp_compatibility.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/config/cpp_compatibility.h 2022-02-09 20:04:17.000000000 +0000 @@ -39,7 +39,7 @@ // FIXME: Combine THRUST_INLINE_CONSTANT and // THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT into one macro when NVCC properly // supports `constexpr` globals in host and device code. 
-#if defined(__CUDA_ARCH__) || defined(__NVCOMPILER_CUDA__) +#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) // FIXME: Add this when NVCC supports inline variables. //# if THRUST_CPP_DIALECT >= 2017 //# define THRUST_INLINE_CONSTANT inline constexpr @@ -65,7 +65,7 @@ # endif #endif -#if defined(__NVCOMPILER_CUDA__) +#if defined(_NVHPC_CUDA) # define THRUST_IS_DEVICE_CODE __builtin_is_device_code() # define THRUST_IS_HOST_CODE (!__builtin_is_device_code()) # define THRUST_INCLUDE_DEVICE_CODE 1 diff -Nru libthrust-1.15.0/thrust/detail/config/exec_check_disable.h libthrust-1.16.0/thrust/detail/config/exec_check_disable.h --- libthrust-1.15.0/thrust/detail/config/exec_check_disable.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/config/exec_check_disable.h 2022-02-09 20:04:17.000000000 +0000 @@ -25,7 +25,7 @@ // #pragma nv_exec_check_disable is only recognized by NVCC. Having a macro // expand to a #pragma (rather than _Pragma) only works with NVCC's compilation // model, not with other compilers. -#if defined(__CUDACC__) && !defined(__NVCOMPILER_CUDA__) && \ +#if defined(__CUDACC__) && !defined(_NVHPC_CUDA) && \ !(defined(__CUDA__) && defined(__clang__)) #if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC diff -Nru libthrust-1.15.0/thrust/detail/config/forceinline.h libthrust-1.16.0/thrust/detail/config/forceinline.h --- libthrust-1.15.0/thrust/detail/config/forceinline.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/config/forceinline.h 2022-02-09 20:04:17.000000000 +0000 @@ -22,7 +22,7 @@ #include -#if defined(__CUDACC__) || defined(__NVCOMPILER_CUDA__) +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) #define __thrust_forceinline__ __forceinline__ diff -Nru libthrust-1.15.0/thrust/detail/cstdint.h libthrust-1.16.0/thrust/detail/cstdint.h --- libthrust-1.15.0/thrust/detail/cstdint.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/cstdint.h 2022-02-09 20:04:17.000000000 +0000 @@ -18,7 +18,9 @@ #include -#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) || (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG) +#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) || \ + (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG) || \ + (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_INTEL) #include #endif diff -Nru libthrust-1.15.0/thrust/detail/numeric_wrapper.h libthrust-1.16.0/thrust/detail/numeric_wrapper.h --- libthrust-1.15.0/thrust/detail/numeric_wrapper.h 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/thrust/detail/numeric_wrapper.h 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,27 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +// When a compiler uses Thrust as part of its implementation of Standard C++ +// algorithms, a cycle of included files may result when Thrust code tries to +// use a standard algorithm. 
Having a macro that is defined only when Thrust +// is including an algorithms-related header gives the compiler a chance to +// detect and break the cycle of includes. + +#define THRUST_INCLUDING_ALGORITHMS_HEADER +#include +#undef THRUST_INCLUDING_ALGORITHMS_HEADER diff -Nru libthrust-1.15.0/thrust/device_allocator.h libthrust-1.16.0/thrust/device_allocator.h --- libthrust-1.15.0/thrust/device_allocator.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/device_allocator.h 2022-02-09 20:04:17.000000000 +0000 @@ -119,7 +119,7 @@ device_allocator() {} /*! Copy constructor has no effect. */ - __host__ + __host__ __device__ device_allocator(const device_allocator& other) : base(other) {} /*! Constructor from other \p device_allocator has no effect. */ diff -Nru libthrust-1.15.0/thrust/random/detail/normal_distribution_base.h libthrust-1.16.0/thrust/random/detail/normal_distribution_base.h --- libthrust-1.15.0/thrust/random/detail/normal_distribution_base.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/random/detail/normal_distribution_base.h 2022-02-09 20:04:17.000000000 +0000 @@ -135,7 +135,7 @@ template struct normal_distribution_base { -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC && !defined(__NVCOMPILER_CUDA__) +#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC && !defined(_NVHPC_CUDA) typedef normal_distribution_nvcc type; #else typedef normal_distribution_portable type; diff -Nru libthrust-1.15.0/thrust/scan.h libthrust-1.16.0/thrust/scan.h --- libthrust-1.15.0/thrust/scan.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/scan.h 2022-02-09 20:04:17.000000000 +0000 @@ -44,12 +44,16 @@ * This version of \p inclusive_scan assumes plus as the associative operator. * When the input and output sequences are the same, the scan is performed * in-place. - + * * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary * difference between the two functions is that \c std::partial_sum guarantees * a serial summation order, while \p inclusive_scan requires associativity of * the binary operation to parallelize the prefix sum. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -106,12 +110,16 @@ * This version of \p inclusive_scan assumes plus as the associative operator. * When the input and output sequences are the same, the scan is performed * in-place. - + * * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary * difference between the two functions is that \c std::partial_sum guarantees * a serial summation order, while \p inclusive_scan requires associativity of * the binary operation to parallelize the prefix sum. - * + * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first The beginning of the input sequence. * \param last The end of the input sequence. * \param result The beginning of the output sequence. @@ -154,12 +162,16 @@ * term 'inclusive' means that each result includes the corresponding * input operand in the partial sum. When the input and output sequences * are the same, the scan is performed in-place. 
- * + * * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary * difference between the two functions is that \c std::partial_sum guarantees * a serial summation order, while \p inclusive_scan requires associativity of * the binary operation to parallelize the prefix sum. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -220,6 +232,10 @@ * a serial summation order, while \p inclusive_scan requires associativity of * the binary operation to parallelize the prefix sum. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first The beginning of the input sequence. * \param last The end of the input sequence. * \param result The beginning of the output sequence. @@ -271,6 +287,10 @@ * associative operator and \c 0 as the initial value. When the input and * output sequences are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -326,7 +346,11 @@ * and so on. This version of \p exclusive_scan assumes plus as the * associative operator and \c 0 as the initial value. When the input and * output sequences are the same, the scan is performed in-place. - * + * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first The beginning of the input sequence. * \param last The end of the input sequence. * \param result The beginning of the output sequence. @@ -373,6 +397,10 @@ * operator but requires an initial value \p init. When the input and * output sequences are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -430,6 +458,10 @@ * operator but requires an initial value \p init. When the input and * output sequences are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first The beginning of the input sequence. * \param last The end of the input sequence. * \param result The beginning of the output sequence. @@ -478,6 +510,10 @@ * operator and an initial value \p init. When the input and output * sequences are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. 
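The "not deterministic for pseudo-associative operators" note added to each scan overload above comes down to floating-point rounding: regrouping the same operands can change the result, so a parallel prefix sum that regroups partial sums may vary from run to run. A self-contained illustration (an editor's example, not from the patch):

```cpp
// Why the scan documentation calls floating-point addition "pseudo-associative":
// regrouping identical operands changes the rounded result.
#include <cstdio>

int main()
{
  const float a = 1.0e8f, b = -1.0e8f, c = 1.0f;

  const float grouped_left  = (a + b) + c; // (0) + 1            -> 1
  const float grouped_right = a + (b + c); // b + c rounds to -1.0e8f -> 0

  std::printf("(a+b)+c = %g, a+(b+c) = %g\n", grouped_left, grouped_right);
  return 0;
}
```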
* * \param exec The execution policy to use for parallelization. @@ -545,7 +581,11 @@ * and so on. This version of the function requires both an associative * operator and an initial value \p init. When the input and output * sequences are the same, the scan is performed in-place. - * + * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first The beginning of the input sequence. * \param last The end of the input sequence. * \param result The beginning of the output sequence. @@ -618,6 +658,10 @@ * operator used to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -687,6 +731,10 @@ * operator used to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first1 The beginning of the key sequence. * \param last1 The end of the key sequence. * \param first2 The beginning of the input value sequence. @@ -746,6 +794,10 @@ * operator used to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -822,6 +874,10 @@ * operator used to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first1 The beginning of the key sequence. * \param last1 The end of the key sequence. * \param first2 The beginning of the input value sequence. @@ -888,6 +944,10 @@ * \c binary_op to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -967,6 +1027,10 @@ * belong to the same segment if binary_pred(*i, *(i+1)) is true, and belong to * different segments otherwise. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * This version of \p inclusive_scan_by_key uses the associative operator * \c binary_op to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. 
@@ -1042,6 +1106,10 @@ * belong to the same segment if *i == *(i+1), and belong to * different segments otherwise. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * Refer to the most general form of \p exclusive_scan_by_key for additional details. * * The algorithm's execution is parallelized as determined by \p exec. @@ -1101,6 +1169,10 @@ * belong to the same segment if *i == *(i+1), and belong to * different segments otherwise. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * Refer to the most general form of \p exclusive_scan_by_key for additional details. * * \param first1 The beginning of the key sequence. @@ -1146,6 +1218,10 @@ * This version of \p exclusive_scan_by_key uses the value \c init to * initialize the exclusive scan operation. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -1206,6 +1282,10 @@ * This version of \p exclusive_scan_by_key uses the value \c init to * initialize the exclusive scan operation. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first1 The beginning of the key sequence. * \param last1 The end of the key sequence. * \param first2 The beginning of the input value sequence. @@ -1262,6 +1342,10 @@ * i+1 in the range [first1, last1) belong to the same segment if * binary_pred(*i, *(i+1)) is true, and belong to different segments otherwise. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -1332,6 +1416,10 @@ * i+1 in the range [first1, last1) belong to the same segment if * binary_pred(*i, *(i+1)) is true, and belong to different segments otherwise. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first1 The beginning of the key sequence. * \param last1 The end of the key sequence. * \param first2 The beginning of the input value sequence. @@ -1397,6 +1485,10 @@ * \c binary_op to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. * + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * The algorithm's execution is parallelized as determined by \p exec. * * \param exec The execution policy to use for parallelization. @@ -1487,6 +1579,10 @@ * \c binary_op to perform the prefix sum. When the input and output sequences * are the same, the scan is performed in-place. 
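As a compact illustration of the segmented (by-key) scan semantics described in the documentation above, the following sketch uses only the public `thrust::*_scan_by_key` API on host data (an editor's example, not taken from the patch; expected outputs are noted in comments):

```cpp
// Segmented scan semantics: consecutive equal keys form a segment, and each
// segment is scanned independently.
#include <thrust/scan.h>
#include <cstdio>

int main()
{
  const int keys[6] = {1, 1, 2, 2, 2, 3};
  const int vals[6] = {1, 2, 3, 4, 5, 6};
  int inc[6];
  int exc[6];

  thrust::inclusive_scan_by_key(keys, keys + 6, vals, inc); // 1 3 3 7 12 6
  thrust::exclusive_scan_by_key(keys, keys + 6, vals, exc); // 0 1 0 3  7 0

  for (int i = 0; i < 6; ++i)
    std::printf("%d %d\n", inc[i], exc[i]);
  return 0;
}
```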
* + * Results are not deterministic for pseudo-associative operators (e.g., + * addition of floating-point types). Results for pseudo-associative + * operators may vary from run to run. + * * \param first1 The beginning of the key sequence. * \param last1 The end of the key sequence. * \param first2 The beginning of the input value sequence. diff -Nru libthrust-1.15.0/thrust/system/cuda/config.h libthrust-1.16.0/thrust/system/cuda/config.h --- libthrust-1.15.0/thrust/system/cuda/config.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/config.h 2022-02-09 20:04:17.000000000 +0000 @@ -32,7 +32,7 @@ // older releases. This header will always pull in version info: #include -#if defined(__CUDACC__) || defined(__NVCOMPILER_CUDA__) +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) # if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) # define __THRUST_HAS_CUDART__ 1 # define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__ diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/adjacent_difference.h libthrust-1.16.0/thrust/system/cuda/detail/adjacent_difference.h --- libthrust-1.15.0/thrust/system/cuda/detail/adjacent_difference.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/adjacent_difference.h 2022-02-09 20:04:17.000000000 +0000 @@ -210,7 +210,6 @@ Size tile_base) { input_type input[ITEMS_PER_THREAD]; - input_type input_prev[ITEMS_PER_THREAD]; output_type output[ITEMS_PER_THREAD]; if (IS_LAST_TILE) @@ -234,7 +233,7 @@ if (IS_FIRST_TILE) { BlockAdjacentDifference(temp_storage.discontinuity) - .FlagHeads(output, input, input_prev, binary_op); + .SubtractLeft(input, output, binary_op); if (threadIdx.x == 0) output[0] = input[0]; } @@ -242,7 +241,7 @@ { input_type tile_prev_input = first_tile_previous[tile_idx]; BlockAdjacentDifference(temp_storage.discontinuity) - .FlagHeads(output, input, input_prev, binary_op, tile_prev_input); + .SubtractLeft(input, output, binary_op, tile_prev_input); } core::sync_threadblock(); @@ -467,7 +466,7 @@ num_items_fixed, stream, debug_sync)); cuda_cub::throw_on_error(status, "adjacent_difference failed on 2nd step"); - status = cuda_cub::synchronize(policy); + status = cuda_cub::synchronize_optional(policy); cuda_cub::throw_on_error(status, "adjacent_difference failed to synchronize"); return result + num_items; diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/binary_search.h libthrust-1.16.0/thrust/system/cuda/detail/binary_search.h --- libthrust-1.15.0/thrust/system/cuda/detail/binary_search.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/binary_search.h 2022-02-09 20:04:17.000000000 +0000 @@ -1,782 +1,19 @@ -/****************************************************************************** - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ -#pragma once - -#if 0 - -#include - -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#if 1 -# define BS_SIMPLE -#endif - -THRUST_NAMESPACE_BEGIN -namespace cuda_cub { - -namespace __binary_search { - - template - struct lbf - { - typedef typename iterator_traits::difference_type result_type; - typedef typename iterator_traits::value_type T; - - template - THRUST_DEVICE_FUNCTION result_type - operator()(It begin, It end, T const& value, CompareOp comp) - { - return system::detail::generic::scalar::lower_bound(begin, - end, - value, - comp) - - begin; - } - }; // struct lbf - - template - struct ubf - { - typedef typename iterator_traits::difference_type result_type; - typedef typename iterator_traits::value_type T; - - template - THRUST_DEVICE_FUNCTION result_type - operator()(It begin, It end, T const& value, CompareOp comp) - { - return system::detail::generic::scalar::upper_bound(begin, - end, - value, - comp) - - begin; - } - }; // struct ubf - - template - struct bsf - { - typedef bool result_type; - typedef typename iterator_traits::value_type T; - - template - THRUST_DEVICE_FUNCTION bool - operator()(It begin, It end, T const& value, CompareOp comp) - { - HaystackIt iter = system::detail::generic::scalar::lower_bound(begin, - end, - value, - comp); - - detail::wrapped_function wrapped_comp(comp); - - return iter != end && !wrapped_comp(value, *iter); - } - }; // struct bsf - - template - THRUST_DEVICE_FUNCTION Size - merge_path(KeysIt1 keys1, - KeysIt2 keys2, - Size keys1_count, - Size keys2_count, - Size diag, - BinaryPred binary_pred) - { - typedef typename iterator_traits::value_type key1_type; - typedef typename iterator_traits::value_type key2_type; - - Size keys1_begin = thrust::max(0, diag - keys2_count); - Size keys1_end = thrust::min(diag, keys1_count); - - while (keys1_begin < keys1_end) - { - Size mid = (keys1_begin + keys1_end) >> 1; - key1_type key1 = keys1[mid]; - key2_type key2 = keys2[diag - 1 - mid]; - bool pred = binary_pred(key2, key1); - if (pred) - { - keys1_end = mid; - } - else - { - keys1_begin = mid + 1; - } - } - return keys1_begin; - } - - template - THRUST_DEVICE_FUNCTION void - serial_merge(It keys_shared, - int keys1_beg, - int keys2_beg, - int keys1_count, - int keys2_count, - T2 (&output)[ITEMS_PER_THREAD], - int (&indices)[ITEMS_PER_THREAD], - CompareOp compare_op) - { - int keys1_end = keys1_beg 
+ keys1_count; - int keys2_end = keys2_beg + keys2_count; - - typedef typename iterator_value::type key_type; - - key_type key1 = keys_shared[keys1_beg]; - key_type key2 = keys_shared[keys2_beg]; - - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - bool p = (keys2_beg < keys2_end) && - ((keys1_beg >= keys1_end) || - compare_op(key2,key1)); - - output[ITEM] = p ? key2 : key1; - indices[ITEM] = p ? keys2_beg++ : keys1_beg++; - - if (p) - { - key2 = keys_shared[keys2_beg]; - } - else - { - key1 = keys_shared[keys1_beg]; - } - } - } - - template - struct PtxPolicy - { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD - }; - - static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; - }; // PtxPolicy - - template - struct Tuning; - - template - struct Tuning - { - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_STORE_TRANSPOSE> - type; - }; - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_STORE_WARP_TRANSPOSE> - type; - }; - - template - struct VectorizedBinarySearchAgent - { - typedef typename iterator_traits::value_type needle_type; - typedef typename iterator_traits::value_type haystack_type; - typedef typename SearchOp::result_type result_type; - - template - struct PtxPlan : Tuning::type - { - typedef Tuning tuning; - - typedef typename core::LoadIterator::type NeedlesLoadIt; - typedef typename core::LoadIterator::type HaystackLoadIt; - - typedef typename core::BlockLoad::type BlockLoadNeedles; - - typedef typename core::BlockStore::type BlockStoreResult; - - union TempStorage - { - typename BlockLoadNeedles::TempStorage load_needles; - typename BlockStoreResult::TempStorage store_result; - -#ifndef BS_SIMPLE - core::uninitialized_array needles_shared; - core::uninitialized_array result_shared; - core::uninitialized_array indices_shared; -#endif - }; // union TempStorage - }; - - typedef typename core::specialize_plan_msvc10_war::type::type ptx_plan; - - typedef typename ptx_plan::NeedlesLoadIt NeedlesLoadIt; - typedef typename ptx_plan::HaystackLoadIt HaystackLoadIt; - typedef typename ptx_plan::BlockLoadNeedles BlockLoadNeedles; - typedef typename ptx_plan::BlockStoreResult BlockStoreResult; - typedef typename ptx_plan::TempStorage TempStorage; - - enum - { - ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD, - BLOCK_THREADS = ptx_plan::BLOCK_THREADS, - ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE - }; - - struct impl - { - TempStorage& storage; - NeedlesLoadIt needles_load_it; - HaystackLoadIt haystack_load_it; - Size needles_count; - Size haystack_size; - OutputIt result; - CompareOp compare_op; - SearchOp search_op; - - THRUST_DEVICE_FUNCTION - void stable_odd_even_sort(needle_type (&needles)[ITEMS_PER_THREAD], - int (&indices)[ITEMS_PER_THREAD]) - { -#pragma unroll - for (int I = 0; I < 
ITEMS_PER_THREAD; ++I) - { -#pragma unroll - for (int J = 1 & I; J < ITEMS_PER_THREAD - 1; J += 2) - { - if (compare_op(needles[J + 1], needles[J])) - { - using thrust::swap; - swap(needles[J], needles[J + 1]); - swap(indices[J], indices[J + 1]); - } - } // inner loop - } // outer loop - } - - THRUST_DEVICE_FUNCTION void - block_mergesort(int tid, - int count, - needle_type (&needles_loc)[ITEMS_PER_THREAD], - int (&indices_loc)[ITEMS_PER_THREAD]) - { - using core::sync_threadblock; - - // stable sort items in a single thread - // - stable_odd_even_sort(needles_loc,indices_loc); - - // each thread has sorted keys_loc - // merge sort keys_loc in shared memory - // -#pragma unroll - for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2) - { - sync_threadblock(); - - // store keys in shmem - // -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM; - storage.needles_shared[idx] = needles_loc[ITEM]; - } - - sync_threadblock(); - - int indices[ITEMS_PER_THREAD]; - - int list = ~(coop - 1) & tid; - int start = ITEMS_PER_THREAD * list; - int size = ITEMS_PER_THREAD * (coop >> 1); - - int diag = min(count, ITEMS_PER_THREAD * ((coop - 1) & tid)); +/* +* Copyright 2021 NVIDIA Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ - int keys1_beg = min(count, start); - int keys1_end = min(count, keys1_beg + size); - int keys2_beg = keys1_end; - int keys2_end = min(count, keys2_beg + size); - - int keys1_count = keys1_end - keys1_beg; - int keys2_count = keys2_end - keys2_beg; - - int partition_diag = merge_path(&storage.needles_shared[keys1_beg], - &storage.needles_shared[keys2_beg], - keys1_count, - keys2_count, - diag, - compare_op); - - int keys1_beg_loc = keys1_beg + partition_diag; - int keys1_end_loc = keys1_end; - int keys2_beg_loc = keys2_beg + diag - partition_diag; - int keys2_end_loc = keys2_end; - int keys1_count_loc = keys1_end_loc - keys1_beg_loc; - int keys2_count_loc = keys2_end_loc - keys2_beg_loc; - serial_merge(&storage.needles_shared[0], - keys1_beg_loc, - keys2_beg_loc, - keys1_count_loc, - keys2_count_loc, - needles_loc, - indices, - compare_op); - - - sync_threadblock(); - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM; - storage.indices_shared[idx] = indices_loc[ITEM]; - } - - sync_threadblock(); - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - indices_loc[ITEM] = storage.indices_shared[indices[ITEM]]; - } - } - } // func block_merge_sort - - template - THRUST_DEVICE_FUNCTION void - consume_tile(int tid, - Size tile_idx, - Size tile_base, - int num_remaining) - { - using core::sync_threadblock; - - needle_type needles_loc[ITEMS_PER_THREAD]; - BlockLoadNeedles(storage.load_needles) - .Load(needles_load_it + tile_base, needles_loc, num_remaining); - -#ifdef BS_SIMPLE - - result_type results_loc[ITEMS_PER_THREAD]; - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - results_loc[ITEM] = search_op(haystack_load_it, - haystack_load_it + haystack_size, - needles_loc[ITEM], - compare_op); - } - - -#else - - if (IS_LAST_TILE) - { - needle_type max_value = needles_loc[0]; -#pragma unroll - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (ITEMS_PER_THREAD * tid + ITEM < num_remaining) - { - max_value = compare_op(max_value, needles_loc[ITEM]) - ? 
needles_loc[ITEM] - : max_value; - } - else - { - needles_loc[ITEM] = max_value; - } - } - } - - sync_threadblock(); - - int indices_loc[ITEMS_PER_THREAD]; - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD*threadIdx.x + ITEM; - indices_loc[ITEM] = idx; - } - - if (IS_LAST_TILE) - { - block_mergesort(tid, - num_remaining, - needles_loc, - indices_loc); - } - else - { - block_mergesort(tid, - ITEMS_PER_TILE, - needles_loc, - indices_loc); - } - - sync_threadblock(); - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = indices_loc[ITEM]; - storage.result_shared[idx] = - search_op(haystack_load_it, - haystack_load_it + haystack_size, - needles_loc[ITEM], - compare_op); - } - - sync_threadblock(); - - result_type results_loc[ITEMS_PER_THREAD]; -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD*threadIdx.x + ITEM; - results_loc[ITEM] = storage.result_shared[idx]; - } - - sync_threadblock(); -#endif - - BlockStoreResult(storage.store_result) - .Store(result + tile_base, results_loc, num_remaining); - } - - THRUST_DEVICE_FUNCTION - impl(TempStorage& storage_, - NeedlesIt needles_it_, - HaystackIt haystack_it_, - Size needles_count_, - Size haystack_size_, - OutputIt result_, - CompareOp compare_op_, - SearchOp search_op_) - : storage(storage_), - needles_load_it(core::make_load_iterator(ptx_plan(), needles_it_)), - haystack_load_it(core::make_load_iterator(ptx_plan(), haystack_it_)), - needles_count(needles_count_), - haystack_size(haystack_size_), - result(result_), - compare_op(compare_op_), - search_op(search_op_) - { - int tid = threadIdx.x; - Size tile_idx = blockIdx.x; - Size num_tiles = gridDim.x; - Size tile_base = tile_idx * ITEMS_PER_TILE; - int items_in_tile = min(needles_count - tile_base, ITEMS_PER_TILE); - if (tile_idx < num_tiles - 1) - { - consume_tile(tid, tile_idx, tile_base, ITEMS_PER_TILE); - } - else - { - consume_tile(tid, tile_idx, tile_base, items_in_tile); - } - } - }; // struct impl - - - THRUST_AGENT_ENTRY(NeedlesIt needles_it, - HaystackIt haystack_it, - Size needles_count, - Size haystack_size, - OutputIt result, - CompareOp compare_op, - SearchOp search_op, - char* shmem) - { - TempStorage& storage = *reinterpret_cast(shmem); - - impl(storage, - needles_it, - haystack_it, - needles_count, - haystack_size, - result, - compare_op, - search_op); - } - }; // struct VectorizedBinarySearchAgent - - template - cudaError_t THRUST_RUNTIME_FUNCTION - doit_pass(void* d_temp_storage, - size_t& temp_storage_size, - NeedlesIt needles_it, - HaystackIt haystack_it, - Size needles_count, - Size haystack_size, - OutputIt result, - CompareOp compare_op, - SearchOp search_op, - cudaStream_t stream, - bool debug_sync) - { - if (needles_count == 0) - return cudaErrorNotSupported; - - cudaError_t status = cudaSuccess; - - using core::AgentPlan; - using core::AgentLauncher; - - - typedef AgentLauncher< - VectorizedBinarySearchAgent > - search_agent; - - AgentPlan search_plan = search_agent::get_plan(stream); - - temp_storage_size = 1; - if (d_temp_storage == NULL) - { - return status; - } - - search_agent sa(search_plan, needles_count, stream, "binary_search::search_agent", debug_sync); - sa.launch(needles_it, - haystack_it, - needles_count, - haystack_size, - result, - compare_op, - search_op); - - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - - return status; - } - - template - OutputIt THRUST_RUNTIME_FUNCTION - doit(execution_policy& policy, - HaystackIt 
haystack_begin, - HaystackIt haystack_end, - NeedlesIt needles_begin, - NeedlesIt needles_end, - OutputIt result, - CompareOp compare_op, - SearchOp search_op) - { - typedef typename iterator_traits::difference_type size_type; - - size_type needles_count = thrust::distance(needles_begin, needles_end); - size_type haystack_size = thrust::distance(haystack_begin, haystack_end); - - if (needles_count == 0) - return result; - - size_t storage_size = 0; - cudaStream_t stream = cuda_cub::stream(policy); - bool debug_sync = THRUST_DEBUG_SYNC_FLAG; - - cudaError status; - status = doit_pass(NULL, - storage_size, - needles_begin, - haystack_begin, - needles_count, - haystack_size, - result, - compare_op, - search_op, - stream, - debug_sync); - cuda_cub::throw_on_error(status, "binary_search: failed on 1st call"); - - // Allocate temporary storage. - thrust::detail::temporary_array - tmp(policy, storage_size); - void *ptr = static_cast(tmp.data().get()); - - status = doit_pass(ptr, - storage_size, - needles_begin, - haystack_begin, - needles_count, - haystack_size, - result, - compare_op, - search_op, - stream, - debug_sync); - cuda_cub::throw_on_error(status, "binary_search: failed on 2nt call"); - - status = cuda_cub::synchronize(policy); - cuda_cub::throw_on_error(status, "binary_search: failed to synchronize"); - - return result + needles_count; - } - - struct less - { - template - THRUST_DEVICE_FUNCTION bool - operator()(const T1& lhs, const T2& rhs) const - { - return lhs < rhs; - } - }; -} // namespace __binary_search - -//------------------------- -// Thrust API entry points -//------------------------- - -__thrust_exec_check_disable__ -template -OutputIt __host__ __device__ -lower_bound(execution_policy& policy, - HaystackIt first, - HaystackIt last, - NeedlesIt values_first, - NeedlesIt values_last, - OutputIt result, - CompareOp compare_op) -{ - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = __binary_search::doit(policy, - first, - last, - values_first, - values_last, - result, - compare_op, - __binary_search::lbf()); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::lower_bound(cvt_to_seq(derived_cast(policy)), - first, - last, - values_first, - values_last, - result); -#endif - } - return ret; -} - - -template -OutputIt __host__ __device__ -lower_bound(execution_policy& policy, - HaystackIt first, - HaystackIt last, - NeedlesIt values_first, - NeedlesIt values_last, - OutputIt result) -{ - return cuda_cub::lower_bound(policy, - first, - last, - values_first, - values_last, - result, - __binary_search::less()); -} - -} // namespace cuda_cub -THRUST_NAMESPACE_END -#endif +#pragma once -#endif +// this system has no special version of this algorithm diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/core/agent_launcher.h libthrust-1.16.0/thrust/system/cuda/detail/core/agent_launcher.h --- libthrust-1.15.0/thrust/system/cuda/detail/core/agent_launcher.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/core/agent_launcher.h 2022-02-09 20:04:17.000000000 +0000 @@ -50,7 +50,7 @@ namespace core { -#if defined(__CUDA_ARCH__) || defined(__NVCOMPILER_CUDA__) +#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) #if 0 template void __global__ diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/core/triple_chevron_launch.h libthrust-1.16.0/thrust/system/cuda/detail/core/triple_chevron_launch.h --- libthrust-1.15.0/thrust/system/cuda/detail/core/triple_chevron_launch.h 2021-10-29 18:01:06.000000000 +0000 +++ 
libthrust-1.16.0/thrust/system/cuda/detail/core/triple_chevron_launch.h 2022-02-09 20:04:17.000000000 +0000 @@ -834,7 +834,7 @@ } -#if defined(__NVCOMPILER_CUDA__) +#if defined(_NVHPC_CUDA) # define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(...) \ (__builtin_is_device_code() ? \ doit_device(__VA_ARGS__) : doit_host(__VA_ARGS__)) diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/core/util.h libthrust-1.16.0/thrust/system/cuda/detail/core/util.h --- libthrust-1.15.0/thrust/system/cuda/detail/core/util.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/core/util.h 2022-02-09 20:04:17.000000000 +0000 @@ -41,7 +41,7 @@ namespace cuda_cub { namespace core { -#ifdef __NVCOMPILER_CUDA__ +#ifdef _NVHPC_CUDA # if (__NVCOMPILER_CUDA_ARCH__ >= 600) # define THRUST_TUNING_ARCH sm60 # elif (__NVCOMPILER_CUDA_ARCH__ >= 520) @@ -358,7 +358,7 @@ // get_agent_plan_impl::get(version), is for host code and for device // code without device-side kernel launches. NVCC and Feta check for // these situations differently. - #ifdef __NVCOMPILER_CUDA__ + #ifdef _NVHPC_CUDA #ifdef __THRUST_HAS_CUDART__ if (CUB_IS_DEVICE_CODE) { return typename get_plan::type(typename Agent::ptx_plan()); diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/extrema.h libthrust-1.16.0/thrust/system/cuda/detail/extrema.h --- libthrust-1.15.0/thrust/system/cuda/detail/extrema.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/extrema.h 2022-02-09 20:04:17.000000000 +0000 @@ -130,8 +130,11 @@ pair_type const &lhs_min = get<0>(lhs); pair_type const &rhs_max = get<1>(rhs); pair_type const &lhs_max = get<1>(lhs); - return thrust::make_tuple(arg_min_t(predicate)(lhs_min, rhs_min), - arg_max_t(predicate)(lhs_max, rhs_max)); + + auto result = thrust::make_tuple(arg_min_t(predicate)(lhs_min, rhs_min), + arg_max_t(predicate)(lhs_max, rhs_max)); + + return result; } struct duplicate_tuple @@ -265,7 +268,7 @@ // if not enough to fill the device with threadblocks // then fill the device with threadblocks - reduce_grid_size = static_cast(min(num_tiles, static_cast(reduce_device_occupancy))); + reduce_grid_size = static_cast((min)(num_tiles, static_cast(reduce_device_occupancy))); typedef AgentLauncher<__reduce::DrainAgent > drain_agent; AgentPlan drain_plan = drain_agent::get_plan(); diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/fill.h libthrust-1.16.0/thrust/system/cuda/detail/fill.h --- libthrust-1.15.0/thrust/system/cuda/detail/fill.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/fill.h 2022-02-09 20:04:17.000000000 +0000 @@ -72,7 +72,7 @@ count); cuda_cub::throw_on_error( - cuda_cub::synchronize(policy) + cuda_cub::synchronize_optional(policy) , "fill_n: failed to synchronize" ); diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/for_each.h libthrust-1.16.0/thrust/system/cuda/detail/for_each.h --- libthrust-1.15.0/thrust/system/cuda/detail/for_each.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/for_each.h 2022-02-09 20:04:17.000000000 +0000 @@ -82,7 +82,7 @@ count); cuda_cub::throw_on_error( - cuda_cub::synchronize(policy) + cuda_cub::synchronize_optional(policy) , "for_each: failed to synchronize" ); diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/get_value.h libthrust-1.16.0/thrust/system/cuda/detail/get_value.h --- libthrust-1.15.0/thrust/system/cuda/detail/get_value.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/get_value.h 2022-02-09 20:04:17.000000000 
+0000 @@ -66,7 +66,7 @@ // because it would result in a compiler warning, either about falling off // the end of a non-void function, or about result_type's default constructor // being a host-only function. - #ifdef __NVCOMPILER_CUDA__ + #ifdef _NVHPC_CUDA if (THRUST_IS_HOST_CODE) { return war_nvbugs_881631::host_path(exec, ptr); } else { diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/merge.h libthrust-1.16.0/thrust/system/cuda/detail/merge.h --- libthrust-1.15.0/thrust/system/cuda/detail/merge.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/merge.h 2022-02-09 20:04:17.000000000 +0000 @@ -170,7 +170,7 @@ Size partition_idx = blockDim.x * blockIdx.x + threadIdx.x; if (partition_idx < num_partitions) { - Size partition_at = thrust::min(partition_idx * items_per_tile, + Size partition_at = (thrust::min)(partition_idx * items_per_tile, keys1_count + keys2_count); Size partition_diag = merge_path(keys1, keys2, @@ -463,7 +463,7 @@ Size partition_end = merge_partitions[tile_idx + 1]; Size diag0 = ITEMS_PER_TILE * tile_idx; - Size diag1 = thrust::min(keys1_count + keys2_count, diag0 + ITEMS_PER_TILE); + Size diag1 = (thrust::min)(keys1_count + keys2_count, diag0 + ITEMS_PER_TILE); // compute bounding box for keys1 & keys2 // @@ -847,7 +847,7 @@ debug_sync); cuda_cub::throw_on_error(status, "merge: failed on 2nd step"); - status = cuda_cub::synchronize(policy); + status = cuda_cub::synchronize_optional(policy); cuda_cub::throw_on_error(status, "merge: failed to synchronize"); return thrust::make_pair(keys_result + count, items_result + count); diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/par.h libthrust-1.16.0/thrust/system/cuda/detail/par.h --- libthrust-1.15.0/thrust/system/cuda/detail/par.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/par.h 2022-02-09 20:04:17.000000000 +0000 @@ -50,7 +50,7 @@ public: __host__ __device__ execute_on_stream_base(cudaStream_t stream_ = default_stream()) - : stream(stream_) {} + : stream(stream_){} THRUST_RUNTIME_FUNCTION Derived @@ -70,6 +70,42 @@ } }; +template +struct execute_on_stream_nosync_base : execution_policy +{ +private: + cudaStream_t stream; + +public: + __host__ __device__ + execute_on_stream_nosync_base(cudaStream_t stream_ = default_stream()) + : stream(stream_){} + + THRUST_RUNTIME_FUNCTION + Derived + on(cudaStream_t const &s) const + { + Derived result = derived_cast(*this); + result.stream = s; + return result; + } + +private: + friend __host__ __device__ + cudaStream_t + get_stream(const execute_on_stream_nosync_base &exec) + { + return exec.stream; + } + + friend __host__ __device__ + bool + must_perform_optional_stream_synchronization(const execute_on_stream_nosync_base &) + { + return false; + } +}; + struct execute_on_stream : execute_on_stream_base { typedef execute_on_stream_base base_t; @@ -77,7 +113,19 @@ __host__ __device__ execute_on_stream() : base_t(){}; __host__ __device__ - execute_on_stream(cudaStream_t stream) : base_t(stream){}; + execute_on_stream(cudaStream_t stream) + : base_t(stream){}; +}; + +struct execute_on_stream_nosync : execute_on_stream_nosync_base +{ + typedef execute_on_stream_nosync_base base_t; + + __host__ __device__ + execute_on_stream_nosync() : base_t(){}; + __host__ __device__ + execute_on_stream_nosync(cudaStream_t stream) + : base_t(stream){}; }; @@ -104,20 +152,105 @@ } }; +struct par_nosync_t : execution_policy, + thrust::detail::allocator_aware_execution_policy< + execute_on_stream_nosync_base> +#if 
THRUST_CPP_DIALECT >= 2011 +, thrust::detail::dependencies_aware_execution_policy< + execute_on_stream_nosync_base> +#endif +{ + typedef execution_policy base_t; + + __host__ __device__ + constexpr par_nosync_t() : base_t() {} + + typedef execute_on_stream_nosync stream_attachment_type; + + THRUST_RUNTIME_FUNCTION + stream_attachment_type + on(cudaStream_t const &stream) const + { + return execute_on_stream_nosync(stream); + } + +private: + //this function is defined to allow non-blocking calls on the default_stream() with thrust::cuda::par_nosync + //without explicitly using thrust::cuda::par_nosync.on(default_stream()) + friend __host__ __device__ + bool + must_perform_optional_stream_synchronization(const par_nosync_t &) + { + return false; + } +}; + THRUST_INLINE_CONSTANT par_t par; + +/*! \p thrust::cuda::par_nosync is a parallel execution policy targeting Thrust's CUDA device backend. + * Similar to \p thrust::cuda::par it allows execution of Thrust algorithms in a specific CUDA stream. + * + * \p thrust::cuda::par_nosync indicates that an algorithm is free to avoid any synchronization of the + * associated stream that is not strictly required for correctness. Additionally, algorithms may return + * before the corresponding kernels are completed, similar to asynchronous kernel launches via <<< >>> syntax. + * The user must take care to perform explicit synchronization if necessary. + * + * The following code snippet demonstrates how to use \p thrust::cuda::par_nosync : + * + * \code + * #include + * #include + * #include + * + * struct IncFunctor{ + * __host__ __device__ + * void operator()(std::size_t& x){ x = x + 1; }; + * }; + * + * int main(){ + * std::size_t N = 1000000; + * thrust::device_vector d_vec(N); + * + * cudaStream_t stream; + * cudaStreamCreate(&stream); + * auto nosync_policy = thrust::cuda::par_nosync.on(stream); + * + * thrust::for_each(nosync_policy, d_vec.begin(), d_vec.end(), IncFunctor{}); + * thrust::for_each(nosync_policy, d_vec.begin(), d_vec.end(), IncFunctor{}); + * thrust::for_each(nosync_policy, d_vec.begin(), d_vec.end(), IncFunctor{}); + * + * //for_each may return before completion. Could do other cpu work in the meantime + * // ... + * + * //Wait for the completion of all for_each kernels + * cudaStreamSynchronize(stream); + * + * std::size_t x = thrust::reduce(nosync_policy, d_vec.begin(), d_vec.end()); + * //Currently, this synchronization is not necessary. reduce will still perform + * //implicit synchronization to transfer the reduced value to the host to return it. 
+ * cudaStreamSynchronize(stream); + * cudaStreamDestroy(stream); + * } + * \endcode + * + */ +THRUST_INLINE_CONSTANT par_nosync_t par_nosync; } // namespace cuda_ namespace system { namespace cuda { using thrust::cuda_cub::par; + using thrust::cuda_cub::par_nosync; namespace detail { using thrust::cuda_cub::par_t; + using thrust::cuda_cub::par_nosync_t; } } // namesapce cuda } // namespace system namespace cuda { using thrust::cuda_cub::par; +using thrust::cuda_cub::par_nosync; } // namespace cuda THRUST_NAMESPACE_END diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/reduce.h libthrust-1.16.0/thrust/system/cuda/detail/reduce.h --- libthrust-1.15.0/thrust/system/cuda/detail/reduce.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/reduce.h 2022-02-09 20:04:17.000000000 +0000 @@ -808,7 +808,7 @@ // if not enough to fill the device with threadblocks // then fill the device with threadblocks - reduce_grid_size = static_cast(min(num_tiles, static_cast(reduce_device_occupancy))); + reduce_grid_size = static_cast((min)(num_tiles, static_cast(reduce_device_occupancy))); typedef AgentLauncher > drain_agent; AgentPlan drain_plan = drain_agent::get_plan(); @@ -984,8 +984,8 @@ // Synchronize the stream and get the value. - cuda_cub::throw_on_error(cuda_cub::synchronize(policy), - "reduce failed to synchronize"); + status = cuda_cub::synchronize(policy); + cuda_cub::throw_on_error(status, "reduce failed to synchronize"); // `tmp.begin()` yields a `normal_iterator`, which dereferences to a // `reference`, which has an `operator&` that returns a `pointer`, which diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/scan_by_key.h libthrust-1.16.0/thrust/system/cuda/detail/scan_by_key.h --- libthrust-1.15.0/thrust/system/cuda/detail/scan_by_key.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/scan_by_key.h 2022-02-09 20:04:17.000000000 +0000 @@ -775,7 +775,7 @@ debug_sync); cuda_cub::throw_on_error(status, "scan_by_key: failed on 2nd step"); - status = cuda_cub::synchronize(policy); + status = cuda_cub::synchronize_optional(policy); cuda_cub::throw_on_error(status, "scan_by_key: failed to synchronize"); return values_result + num_items; diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/scan.h libthrust-1.16.0/thrust/system/cuda/detail/scan.h --- libthrust-1.15.0/thrust/system/cuda/detail/scan.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/scan.h 2022-02-09 20:04:17.000000000 +0000 @@ -115,7 +115,7 @@ THRUST_DEBUG_SYNC_FLAG)); thrust::cuda_cub::throw_on_error(status, "after dispatching inclusive_scan kernel"); - thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize(policy), + thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize_optional(policy), "inclusive_scan failed to synchronize"); } @@ -194,7 +194,7 @@ THRUST_DEBUG_SYNC_FLAG)); thrust::cuda_cub::throw_on_error(status, "after dispatching exclusive_scan kernel"); - thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize(policy), + thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize_optional(policy), "exclusive_scan failed to synchronize"); } diff -Nru libthrust-1.15.0/thrust/system/cuda/detail/sort.h libthrust-1.16.0/thrust/system/cuda/detail/sort.h --- libthrust-1.15.0/thrust/system/cuda/detail/sort.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/detail/sort.h 2022-02-09 20:04:17.000000000 +0000 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ 
-48,1225 +49,112 @@ #include #include + THRUST_NAMESPACE_BEGIN namespace cuda_cub { namespace __merge_sort { - template - THRUST_DEVICE_FUNCTION Size - merge_path(KeysIt1 keys1, - KeysIt2 keys2, - Size keys1_count, - Size keys2_count, - Size diag, - BinaryPred binary_pred) - { - typedef typename iterator_traits::value_type key1_type; - typedef typename iterator_traits::value_type key2_type; - - Size keys1_begin = thrust::max(0, diag - keys2_count); - Size keys1_end = thrust::min(diag, keys1_count); - - while (keys1_begin < keys1_end) - { - Size mid = (keys1_begin + keys1_end) >> 1; - key1_type key1 = keys1[mid]; - key2_type key2 = keys2[diag - 1 - mid]; - bool pred = binary_pred(key2, key1); - if (pred) - { - keys1_end = mid; - } - else - { - keys1_begin = mid + 1; - } - } - return keys1_begin; - } - - template - THRUST_DEVICE_FUNCTION void - serial_merge(It keys_shared, - int keys1_beg, - int keys2_beg, - int keys1_count, - int keys2_count, - T2 (&output)[ITEMS_PER_THREAD], - int (&indices)[ITEMS_PER_THREAD], - CompareOp compare_op) - { - int keys1_end = keys1_beg + keys1_count; - int keys2_end = keys2_beg + keys2_count; - - typedef typename iterator_value::type key_type; - - key_type key1 = keys_shared[keys1_beg]; - key_type key2 = keys_shared[keys2_beg]; - - -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - bool p = (keys2_beg < keys2_end) && - ((keys1_beg >= keys1_end) || - compare_op(key2,key1)); - - output[ITEM] = p ? key2 : key1; - indices[ITEM] = p ? keys2_beg++ : keys1_beg++; - - if (p) - { - key2 = keys_shared[keys2_beg]; - } - else - { - key1 = keys_shared[keys1_beg]; - } - } - } - - template - struct PtxPolicy - { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD, - }; - - static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; - }; // PtxPolicy - - - template - struct Tuning; - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 11, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<256, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_STORE_WARP_TRANSPOSE> - type; - }; - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 15, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<512, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_STORE_WARP_TRANSPOSE> - type; - }; - - template - struct Tuning - { - const static int INPUT_SIZE = sizeof(T); - - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 17, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<256, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_DEFAULT, - cub::BLOCK_STORE_WARP_TRANSPOSE> - type; - }; - - template - struct Tuning - { - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - 
cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_DEFAULT, - cub::BLOCK_STORE_WARP_TRANSPOSE> - type; - }; - template - struct BlockSortAgent - { - typedef typename iterator_traits::value_type key_type; - typedef typename iterator_traits::value_type item_type; - - template - struct PtxPlan : Tuning::type - { - typedef Tuning tuning; - - typedef typename core::LoadIterator::type KeysLoadIt; - typedef typename core::LoadIterator::type ItemsLoadIt; - - typedef typename core::BlockLoad::type BlockLoadKeys; - typedef typename core::BlockLoad::type BlockLoadItems; - - typedef typename core::BlockStore::type BlockStoreKeysIt; - typedef typename core::BlockStore::type BlockStoreItemsIt; - typedef typename core::BlockStore::type BlockStoreKeysRaw; - typedef typename core::BlockStore::type BlockStoreItemsRaw; - - union TempStorage - { - typename BlockLoadKeys::TempStorage load_keys; - typename BlockLoadItems::TempStorage load_items; - typename BlockStoreKeysIt::TempStorage store_keys_it; - typename BlockStoreItemsIt::TempStorage store_items_it; - typename BlockStoreKeysRaw::TempStorage store_keys_raw; - typename BlockStoreItemsRaw::TempStorage store_items_raw; - - core::uninitialized_array keys_shared; - core::uninitialized_array items_shared; - }; // union TempStorage - }; // struct PtxPlan - - typedef typename core::specialize_plan_msvc10_war::type::type ptx_plan; - - typedef typename ptx_plan::KeysLoadIt KeysLoadIt; - typedef typename ptx_plan::ItemsLoadIt ItemsLoadIt; - typedef typename ptx_plan::BlockLoadKeys BlockLoadKeys; - typedef typename ptx_plan::BlockLoadItems BlockLoadItems; - typedef typename ptx_plan::BlockStoreKeysIt BlockStoreKeysIt; - typedef typename ptx_plan::BlockStoreItemsIt BlockStoreItemsIt; - typedef typename ptx_plan::BlockStoreKeysRaw BlockStoreKeysRaw; - typedef typename ptx_plan::BlockStoreItemsRaw BlockStoreItemsRaw; - typedef typename ptx_plan::TempStorage TempStorage; - - enum - { - ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD, - BLOCK_THREADS = ptx_plan::BLOCK_THREADS, - ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE - }; - - struct impl - { - //--------------------------------------------------------------------- - // Per thread data - //--------------------------------------------------------------------- - - bool ping; - TempStorage& storage; - KeysLoadIt keys_in; - ItemsLoadIt items_in; - Size keys_count; - KeysIt keys_out_it; - ItemsIt items_out_it; - key_type* keys_out_raw; - item_type* items_out_raw; - CompareOp compare_op; - - //--------------------------------------------------------------------- - // Serial stable sort network - //--------------------------------------------------------------------- - - THRUST_DEVICE_FUNCTION - void stable_odd_even_sort(key_type (&keys)[ITEMS_PER_THREAD], - item_type (&items)[ITEMS_PER_THREAD]) - { -#pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { -#pragma unroll - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) - { - if (compare_op(keys[j + 1], keys[j])) - { - using thrust::swap; - swap(keys[j], keys[j + 1]); - if (SORT_ITEMS::value) - { - swap(items[j], items[j + 1]); - } - } - } // inner loop - } // outer loop - } - - //--------------------------------------------------------------------- - // Parallel thread block merge sort - //--------------------------------------------------------------------- - - template - THRUST_DEVICE_FUNCTION void - block_mergesort(int tid, - int count, - key_type (&keys_loc)[ITEMS_PER_THREAD], - item_type (&items_loc)[ITEMS_PER_THREAD]) - { - using core::uninitialized_array; - using 
core::sync_threadblock; - - // if first element of thread is in input range, stable sort items - // - if (!IS_LAST_TILE || ITEMS_PER_THREAD * tid < count) - { - stable_odd_even_sort(keys_loc, items_loc); - } - - // each thread has sorted keys_loc - // merge sort keys_loc in shared memory - // -#pragma unroll - for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2) - { - sync_threadblock(); - - // store keys in shmem - // -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM; - storage.keys_shared[idx] = keys_loc[ITEM]; - } - - sync_threadblock(); - - int indices[ITEMS_PER_THREAD]; - - int list = ~(coop - 1) & tid; - int start = ITEMS_PER_THREAD * list; - int size = ITEMS_PER_THREAD * (coop >> 1); - - int diag = min(count, - ITEMS_PER_THREAD * ((coop - 1) & tid)); - - int keys1_beg = min(count, start); - int keys1_end = min(count, keys1_beg + size); - int keys2_beg = keys1_end; - int keys2_end = min(count, keys2_beg + size); - - int keys1_count = keys1_end - keys1_beg; - int keys2_count = keys2_end - keys2_beg; - - int partition_diag = merge_path(&storage.keys_shared[keys1_beg], - &storage.keys_shared[keys2_beg], - keys1_count, - keys2_count, - diag, - compare_op); - - int keys1_beg_loc = keys1_beg + partition_diag; - int keys1_end_loc = keys1_end; - int keys2_beg_loc = keys2_beg + diag - partition_diag; - int keys2_end_loc = keys2_end; - int keys1_count_loc = keys1_end_loc - keys1_beg_loc; - int keys2_count_loc = keys2_end_loc - keys2_beg_loc; - serial_merge(&storage.keys_shared[0], - keys1_beg_loc, - keys2_beg_loc, - keys1_count_loc, - keys2_count_loc, - keys_loc, - indices, - compare_op); - - - if (SORT_ITEMS::value) - { - sync_threadblock(); - - // store keys in shmem - // -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM; - storage.items_shared[idx] = items_loc[ITEM]; - } - - sync_threadblock(); - - // gather items from shmem - // -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - items_loc[ITEM] = storage.items_shared[indices[ITEM]]; - } - } - } - } // func block_merge_sort - - //--------------------------------------------------------------------- - // Tile processing - //--------------------------------------------------------------------- - - template - THRUST_DEVICE_FUNCTION void - consume_tile(int tid, - Size /*tile_idx*/, - Size tile_base, - int num_remaining) - { - using core::uninitialized_array; - using core::sync_threadblock; - - item_type items_loc[ITEMS_PER_THREAD]; - if (SORT_ITEMS::value) - { - BlockLoadItems(storage.load_items) - .Load(items_in + tile_base, - items_loc, - num_remaining, - *(items_in + tile_base)); - - sync_threadblock(); - } - - key_type keys_loc[ITEMS_PER_THREAD]; - if (IS_LAST_TILE) - { - BlockLoadKeys(storage.load_keys) - .Load(keys_in + tile_base, - keys_loc, - num_remaining, - *(keys_in + tile_base)); - } - else - { - BlockLoadKeys(storage.load_keys) - .Load(keys_in + tile_base, keys_loc); - } - - if (IS_LAST_TILE) - { - // if last tile, find valid max_key - // and fill the remainig keys with it - // - key_type max_key = keys_loc[0]; -#pragma unroll - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (ITEMS_PER_THREAD * tid + ITEM < num_remaining) - { - max_key = compare_op(max_key, keys_loc[ITEM]) - ? 
keys_loc[ITEM] - : max_key; - } - else - { - keys_loc[ITEM] = max_key; - } - } - } - - sync_threadblock(); - - if (IS_LAST_TILE) - { - block_mergesort(tid, - num_remaining, - keys_loc, - items_loc); - } - else - { - block_mergesort(tid, - ITEMS_PER_TILE, - keys_loc, - items_loc); - } - - sync_threadblock(); - - if (ping) - { - if (IS_LAST_TILE) - { - BlockStoreKeysIt(storage.store_keys_it) - .Store(keys_out_it + tile_base, keys_loc, num_remaining); - } - else - { - BlockStoreKeysIt(storage.store_keys_it) - .Store(keys_out_it + tile_base, keys_loc); - } - - if (SORT_ITEMS::value) - { - sync_threadblock(); - - BlockStoreItemsIt(storage.store_items_it) - .Store(items_out_it + tile_base, items_loc, num_remaining); - } - } - else - { - if (IS_LAST_TILE) - { - BlockStoreKeysRaw(storage.store_keys_raw) - .Store(keys_out_raw + tile_base, keys_loc, num_remaining); - } - else - { - BlockStoreKeysRaw(storage.store_keys_raw) - .Store(keys_out_raw + tile_base, keys_loc); - } - - if (SORT_ITEMS::value) - { - sync_threadblock(); - - BlockStoreItemsRaw(storage.store_items_raw) - .Store(items_out_raw + tile_base, items_loc, num_remaining); - } - } - } - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - THRUST_DEVICE_FUNCTION - impl(bool ping_, - TempStorage& storage_, - KeysLoadIt keys_in_, - ItemsLoadIt items_in_, - Size keys_count_, - KeysIt keys_out_it_, - ItemsIt items_out_it_, - key_type* keys_out_raw_, - item_type* items_out_raw_, - CompareOp compare_op_) - : ping(ping_), - storage(storage_), - keys_in(keys_in_), - items_in(items_in_), - keys_count(keys_count_), - keys_out_it(keys_out_it_), - items_out_it(items_out_it_), - keys_out_raw(keys_out_raw_), - items_out_raw(items_out_raw_), - compare_op(compare_op_) - { - int tid = threadIdx.x; - Size tile_idx = blockIdx.x; - Size num_tiles = gridDim.x; - Size tile_base = tile_idx * ITEMS_PER_TILE; - int items_in_tile = min(keys_count - tile_base, ITEMS_PER_TILE); - if (tile_idx < num_tiles - 1) - { - consume_tile(tid, tile_idx, tile_base, ITEMS_PER_TILE); - } - else - { - consume_tile(tid, tile_idx, tile_base, items_in_tile); - } - } - }; // struct impl - - //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- - - THRUST_AGENT_ENTRY(bool ping, - KeysIt keys_inout, - ItemsIt items_inout, - Size keys_count, - key_type* keys_out, - item_type* items_out, - CompareOp compare_op, - char* shmem) - { - TempStorage& storage = *reinterpret_cast(shmem); - - impl(ping, - storage, - core::make_load_iterator(ptx_plan(), keys_inout), - core::make_load_iterator(ptx_plan(), items_inout), - keys_count, - keys_inout, - items_inout, - keys_out, - items_out, - compare_op); - } - }; // struct BlockSortAgent - - template - struct PartitionAgent + THRUST_RUNTIME_FUNCTION cudaError_t + doit_step(void* d_temp_storage, + size_t& temp_storage_bytes, + KeysIt keys, + ItemsIt , + Size keys_count, + CompareOp compare_op, + cudaStream_t stream, + bool debug_sync, + thrust::detail::integral_constant /* sort_keys */) { - typedef typename iterator_traits::value_type key_type; - template - struct PtxPlan : PtxPolicy<256> {}; - - typedef core::specialize_plan ptx_plan; - - //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- - - 
THRUST_AGENT_ENTRY(bool ping, - KeysIt keys_ping, - key_type* keys_pong, - Size keys_count, - Size num_partitions, - Size* merge_partitions, - CompareOp compare_op, - Size coop, - int items_per_tile, - char* /*shmem*/) - { - Size partition_idx = blockDim.x * blockIdx.x + threadIdx.x; - if (partition_idx < num_partitions) - { - Size list = ~(coop - 1) & partition_idx; - Size start = items_per_tile * list; - Size size = items_per_tile * (coop >> 1); - - Size keys1_beg = min(keys_count, start); - Size keys1_end = min(keys_count, start + size); - Size keys2_beg = keys1_end; - Size keys2_end = min(keys_count, keys2_beg + size); - - - Size partition_at = min(keys2_end - keys1_beg, - items_per_tile * ((coop - 1) & partition_idx)); - - Size partition_diag = ping ? merge_path(keys_ping + keys1_beg, - keys_ping + keys2_beg, - keys1_end - keys1_beg, - keys2_end - keys2_beg, - partition_at, - compare_op) - : merge_path(keys_pong + keys1_beg, - keys_pong + keys2_beg, - keys1_end - keys1_beg, - keys2_end - keys2_beg, - partition_at, - compare_op); - + using ItemsInputIt = cub::NullType *; + ItemsInputIt items = nullptr; - merge_partitions[partition_idx] = keys1_beg + partition_diag; - } - } - }; // struct PartitionAgent + using DispatchMergeSortT = cub::DispatchMergeSort; + + + return DispatchMergeSortT::Dispatch(d_temp_storage, + temp_storage_bytes, + keys, + items, + keys, + items, + keys_count, + compare_op, + stream, + debug_sync); + } template - struct MergeAgent + class CompareOp> + THRUST_RUNTIME_FUNCTION cudaError_t + doit_step(void *d_temp_storage, + size_t &temp_storage_bytes, + KeysIt keys, + ItemsIt items, + Size keys_count, + CompareOp compare_op, + cudaStream_t stream, + bool debug_sync, + thrust::detail::integral_constant /* sort_items */) { - typedef typename iterator_traits::value_type key_type; - typedef typename iterator_traits::value_type item_type; - - typedef KeysIt KeysOutputPongIt; - typedef ItemsIt ItemsOutputPongIt; - typedef key_type* KeysOutputPingIt; - typedef item_type* ItemsOutputPingIt; - - template - struct PtxPlan : Tuning::type - { - typedef Tuning tuning; - - typedef typename core::LoadIterator::type KeysLoadPingIt; - typedef typename core::LoadIterator::type ItemsLoadPingIt; - typedef typename core::LoadIterator::type KeysLoadPongIt; - typedef typename core::LoadIterator::type ItemsLoadPongIt; - - typedef typename core::BlockLoad::type BlockLoadKeysPing; - typedef typename core::BlockLoad::type BlockLoadItemsPing; - typedef typename core::BlockLoad::type BlockLoadKeysPong; - typedef typename core::BlockLoad::type BlockLoadItemsPong; - - typedef typename core::BlockStore::type BlockStoreKeysPong; - typedef typename core::BlockStore::type BlockStoreItemsPong; - typedef typename core::BlockStore::type BlockStoreKeysPing; - typedef typename core::BlockStore::type BlockStoreItemsPing; - - // gather required temporary storage in a union - // - union TempStorage - { - typename BlockLoadKeysPing::TempStorage load_keys_ping; - typename BlockLoadItemsPing::TempStorage load_items_ping; - typename BlockLoadKeysPong::TempStorage load_keys_pong; - typename BlockLoadItemsPong::TempStorage load_items_pong; - - typename BlockStoreKeysPing::TempStorage store_keys_ping; - typename BlockStoreItemsPing::TempStorage store_items_ping; - typename BlockStoreKeysPong::TempStorage store_keys_pong; - typename BlockStoreItemsPong::TempStorage store_items_pong; - - core::uninitialized_array keys_shared; - core::uninitialized_array items_shared; - }; // union TempStorage - }; // struct PtxPlan - - 
typedef typename core::specialize_plan_msvc10_war::type::type ptx_plan; - - typedef typename ptx_plan::KeysLoadPingIt KeysLoadPingIt; - typedef typename ptx_plan::ItemsLoadPingIt ItemsLoadPingIt; - typedef typename ptx_plan::KeysLoadPongIt KeysLoadPongIt; - typedef typename ptx_plan::ItemsLoadPongIt ItemsLoadPongIt; - - typedef typename ptx_plan::BlockLoadKeysPing BlockLoadKeysPing; - typedef typename ptx_plan::BlockLoadItemsPing BlockLoadItemsPing; - typedef typename ptx_plan::BlockLoadKeysPong BlockLoadKeysPong; - typedef typename ptx_plan::BlockLoadItemsPong BlockLoadItemsPong; - - typedef typename ptx_plan::BlockStoreKeysPing BlockStoreKeysPing; - typedef typename ptx_plan::BlockStoreItemsPing BlockStoreItemsPing; - typedef typename ptx_plan::BlockStoreKeysPong BlockStoreKeysPong; - typedef typename ptx_plan::BlockStoreItemsPong BlockStoreItemsPong; - - typedef typename ptx_plan::TempStorage TempStorage; - - enum - { - ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD, - BLOCK_THREADS = ptx_plan::BLOCK_THREADS, - ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE - }; - - struct impl - { - //--------------------------------------------------------------------- - // Per thread data - //--------------------------------------------------------------------- - - bool ping; - TempStorage& storage; - - KeysLoadPingIt keys_in_ping; - ItemsLoadPingIt items_in_ping; - KeysLoadPongIt keys_in_pong; - ItemsLoadPongIt items_in_pong; - - Size keys_count; - - KeysOutputPongIt keys_out_pong; - ItemsOutputPongIt items_out_pong; - KeysOutputPingIt keys_out_ping; - ItemsOutputPingIt items_out_ping; - - CompareOp compare_op; - Size* merge_partitions; - Size coop; - - //--------------------------------------------------------------------- - // Utility functions - //--------------------------------------------------------------------- - - template - THRUST_DEVICE_FUNCTION void - gmem_to_reg(T (&output)[ITEMS_PER_THREAD], - It1 input1, - It2 input2, - int count1, - int count2) - { - if (IS_FULL_TILE) - { -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = BLOCK_THREADS * ITEM + threadIdx.x; - output[ITEM] = (idx < count1) ? input1[idx] : input2[idx - count1]; - } - } - else - { -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = BLOCK_THREADS * ITEM + threadIdx.x; - if (idx < count1 + count2) - { - output[ITEM] = (idx < count1) ? 
input1[idx] : input2[idx - count1]; - } - } - } - } - - template - THRUST_DEVICE_FUNCTION void - reg_to_shared(It output, - T (&input)[ITEMS_PER_THREAD]) - { -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int idx = BLOCK_THREADS * ITEM + threadIdx.x; - output[idx] = input[ITEM]; - } - } - - //--------------------------------------------------------------------- - // Tile processing - //--------------------------------------------------------------------- - - template - THRUST_DEVICE_FUNCTION void - consume_tile(int tid, - Size tile_idx, - Size tile_base, - int count) - { - using core::sync_threadblock; - using core::uninitialized_array; - - Size partition_beg = merge_partitions[tile_idx + 0]; - Size partition_end = merge_partitions[tile_idx + 1]; - - Size list = ~(coop - 1) & tile_idx; - Size start = ITEMS_PER_TILE * list; - Size size = ITEMS_PER_TILE * (coop >> 1); - - Size diag = ITEMS_PER_TILE * tile_idx - start; - - Size keys1_beg = partition_beg; - Size keys1_end = partition_end; - Size keys2_beg = min(keys_count, 2 * start + size + diag - partition_beg); - Size keys2_end = min(keys_count, 2 * start + size + diag + ITEMS_PER_TILE - partition_end); - - if (coop - 1 == ((coop - 1) & tile_idx)) - { - keys1_end = min(keys_count, start + size); - keys2_end = min(keys_count, start + size * 2); - } - - // number of keys per tile - // - int num_keys1 = static_cast(keys1_end - keys1_beg); - int num_keys2 = static_cast(keys2_end - keys2_beg); - - // load keys1 & keys2 - key_type keys_loc[ITEMS_PER_THREAD]; - if (ping) - { - gmem_to_reg(keys_loc, - keys_in_ping + keys1_beg, - keys_in_ping + keys2_beg, - num_keys1, - num_keys2); - } - else - { - gmem_to_reg(keys_loc, - keys_in_pong + keys1_beg, - keys_in_pong + keys2_beg, - num_keys1, - num_keys2); - } - reg_to_shared(&storage.keys_shared[0], keys_loc); - - // preload items into registers already - // - item_type items_loc[ITEMS_PER_THREAD]; - if (MERGE_ITEMS::value) - { - if (ping) - { - gmem_to_reg(items_loc, - items_in_ping + keys1_beg, - items_in_ping + keys2_beg, - num_keys1, - num_keys2); - } - else - { - gmem_to_reg(items_loc, - items_in_pong + keys1_beg, - items_in_pong + keys2_beg, - num_keys1, - num_keys2); - } - } - - sync_threadblock(); - - // use binary search in shared memory - // to find merge path for each of thread - // we can use int type here, because the number of - // items in shared memory is limited - // - int diag0_loc = min(num_keys1 + num_keys2, - ITEMS_PER_THREAD * tid); - - int keys1_beg_loc = merge_path(&storage.keys_shared[0], - &storage.keys_shared[num_keys1], - num_keys1, - num_keys2, - diag0_loc, - compare_op); - int keys1_end_loc = num_keys1; - int keys2_beg_loc = diag0_loc - keys1_beg_loc; - int keys2_end_loc = num_keys2; - - int num_keys1_loc = keys1_end_loc - keys1_beg_loc; - int num_keys2_loc = keys2_end_loc - keys2_beg_loc; - - // perform serial merge - // - int indices[ITEMS_PER_THREAD]; - - serial_merge(&storage.keys_shared[0], - keys1_beg_loc, - keys2_beg_loc + num_keys1, - num_keys1_loc, - num_keys2_loc, - keys_loc, - indices, - compare_op); - - sync_threadblock(); - - // write keys - // - if (ping) - { - if (IS_FULL_TILE) - { - BlockStoreKeysPing(storage.store_keys_ping) - .Store(keys_out_ping + tile_base, keys_loc); - } - else - { - BlockStoreKeysPing(storage.store_keys_ping) - .Store(keys_out_ping + tile_base, keys_loc, num_keys1 + num_keys2); - } - } - else - { - if (IS_FULL_TILE) - { - BlockStoreKeysPong(storage.store_keys_pong) - .Store(keys_out_pong + tile_base, 
keys_loc); - } - else - { - BlockStoreKeysPong(storage.store_keys_pong) - .Store(keys_out_pong + tile_base, keys_loc, num_keys1 + num_keys2); - } - } - - // if items are provided, merge them - if (MERGE_ITEMS::value) - { - sync_threadblock(); - - reg_to_shared(&storage.items_shared[0], items_loc); - - sync_threadblock(); - - // gather items from shared mem - // -#pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - items_loc[ITEM] = storage.items_shared[indices[ITEM]]; - } - - sync_threadblock(); - - // write from reg to gmem - // - if (ping) - { - if (IS_FULL_TILE) - { - BlockStoreItemsPing(storage.store_items_ping) - .Store(items_out_ping + tile_base, items_loc); - } - else - { - BlockStoreItemsPing(storage.store_items_ping) - .Store(items_out_ping + tile_base, items_loc, count); - } - } - else - { - if (IS_FULL_TILE) - { - BlockStoreItemsPong(storage.store_items_pong) - .Store(items_out_pong + tile_base, items_loc); - } - else - { - BlockStoreItemsPong(storage.store_items_pong) - .Store(items_out_pong + tile_base, items_loc, count); - } - } - } - } + using DispatchMergeSortT = + cub::DispatchMergeSort; - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - THRUST_DEVICE_FUNCTION - impl(bool ping_, - TempStorage& storage_, - KeysLoadPingIt keys_in_ping_, - ItemsLoadPingIt items_in_ping_, - KeysLoadPongIt keys_in_pong_, - ItemsLoadPongIt items_in_pong_, - Size keys_count_, - KeysOutputPingIt keys_out_ping_, - ItemsOutputPingIt items_out_ping_, - KeysOutputPongIt keys_out_pong_, - ItemsOutputPongIt items_out_pong_, - CompareOp compare_op_, - Size* merge_partitions_, - Size coop_) - : ping(ping_), - storage(storage_), - keys_in_ping(keys_in_ping_), - items_in_ping(items_in_ping_), - keys_in_pong(keys_in_pong_), - items_in_pong(items_in_pong_), - keys_count(keys_count_), - keys_out_pong(keys_out_pong_), - items_out_pong(items_out_pong_), - keys_out_ping(keys_out_ping_), - items_out_ping(items_out_ping_), - compare_op(compare_op_), - merge_partitions(merge_partitions_), - coop(coop_) - { - // XXX with 8.5 chaging type to Size (or long long) results in error! 
- int tile_idx = blockIdx.x; - Size num_tiles = gridDim.x; - Size tile_base = Size(tile_idx) * ITEMS_PER_TILE; - int tid = threadIdx.x; - int items_in_tile = static_cast(min((Size)ITEMS_PER_TILE, - keys_count - tile_base)); - if (tile_idx < num_tiles - 1) - { - consume_tile(tid, - tile_idx, - tile_base, - ITEMS_PER_TILE); - } - else - { - consume_tile(tid, - tile_idx, - tile_base, - items_in_tile); - } - } - }; // struct impl - - //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- - - THRUST_AGENT_ENTRY(bool ping, - KeysIt keys_ping, - ItemsIt items_ping, - Size keys_count, - key_type* keys_pong, - item_type* items_pong, - CompareOp compare_op, - Size* merge_partitions, - Size coop, - char* shmem) - { - TempStorage& storage = *reinterpret_cast(shmem); - - impl(ping, - storage, - core::make_load_iterator(ptx_plan(), keys_ping), - core::make_load_iterator(ptx_plan(), items_ping), - core::make_load_iterator(ptx_plan(), keys_pong), - core::make_load_iterator(ptx_plan(), items_pong), - keys_count, - keys_pong, - items_pong, - keys_ping, - items_ping, - compare_op, - merge_partitions, - coop); - } - }; // struct MergeAgent; - - ///////////////////////// + return DispatchMergeSortT::Dispatch(d_temp_storage, + temp_storage_bytes, + keys, + items, + keys, + items, + keys_count, + compare_op, + stream, + debug_sync); + } template THRUST_RUNTIME_FUNCTION cudaError_t - doit_step(void* d_temp_storage, - size_t& temp_storage_bytes, - KeysIt keys, - ItemsIt items, - Size keys_count, - CompareOp compare_op, + doit_step(void *d_temp_storage, + size_t &temp_storage_bytes, + KeysIt keys, + ItemsIt items, + Size keys_count, + CompareOp compare_op, cudaStream_t stream, - bool debug_sync) + bool debug_sync) { - using core::AgentPlan; - using core::get_agent_plan; - - typedef typename iterator_traits::value_type key_type; - typedef typename iterator_traits::value_type item_type; - - typedef core::AgentLauncher< - BlockSortAgent > - block_sort_agent; - - typedef core::AgentLauncher > - partition_agent; - - typedef core::AgentLauncher< - MergeAgent > - merge_agent; - - cudaError_t status = cudaSuccess; - if (keys_count == 0) - return status; - - typename core::get_plan::type partition_plan = - partition_agent::get_plan(); - - typename core::get_plan::type merge_plan = - merge_agent::get_plan(stream); - - AgentPlan block_sort_plan = merge_plan; - - int tile_size = merge_plan.items_per_tile; - Size num_tiles = (keys_count + tile_size - 1) / tile_size; - - size_t temp_storage1 = (1 + num_tiles) * sizeof(Size); - size_t temp_storage2 = keys_count * sizeof(key_type); - size_t temp_storage3 = keys_count * sizeof(item_type) * SORT_ITEMS::value; - size_t temp_storage4 = core::vshmem_size(max(block_sort_plan.shared_memory_size, - merge_plan.shared_memory_size), - num_tiles); - - void* allocations[4] = {NULL, NULL, NULL, NULL}; - size_t allocation_sizes[4] = {temp_storage1, temp_storage2, temp_storage3, temp_storage4}; - - status = core::alias_storage(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes); - CUDA_CUB_RET_IF_FAIL(status); - - if (d_temp_storage == NULL) - { - return status; - }; - - int num_passes = static_cast(thrust::detail::log2_ri(num_tiles)); - bool ping = !(1 & num_passes); - - Size* merge_partitions = (Size*)allocations[0]; - key_type* keys_buffer = (key_type*)allocations[1]; - item_type* items_buffer = (item_type*)allocations[2]; - - char* vshmem_ptr = temp_storage4 > 
0 ? (char*)allocations[3] : NULL; - - - block_sort_agent(block_sort_plan, keys_count, stream, vshmem_ptr, "block_sort_agent", debug_sync) - .launch(ping, keys, items, keys_count, keys_buffer, items_buffer, compare_op); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - - size_t num_partitions = num_tiles + 1; - - partition_agent pa(partition_plan, num_partitions, stream, "partition_agent", debug_sync); - merge_agent ma(merge_plan, keys_count, stream, vshmem_ptr, "merge_agent", debug_sync); - - for (int pass = 0; pass < num_passes; ++pass, ping = !ping) { - Size coop = Size(2) << pass; - - pa.launch(ping, - keys, - keys_buffer, - keys_count, - num_partitions, - merge_partitions, - compare_op, - coop, - merge_plan.items_per_tile); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - - - ma.launch(ping, - keys, - items, - keys_count, - keys_buffer, - items_buffer, - compare_op, - merge_partitions, - coop); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); + return cudaSuccess; } - return status; + thrust::detail::integral_constant sort_items{}; + + return doit_step(d_temp_storage, + temp_storage_bytes, + keys, + items, + keys_count, + compare_op, + stream, + debug_sync, + sort_items); } template +__host__ __device__ +bool +must_perform_optional_stream_synchronization(execution_policy &) +{ + return true; +} + +// Entry point/interface. +template +__host__ __device__ bool +must_perform_optional_synchronization(execution_policy &policy) +{ + return must_perform_optional_stream_synchronization(derived_cast(policy)); +} + + // Fallback implementation of the customization point. __thrust_exec_check_disable__ template @@ -105,6 +124,50 @@ return synchronize_stream(derived_cast(policy)); } +// Fallback implementation of the customization point. +__thrust_exec_check_disable__ +template +__host__ __device__ +cudaError_t +synchronize_stream_optional(execution_policy &policy) +{ + cudaError_t result; + if (THRUST_IS_HOST_CODE) { + #if THRUST_INCLUDE_HOST_CODE + if(must_perform_optional_synchronization(policy)){ + cudaStreamSynchronize(stream(policy)); + result = cudaGetLastError(); + }else{ + result = cudaSuccess; + } + #endif + } else { + #if THRUST_INCLUDE_DEVICE_CODE + #if __THRUST_HAS_CUDART__ + if(must_perform_optional_synchronization(policy)){ + cub::detail::device_synchronize(); + result = cudaGetLastError(); + }else{ + result = cudaSuccess; + } + #else + THRUST_UNUSED_VAR(policy); + result = cudaSuccess; + #endif + #endif + } + return result; +} + +// Entry point/interface. 
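The hunks above route Thrust's internal synchronization through an optional, policy-controlled customization point: algorithms call `synchronize_optional`, which only waits when `must_perform_optional_synchronization` reports that the active policy requires it, and the nosync policies opt out via an overload that returns `false`. The following is a minimal, self-contained sketch of that dispatch pattern in plain C++; the names `policy_base`, `nosync_policy`, and `default_policy` are placeholders for illustration, not Thrust's actual types, and the stream wait is replaced by a print statement:

```cpp
#include <iostream>

// Fallback: by default every policy performs the optional synchronization.
template <class Derived>
struct policy_base {};

template <class Derived>
bool must_perform_optional_synchronization(const policy_base<Derived>&)
{
  return true;
}

// A "nosync" policy opts out with a more specialized overload, analogous to
// the friend function defined by the nosync execution policies in this patch.
struct nosync_policy : policy_base<nosync_policy> {};

bool must_perform_optional_synchronization(const nosync_policy&)
{
  return false;
}

// Analogue of synchronize_optional(): skip the wait when the policy allows it.
// In Thrust the wait would be cudaStreamSynchronize()/device_synchronize().
template <class Policy>
void synchronize_optional(const Policy& policy)
{
  if (must_perform_optional_synchronization(policy))
  {
    std::cout << "synchronizing stream\n";
  }
  else
  {
    std::cout << "skipping optional synchronization\n";
  }
}

struct default_policy : policy_base<default_policy> {};

int main()
{
  synchronize_optional(default_policy{}); // prints "synchronizing stream"
  synchronize_optional(nosync_policy{});  // prints "skipping optional synchronization"
}
```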
+template +__host__ __device__ +cudaError_t +synchronize_optional(Policy &policy) +{ + return synchronize_stream_optional(derived_cast(policy)); +} + template THRUST_HOST_FUNCTION cudaError_t trivial_copy_from_device(Type * dst, diff -Nru libthrust-1.15.0/thrust/system/cuda/execution_policy.h libthrust-1.16.0/thrust/system/cuda/execution_policy.h --- libthrust-1.15.0/thrust/system/cuda/execution_policy.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/execution_policy.h 2022-02-09 20:04:17.000000000 +0000 @@ -26,59 +26,6 @@ ******************************************************************************/ #pragma once -// histogram -// sort (radix-sort, merge-sort) - #include #include #include - -// pass -// ---------------- -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// fail -// ---------------- -// fails with mixed types -#include - -// mixed types are not compiling, commented in testing/scan.cu -#include - -// stubs passed -// ---------------- -#include -#include -#include -#include -#include - -// work in progress - diff -Nru libthrust-1.15.0/thrust/system/cuda/memory_resource.h libthrust-1.16.0/thrust/system/cuda/memory_resource.h --- libthrust-1.15.0/thrust/system/cuda/memory_resource.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/cuda/memory_resource.h 2022-02-09 20:04:17.000000000 +0000 @@ -42,8 +42,8 @@ namespace detail { - typedef cudaError_t (*allocation_fn)(void **, std::size_t); - typedef cudaError_t (*deallocation_fn)(void *); + typedef cudaError_t (CUDARTAPI *allocation_fn)(void **, std::size_t); + typedef cudaError_t (CUDARTAPI *deallocation_fn)(void *); template class cuda_memory_resource final : public mr::memory_resource @@ -79,7 +79,7 @@ } }; - inline cudaError_t cudaMallocManaged(void ** ptr, std::size_t bytes) + inline cudaError_t CUDARTAPI cudaMallocManaged(void ** ptr, std::size_t bytes) { return ::cudaMallocManaged(ptr, bytes, cudaMemAttachGlobal); } diff -Nru libthrust-1.15.0/thrust/system/detail/generic/shuffle.inl libthrust-1.16.0/thrust/system/detail/generic/shuffle.inl --- libthrust-1.15.0/thrust/system/detail/generic/shuffle.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/detail/generic/shuffle.inl 2022-02-09 20:04:17.000000000 +0000 @@ -48,7 +48,7 @@ right_side_bits = total_bits - left_side_bits; right_side_mask = (1ull << right_side_bits) - 1; - for (std::uint64_t i = 0; i < num_rounds; i++) { + for (std::uint32_t i = 0; i < num_rounds; i++) { key[i] = g(); } } @@ -56,28 +56,34 @@ __host__ __device__ std::uint64_t nearest_power_of_two() const { return 1ull << (left_side_bits + right_side_bits); } - __host__ __device__ std::uint64_t operator()(const std::uint64_t val) const { - // Extract the right and left sides of the input - auto left = static_cast(val >> right_side_bits); - auto right = static_cast(val & right_side_mask); - round_state state = {left, right}; - for (std::uint64_t i = 0; i < num_rounds; i++) { - state = do_round(state, i); + __host__ __device__ std::uint64_t operator()(const std::uint64_t val) const { + std::uint32_t state[2] = { static_cast( val >> right_side_bits ), static_cast( val & right_side_mask ) }; + for( std::uint32_t i = 0; i < num_rounds; i++ ) + { + std::uint32_t hi, lo; + constexpr std::uint64_t M0 = UINT64_C( 
0xD2B74407B1CE6E93 ); + mulhilo( M0, state[0], hi, lo ); + lo = ( lo << ( right_side_bits - left_side_bits ) ) | state[1] >> left_side_bits; + state[0] = ( ( hi ^ key[i] ) ^ state[1] ) & left_side_mask; + state[1] = lo & right_side_mask; } - - // Check we have the correct number of bits on each side - assert((state.left >> left_side_bits) == 0); - assert((state.right >> right_side_bits) == 0); - // Combine the left and right sides together to get result - return state.left << right_side_bits | state.right; + return static_cast(state[0] << right_side_bits) | static_cast(state[1]); } private: + // Perform 64 bit multiplication and save result in two 32 bit int + static __host__ __device__ void mulhilo( std::uint64_t a, std::uint64_t b, std::uint32_t& hi, std::uint32_t& lo ) + { + std::uint64_t product = a * b; + hi = static_cast( product >> 32 ); + lo = static_cast( product ); + } + // Find the nearest power of two - __host__ __device__ std::uint64_t get_cipher_bits(std::uint64_t m) { - if (m == 0) return 0; + static __host__ __device__ std::uint64_t get_cipher_bits(std::uint64_t m) { + if (m <= 16) return 4; std::uint64_t i = 0; m--; while (m != 0) { @@ -87,45 +93,12 @@ return i; } - // Equivalent to boost::hash_combine - __host__ __device__ - std::size_t hash_combine(std::uint64_t lhs, std::uint64_t rhs) const { - lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2); - return lhs; - } - - // Round function, a 'pseudorandom function' who's output is indistinguishable - // from random for each key value input. This is not cryptographically secure - // but sufficient for generating permutations. - __host__ __device__ std::uint32_t round_function(std::uint64_t value, - const std::uint64_t key_) const { - std::uint64_t hash0 = thrust::random::taus88(static_cast(value))(); - std::uint64_t hash1 = thrust::random::ranlux48(value)(); - return static_cast( - hash_combine(hash_combine(hash0, key_), hash1) & left_side_mask); - } - - __host__ __device__ round_state do_round(const round_state state, - const std::uint64_t round) const { - const std::uint32_t new_left = state.right & left_side_mask; - const std::uint32_t round_function_res = - state.left ^ round_function(state.right, key[round]); - if (right_side_bits != left_side_bits) { - // Upper bit of the old right becomes lower bit of new right if we have - // odd length feistel - const std::uint32_t new_right = - (round_function_res << 1ull) | state.right >> left_side_bits; - return {new_left, new_right}; - } - return {new_left, round_function_res}; - } - - static constexpr std::uint64_t num_rounds = 16; + static constexpr std::uint32_t num_rounds = 24; std::uint64_t right_side_bits; std::uint64_t left_side_bits; std::uint64_t right_side_mask; std::uint64_t left_side_mask; - std::uint64_t key[num_rounds]; + std::uint32_t key[num_rounds]; }; struct key_flag_tuple { diff -Nru libthrust-1.15.0/thrust/system/detail/sequential/stable_merge_sort.inl libthrust-1.16.0/thrust/system/detail/sequential/stable_merge_sort.inl --- libthrust-1.15.0/thrust/system/detail/sequential/stable_merge_sort.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/detail/sequential/stable_merge_sort.inl 2022-02-09 20:04:17.000000000 +0000 @@ -97,7 +97,7 @@ { for(; first < last; first += partition_size) { - RandomAccessIterator partition_last = thrust::min(last, first + partition_size); + RandomAccessIterator partition_last = (thrust::min)(last, first + partition_size); thrust::system::detail::sequential::insertion_sort(first, partition_last, comp); } // end 
for @@ -120,7 +120,7 @@ { for(; keys_first < keys_last; keys_first += partition_size, values_first += partition_size) { - RandomAccessIterator1 keys_partition_last = thrust::min(keys_last, keys_first + partition_size); + RandomAccessIterator1 keys_partition_last = (thrust::min)(keys_last, keys_first + partition_size); thrust::system::detail::sequential::insertion_sort_by_key(keys_first, keys_partition_last, values_first, comp); } // end for @@ -143,8 +143,8 @@ { for(; first < last; first += 2 * partition_size, result += 2 * partition_size) { - RandomAccessIterator1 interval_middle = thrust::min(last, first + partition_size); - RandomAccessIterator1 interval_last = thrust::min(last, interval_middle + partition_size); + RandomAccessIterator1 interval_middle = (thrust::min)(last, first + partition_size); + RandomAccessIterator1 interval_last = (thrust::min)(last, interval_middle + partition_size); thrust::merge(exec, first, interval_middle, @@ -178,8 +178,8 @@ keys_first < keys_last; keys_first += stride, values_first += stride, keys_result += stride, values_result += stride) { - RandomAccessIterator1 keys_interval_middle = thrust::min(keys_last, keys_first + partition_size); - RandomAccessIterator1 keys_interval_last = thrust::min(keys_last, keys_interval_middle + partition_size); + RandomAccessIterator1 keys_interval_middle = (thrust::min)(keys_last, keys_first + partition_size); + RandomAccessIterator1 keys_interval_last = (thrust::min)(keys_last, keys_interval_middle + partition_size); RandomAccessIterator2 values_first2 = values_first + (keys_interval_middle - keys_first); diff -Nru libthrust-1.15.0/thrust/system/omp/detail/for_each.inl libthrust-1.16.0/thrust/system/omp/detail/for_each.inl --- libthrust-1.15.0/thrust/system/omp/detail/for_each.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/omp/detail/for_each.inl 2022-02-09 20:04:17.000000000 +0000 @@ -20,12 +20,12 @@ */ #include -#include -#include #include -#include +#include #include #include +#include +#include THRUST_NAMESPACE_BEGIN namespace system @@ -61,14 +61,11 @@ // create a wrapped function for f thrust::detail::wrapped_function wrapped_f(f); -// do not attempt to compile the body of this function, which depends on #pragma omp, -// without support from the compiler -// XXX implement the body of this function in another file to eliminate this ugliness -#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) // use a signed type for the iteration variable or suffer the consequences of warnings typedef typename thrust::iterator_difference::type DifferenceType; DifferenceType signed_n = n; -#pragma omp parallel for + + THRUST_PRAGMA_OMP(parallel for) for(DifferenceType i = 0; i < signed_n; ++i) @@ -76,7 +73,6 @@ RandomAccessIterator temp = first + i; wrapped_f(*temp); } -#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE return first + n; } // end for_each_n() diff -Nru libthrust-1.15.0/thrust/system/omp/detail/pragma_omp.h libthrust-1.16.0/thrust/system/omp/detail/pragma_omp.h --- libthrust-1.15.0/thrust/system/omp/detail/pragma_omp.h 1970-01-01 00:00:00.000000000 +0000 +++ libthrust-1.16.0/thrust/system/omp/detail/pragma_omp.h 2022-02-09 20:04:17.000000000 +0000 @@ -0,0 +1,56 @@ +/****************************************************************************** +* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
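Several hunks in this patch parenthesize `thrust::min` and `thrust::max` at their call sites. The reason is that `windows.h` defines function-like `min`/`max` macros unless `NOMINMAX` is set, and a function-like macro is only expanded when its name is immediately followed by `(`. A small sketch of the effect, using a stand-in macro rather than the real header:

```cpp
#include <algorithm>

// Stand-in for the function-like macro that windows.h defines by default.
#define min(a, b) (((a) < (b)) ? (a) : (b))

int smaller(int x, int y)
{
  // return std::min(x, y);  // `min(` would be macro-expanded and break the call
  return (std::min)(x, y);   // `min` is followed by `)`, so no macro expansion:
                             // the real std::min function template is called
}

int main()
{
  return smaller(3, 7) == 3 ? 0 : 1;
}
```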
+* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are met: +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* * Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* * Neither the name of the NVIDIA CORPORATION nor the +* names of its contributors may be used to endorse or promote products +* derived from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +******************************************************************************/ + +#pragma once + +#include + +#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC +// MSVC ICEs when using the standard C++11 `_Pragma` operator with OpenMP +// directives. +// WAR this by using the MSVC-extension `__pragma`. See this link for more info: +// https://developercommunity.visualstudio.com/t/Using-C11s-_Pragma-with-OpenMP-dire/1590628 +#define THRUST_PRAGMA_OMP_IMPL(directive) __pragma(directive) +#else // Not MSVC: +#define THRUST_PRAGMA_OMP_IMPL(directive) _Pragma(#directive) +#endif + +// For internal use only -- THRUST_PRAGMA_OMP is used to switch between +// different flavors of openmp pragmas. Pragmas are not emitted when OpenMP is +// not available. 
+// +// Usage: +// Replace: #pragma omp parallel for +// With : THRUST_PRAGMA_OMP(parallel for) +// +#if defined(_NVHPC_STDPAR_OPENMP) && _NVHPC_STDPAR_OPENMP == 1 +#define THRUST_PRAGMA_OMP(directive) THRUST_PRAGMA_OMP_IMPL(omp_stdpar directive) +#elif defined(_OPENMP) +#define THRUST_PRAGMA_OMP(directive) THRUST_PRAGMA_OMP_IMPL(omp directive) +#else +#define THRUST_PRAGMA_OMP(directive) +#endif diff -Nru libthrust-1.15.0/thrust/system/omp/detail/reduce_intervals.inl libthrust-1.16.0/thrust/system/omp/detail/reduce_intervals.inl --- libthrust-1.15.0/thrust/system/omp/detail/reduce_intervals.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/omp/detail/reduce_intervals.inl 2022-02-09 20:04:17.000000000 +0000 @@ -62,9 +62,7 @@ index_type n = static_cast(decomp.size()); -#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) -# pragma omp parallel for -#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE + THRUST_PRAGMA_OMP(parallel for) for(index_type i = 0; i < n; i++) { InputIterator begin = input + decomp[i].begin(); diff -Nru libthrust-1.15.0/thrust/system/omp/detail/sort.inl libthrust-1.16.0/thrust/system/omp/detail/sort.inl --- libthrust-1.15.0/thrust/system/omp/detail/sort.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/omp/detail/sort.inl 2022-02-09 20:04:17.000000000 +0000 @@ -113,13 +113,14 @@ , "OpenMP compiler support is not enabled" ); + // Avoid issues on compilers that don't provide `omp_get_num_threads()`. #if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) typedef typename thrust::iterator_difference::type IndexType; - + if(first == last) return; - #pragma omp parallel + THRUST_PRAGMA_OMP(parallel) { thrust::system::detail::internal::uniform_decomposition decomp(last - first, 1, omp_get_num_threads()); @@ -135,7 +136,7 @@ comp); } - #pragma omp barrier + THRUST_PRAGMA_OMP(barrier) // XXX For some reason, MSVC 2015 yields an error unless we include this meaningless semicolon here ; @@ -166,7 +167,7 @@ nseg = (nseg + 1) / 2; h *= 2; - #pragma omp barrier + THRUST_PRAGMA_OMP(barrier) } } #endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE @@ -195,13 +196,14 @@ , "OpenMP compiler support is not enabled" ); + // Avoid issues on compilers that don't provide `omp_get_num_threads()`. 
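The `THRUST_PRAGMA_OMP` wrapper added above relies on the standard `_Pragma` operator, which accepts a string literal and can therefore be produced from a macro argument by stringization, something the plain `#pragma` directive cannot do. A minimal sketch of the same technique under an assumed macro name (`MY_PRAGMA_OMP`, not the Thrust macro) and without the MSVC and `omp_stdpar` branches:

```cpp
#include <cstddef>

// Emit the pragma only when OpenMP is enabled; otherwise expand to nothing.
#if defined(_OPENMP)
#  define MY_PRAGMA_OMP(directive) _Pragma(#directive)
#else
#  define MY_PRAGMA_OMP(directive)
#endif

void scale(double* data, std::size_t n, double factor)
{
  // Expands to `#pragma omp parallel for` when compiled with OpenMP support,
  // and to nothing otherwise, so the loop still compiles either way.
  MY_PRAGMA_OMP(omp parallel for)
  for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(n); ++i)
  {
    data[i] *= factor;
  }
}

int main()
{
  double values[4] = {1.0, 2.0, 3.0, 4.0};
  scale(values, 4, 2.0);
  return 0;
}
```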
#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) typedef typename thrust::iterator_difference::type IndexType; - + if(keys_first == keys_last) return; - #pragma omp parallel + THRUST_PRAGMA_OMP(parallel) { thrust::system::detail::internal::uniform_decomposition decomp(keys_last - keys_first, 1, omp_get_num_threads()); @@ -218,7 +220,7 @@ comp); } - #pragma omp barrier + THRUST_PRAGMA_OMP(barrier) // XXX For some reason, MSVC 2015 yields an error unless we include this meaningless semicolon here ; @@ -250,7 +252,7 @@ nseg = (nseg + 1) / 2; h *= 2; - #pragma omp barrier + THRUST_PRAGMA_OMP(barrier) } } #endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE diff -Nru libthrust-1.15.0/thrust/system/tbb/detail/reduce_by_key.inl libthrust-1.16.0/thrust/system/tbb/detail/reduce_by_key.inl --- libthrust-1.15.0/thrust/system/tbb/detail/reduce_by_key.inl 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/tbb/detail/reduce_by_key.inl 2022-02-09 20:04:17.000000000 +0000 @@ -198,7 +198,7 @@ const size_type interval_idx = r.begin(); const size_type offset_to_first = interval_size * interval_idx; - const size_type offset_to_last = thrust::min(n, offset_to_first + interval_size); + const size_type offset_to_last = (thrust::min)(n, offset_to_first + interval_size); Iterator1 my_keys_first = keys_first + offset_to_first; Iterator1 my_keys_last = keys_first + offset_to_last; diff -Nru libthrust-1.15.0/thrust/system/tbb/detail/reduce_intervals.h libthrust-1.16.0/thrust/system/tbb/detail/reduce_intervals.h --- libthrust-1.15.0/thrust/system/tbb/detail/reduce_intervals.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/system/tbb/detail/reduce_intervals.h 2022-02-09 20:04:17.000000000 +0000 @@ -64,7 +64,7 @@ Size interval_idx = r.begin(); Size offset_to_first = interval_size * interval_idx; - Size offset_to_last = thrust::min(n, offset_to_first + interval_size); + Size offset_to_last = (thrust::min)(n, offset_to_first + interval_size); RandomAccessIterator1 my_first = first + offset_to_first; RandomAccessIterator1 my_last = first + offset_to_last; diff -Nru libthrust-1.15.0/thrust/version.h libthrust-1.16.0/thrust/version.h --- libthrust-1.15.0/thrust/version.h 2021-10-29 18:01:06.000000000 +0000 +++ libthrust-1.16.0/thrust/version.h 2022-02-09 20:04:17.000000000 +0000 @@ -47,7 +47,7 @@ * THRUST_VERSION / 100 % 1000 is the minor version. * THRUST_VERSION / 100000 is the major version. */ -#define THRUST_VERSION 101500 +#define THRUST_VERSION 101600 /*! \def THRUST_MAJOR_VERSION * \brief The preprocessor macro \p THRUST_MAJOR_VERSION encodes the