diff -Nru rocfft-5.5.0/.github/dependabot.yml rocfft-5.7.1/.github/dependabot.yml --- rocfft-5.5.0/.github/dependabot.yml 1970-01-01 00:00:00.000000000 +0000 +++ rocfft-5.7.1/.github/dependabot.yml 2023-08-09 16:19:51.000000000 +0000 @@ -0,0 +1,12 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/docs/.sphinx" # Location of package manifests + open-pull-requests-limit: 10 + schedule: + interval: "daily" diff -Nru rocfft-5.5.0/.gitignore rocfft-5.7.1/.gitignore --- rocfft-5.5.0/.gitignore 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/.gitignore 2023-08-09 16:19:51.000000000 +0000 @@ -40,3 +40,12 @@ # python bytecode __pycache__ + +# documentation artifacts +_build/ +_images/ +_static/ +_templates/ +_toc.yml +docBin/ +_doxygen/ diff -Nru rocfft-5.5.0/.jenkins/application.groovy rocfft-5.7.1/.jenkins/application.groovy --- rocfft-5.5.0/.jenkins/application.groovy 1970-01-01 00:00:00.000000000 +0000 +++ rocfft-5.7.1/.jenkins/application.groovy 2023-08-09 16:19:51.000000000 +0000 @@ -0,0 +1,182 @@ +#!/usr/bin/env groovy +// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ +@Library('rocJenkins@pong') _ + +// This is file for internal AMD use. +// If you are interested in running your own Jenkins, please raise a github issue for assistance. 
+ +import com.amd.project.* +import com.amd.docker.* +import java.nio.file.Path + +def runCI = +{ + nodeDetails, jobName-> + + def prj = new rocProject('rocFFT-internal', 'application') + + prj.defaults.ccache = true + prj.timeout.compile = 600 + prj.timeout.test = 600 + prj.libraryDependencies = ['rocFFT', 'hipFFT'] + + // Define test architectures, optional rocm version argument is available + def nodes = new dockerNodes(nodeDetails, jobName, prj) + + boolean formatCheck = false + + def commonGroovy + + def compileCommand = + { + platform, project-> + def getDependenciesCommand = "" + if (project.installLibraryDependenciesFromCI) + { + project.libraryDependencies.each + { libraryName -> + getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false) + } + } + + def command = """#!/usr/bin/env bash + set -ex + cd ${project.paths.project_build_prefix} + ${getDependenciesCommand} + git clone -b develop-2021 https://github.com/ROCmSoftwarePlatform/Gromacs.git + cd Gromacs + + mkdir build_tmpi + cd build_tmpi + cmake -DCMAKE_HIP_ARCHITECTURES=gfx90a -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DGMX_MPI=OFF -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install .. + make + make install + cd .. + + mkdir build_mpi + cd build_mpi + cmake -DCMAKE_HIP_ARCHITECTURES=gfx908 -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++ -DGMX_MPI=ON -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install .. + make + make install + cd .. 
+ """ + platform.runCommand(this, command) + } + + def testCommand = + { + platform, project-> + + def command = """#!/usr/bin/env bash + set -ex + cd ${project.paths.project_build_prefix} + cd Gromacs + + source gromacs-install/bin/GMXRC + gmx --version + + export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/opt/rocm/lib + echo \$LD_LIBRARY_PATH + + git clone https://github.com/jychang48/benchmark-gromacs.git + cd benchmark-gromacs + + export GMX_MAXBACKUP=-1 + + echo "* Threaded MPI ******************************************************************************************************" + + #ADH_DODEC + cd adh_dodec + tar zxf adh_dodec.tar.gz + gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100 # 1 GPU + gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs + gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200 # 4 GPUs + gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 150 # 8 GPUs + + # STMV + cd .. 
+ cd stmv/ + tar zxf stmv.tar.gz + gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 200 # 1 GPU + gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs + gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 400 # 4 GPUs + gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 400 # 8 GPUs + + # CELLULOSE_NVE + cd .. + cd cellulose_nve/ + tar zxf cellulose_nve.tar.gz + gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100 # 1 GPU + gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs + gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200 # 4 GPUs + gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 200 # 8 GPUs + + echo "* MPI ***************************************************************************************************************" + + # ADH_DODEC + cd .. 
+ cd adh_dodec/ + tar zxf adh_dodec.tar.gz + mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr # 1 GPU + mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs + mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs + mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs + + # STMV + cd .. + cd stmv/ + tar zxf stmv.tar.gz + mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -nstlist 400 -gpu_id 0 -s topol.tpr # 1 GPU + mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs + mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs + mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs + + # CELLULOSE_NVE + cd .. 
+ cd cellulose_nve/ + tar zxf cellulose_nve.tar.gz + mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr # 1 GPU + mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs + mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs + mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs + """ + platform.runCommand(this, command) + } + + buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null) +} + +ci: { + String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) + + def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 5')])]] + propertyList = auxiliary.appendPropertyList(propertyList) + + def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu20:['8gfx90a']])] + jobNameList = auxiliary.appendJobNameList(jobNameList) + + propertyList.each + { + jobName, property-> + if (urlJobName == jobName) + properties(auxiliary.addCommonProperties(property)) + } + + jobNameList.each + { + jobName, nodeDetails-> + if (urlJobName == jobName) + stage(jobName) { + runCI(nodeDetails, jobName) + } + } + + // For url job names that are not listed by the jobNameList i.e. 
compute-rocm-dkms-no-npi-1901 + if(!jobNameList.keySet().contains(urlJobName)) + { + properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) + stage(urlJobName) { + runCI([ubuntu18:['8gfx90a']], urlJobName) + } + } +} diff -Nru rocfft-5.5.0/.jenkins/common.groovy rocfft-5.7.1/.jenkins/common.groovy --- rocfft-5.5.0/.jenkins/common.groovy 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/.jenkins/common.groovy 2023-08-09 16:19:51.000000000 +0000 @@ -12,14 +12,14 @@ { libraryName -> getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false) } - } + } - String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON -DBUILD_FFTW=ON' + String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON' String warningArgs = '-DWERROR=ON' + String buildTunerArgs = '-DROCFFT_BUILD_OFFLINE_TUNER=ON' String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release' String buildTypeDir = debug ? 'debug' : 'release' String staticArg = buildStatic ? '-DBUILD_SHARED_LIBS=off' : '' - String hipClangArgs = jobName.contains('hipclang') ? '-DUSE_HIP_CLANG=ON -DHIP_COMPILER=clang' : '' String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' //Set CI node's gfx arch as target if PR, otherwise use default targets of the library String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' @@ -32,7 +32,7 @@ set -e mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} - ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArg} ${clientArgs} ${warningArgs} ${hipClangArgs} ${staticArg} ${amdgpuTargets} ${rtcBuildCache} ../.. 
+ ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArg} ${clientArgs} ${warningArgs} ${buildTunerArgs} ${staticArg} ${amdgpuTargets} ${rtcBuildCache} ../.. make -j\$(nproc) sudo make install """ @@ -46,12 +46,11 @@ project.paths.construct_build_prefix() - String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON -DBUILD_GTEST=ON -DBUILD_FFTW=ON' + String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON' String warningArgs = '-DWERROR=ON' String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release' String buildTypeDir = debug ? 'debug' : 'release' //String staticArg = buildStatic ? '-DBUILD_SHARED_LIBS=off' : '' - String hipClangArgs = jobName.contains('hipclang') ? '-DUSE_HIP_CLANG=ON -DHIP_COMPILER=clang' : '' String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? 
'-DAMDGPU_TARGETS=\$gfx_arch' : '' @@ -62,7 +61,7 @@ set -ex cd ${project.paths.project_build_prefix}/clients mkdir -p build && cd build - ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArgClients} ${hipClangArgs} ${cmakePrefixPathArg} ../ + ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArgClients} ${cmakePrefixPathArg} ../ make -j\$(nproc) """ platform.runCommand(this, command) @@ -88,6 +87,15 @@ def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/${directory}",false) platform.runCommand(this, packageHelper[0]) platform.archiveArtifacts(this, packageHelper[1]) + + //trim temp files + def command = """#!/usr/bin/env bash + set -ex + cd ${project.paths.project_build_prefix}/build/${directory}/ + rm -rf _CPack_Packages/ + find -name '*.o' -delete + """ + platform.runCommand(this, command) } def runSubsetBuildCommand(platform, project, jobName, genPattern, genSmall, genLarge, boolean onlyDouble) @@ -106,7 +114,6 @@ String precisionArgs = onlyDouble ? '-DGENERATOR_PRECISION=double' : '' String kernelArgs = "${genPatternArgs} ${manualSmallArgs} ${manualLargeArgs} ${precisionArgs}" - String hipClangArgs = jobName.contains('hipclang') ? '-DUSE_HIP_CLANG=ON -DHIP_COMPILER=clang' : '' String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' //Set CI node's gfx arch as target if PR, otherwise use default targets of the library String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' @@ -119,7 +126,7 @@ rm -rf build/${buildTypeDir} mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} - ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArg} ${clientArgs} ${kernelArgs} ${warningArgs} ${hipClangArgs} ${amdgpuTargets} ${rtcBuildCache} ../.. 
+ ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArg} ${clientArgs} ${kernelArgs} ${warningArgs} ${amdgpuTargets} ${rtcBuildCache} ../.. make -j\$(nproc) """ platform.runCommand(this, command) diff -Nru rocfft-5.5.0/.jenkins/debug.groovy rocfft-5.7.1/.jenkins/debug.groovy --- rocfft-5.5.0/.jenkins/debug.groovy 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/.jenkins/debug.groovy 2023-08-09 16:19:51.000000000 +0000 @@ -18,7 +18,7 @@ prj.defaults.ccache = true prj.timeout.compile = 600 prj.timeout.test = 600 - prj.libraryDependencies = ['rocRAND'] + prj.libraryDependencies = ['rocRAND','hipRAND'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) diff -Nru rocfft-5.5.0/.jenkins/performance.groovy rocfft-5.7.1/.jenkins/performance.groovy --- rocfft-5.5.0/.jenkins/performance.groovy 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/.jenkins/performance.groovy 2023-08-09 16:19:51.000000000 +0000 @@ -28,12 +28,11 @@ git branch: "${reference}", url: 'https://github.com/ROCmSoftwarePlatform/rocFFT.git' } - String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON -DBUILD_FFTW=OFF' - String noclientArgs = '-DBUILD_CLIENTS_SAMPLES=OFF -DBUILD_CLIENTS_TESTS=OFF -DBUILD_CLIENTS_RIDER=OFF -DBUILD_FFTW=OFF' + String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON' + String noclientArgs = '-DBUILD_CLIENTS_SAMPLES=OFF -DBUILD_CLIENTS_TESTS=OFF -DBUILD_CLIENTS_RIDER=OFF' String warningArgs = '-DWERROR=ON' String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release' String buildTypeDir = debug ? 'debug' : 'release' - String hipClangArgs = jobName.contains('hipclang') ? 
'-DUSE_HIP_CLANG=ON -DHIP_COMPILER=clang' : '' String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_DIR/rocfft_build_cache.db" String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' @@ -44,13 +43,13 @@ set -e mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} - ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc -DAMDGPU_TARGETS=\$gfx_arch -DSINGLELIB=on ${buildTypeArg} ${clientArgs} ${warningArgs} ${hipClangArgs} ${rtcBuildCache} ../.. + ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc -DAMDGPU_TARGETS=\$gfx_arch -DSINGLELIB=on ${buildTypeArg} ${clientArgs} ${warningArgs} ${rtcBuildCache} ../.. make -j\$(nproc) popd cd ref-repo mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} - ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc -DAMDGPU_TARGETS=\$gfx_arch -DSINGLELIB=on ${buildTypeArg} ${noclientArgs} ${warningArgs} ${hipClangArgs} ${rtcBuildCache} ../.. + ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc -DAMDGPU_TARGETS=\$gfx_arch -DSINGLELIB=on ${buildTypeArg} ${noclientArgs} ${warningArgs} ${rtcBuildCache} ../.. make -j\$(nproc) """ platform.runCommand(this, command) @@ -89,41 +88,39 @@ reportTitles: "${dataType}-precision-${platform.gpu}"]) } - if (platform.gpu != 'gfx90a') + + withCredentials([gitUsernamePassword(credentialsId: 'GitHub-ROCmMathLibrariesBot-Token', gitToolName: 'git-tool')]) { - withCredentials([gitUsernamePassword(credentialsId: 'GitHub-ROCmMathLibrariesBot-Token', gitToolName: 'git-tool')]) - { - platform.runCommand( - this, - """ - cd ${project.paths.build_prefix} - git clone https://github.com/ROCmSoftwarePlatform/rocPTS.git -b release/rocpts-rel-1.0 - cd rocPTS - python3 -m pip install build - python3 -m build - python3 -m pip install . 
- """ - ) - } - writeFile( - file: project.paths.project_build_prefix + "/record_pts.py", - text: libraryResource("com/amd/scripts/record_pts.py")) - def setupBranch = env.CHANGE_ID ? "git branch \$BRANCH_NAME" : "" - def command = """#!/usr/bin/env bash - set -ex - cd ${project.paths.project_build_prefix} - ${setupBranch} - git checkout \$BRANCH_NAME - benchmark_folder=rocFFT_Benchmark_Dataset_\$(date +%Y%m%d) - mkdir -p \${benchmark_folder}/all_change \${benchmark_folder}/all_ref - cp -uf ./*_change/* \${benchmark_folder}/all_change - cp -uf ./*_ref/* \${benchmark_folder}/all_ref - python3 ./record_pts.py --dataset-path \$PWD/\${benchmark_folder} --reference-dataset all_ref --new-dataset all_change -v 5.3 -l pts_rocfft_benchmark_data - """ - withCredentials([usernamePassword(credentialsId: 'PTS_API_ID_KEY_PROD', usernameVariable: 'PTS_API_ID', passwordVariable: 'PTS_API_KEY')]) - { - platform.runCommand(this, command) - } + platform.runCommand( + this, + """ + cd ${project.paths.build_prefix} + git clone https://github.com/ROCmSoftwarePlatform/rocPTS.git -b release/rocpts-rel-1.1.0 + cd rocPTS + python3 -m pip install build + python3 -m build + python3 -m pip install . + """ + ) + } + writeFile( + file: project.paths.project_build_prefix + "/record_pts.py", + text: libraryResource("com/amd/scripts/record_pts.py")) + def setupBranch = env.CHANGE_ID ? 
"git branch \$BRANCH_NAME" : "" + def command = """#!/usr/bin/env bash + set -ex + cd ${project.paths.project_build_prefix} + ${setupBranch} + git checkout \$BRANCH_NAME + benchmark_folder=rocFFT_Benchmark_Dataset_\$(date +%Y%m%d) + mkdir -p \${benchmark_folder}/all_change \${benchmark_folder}/all_ref + cp -uf ./*_change/* \${benchmark_folder}/all_change + cp -uf ./*_ref/* \${benchmark_folder}/all_ref + python3 ./record_pts.py --dataset-path \$PWD/\${benchmark_folder} --reference-dataset all_ref --new-dataset all_change -v 5.5 -l pts_rocfft_benchmark_data-v1.0.0 + """ + withCredentials([usernamePassword(credentialsId: 'PTS_API_ID_KEY_PROD', usernameVariable: 'PTS_API_ID', passwordVariable: 'PTS_API_KEY')]) + { + platform.runCommand(this, command) } } @@ -136,7 +133,7 @@ prj.defaults.ccache = true prj.timeout.compile = 600 prj.timeout.test = 600 - prj.libraryDependencies = ['rocRAND'] + prj.libraryDependencies = ['rocRAND','hipRAND'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) diff -Nru rocfft-5.5.0/.jenkins/precheckin.groovy rocfft-5.7.1/.jenkins/precheckin.groovy --- rocfft-5.5.0/.jenkins/precheckin.groovy 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/.jenkins/precheckin.groovy 2023-08-09 16:19:51.000000000 +0000 @@ -18,7 +18,7 @@ prj.defaults.ccache = true prj.timeout.compile = 600 prj.timeout.test = 600 - prj.libraryDependencies = ['rocRAND'] + prj.libraryDependencies = ['rocRAND','hipRAND'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) diff -Nru rocfft-5.5.0/.jenkins/staticanalysis.groovy rocfft-5.7.1/.jenkins/staticanalysis.groovy --- rocfft-5.5.0/.jenkins/staticanalysis.groovy 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/.jenkins/staticanalysis.groovy 2023-08-09 16:19:51.000000000 +0000 @@ -13,13 +13,6 @@ { project.paths.construct_build_prefix() - def command = """#!/usr/bin/env 
bash - set -x - ${project.paths.project_build_prefix}/docs/run_doc.sh - """ - - platform.runCommand(this, command) - def yapfCommand = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} @@ -30,14 +23,6 @@ """ platform.runCommand(this, yapfCommand) - - publishHTML([allowMissing: false, - alwaysLinkToLastBuild: false, - keepAll: false, - reportDir: "${project.paths.project_build_prefix}/docs/source/_build/html", - reportFiles: "index.html", - reportName: "Documentation", - reportTitles: "Documentation"]) } def runCI = @@ -45,7 +30,7 @@ nodeDetails, jobName-> def prj = new rocProject('rocFFT-internal', 'StaticAnalysis') - prj.libraryDependencies = ['rocRAND'] + prj.libraryDependencies = ['rocRAND','hipRAND'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) diff -Nru rocfft-5.5.0/.jenkins/staticlibrary.groovy rocfft-5.7.1/.jenkins/staticlibrary.groovy --- rocfft-5.5.0/.jenkins/staticlibrary.groovy 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/.jenkins/staticlibrary.groovy 2023-08-09 16:19:51.000000000 +0000 @@ -18,7 +18,7 @@ prj.defaults.ccache = true prj.timeout.compile = 600 prj.timeout.test = 600 - prj.libraryDependencies = ['rocRAND'] + prj.libraryDependencies = ['rocRAND','hipRAND'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) diff -Nru rocfft-5.5.0/.readthedocs.yaml rocfft-5.7.1/.readthedocs.yaml --- rocfft-5.5.0/.readthedocs.yaml 1970-01-01 00:00:00.000000000 +0000 +++ rocfft-5.7.1/.readthedocs.yaml 2023-08-09 16:19:51.000000000 +0000 @@ -0,0 +1,14 @@ +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +sphinx: + configuration: docs/conf.py + +formats: [htmlzip] + +python: + version: "3.8" + install: + - requirements: docs/.sphinx/requirements.txt diff -Nru rocfft-5.5.0/CHANGELOG.md 
rocfft-5.7.1/CHANGELOG.md --- rocfft-5.5.0/CHANGELOG.md 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/CHANGELOG.md 2023-08-09 16:19:51.000000000 +0000 @@ -2,6 +2,35 @@ Full documentation for rocFFT is available at [rocfft.readthedocs.io](https://rocfft.readthedocs.io/en/latest/). +## rocFFT 1.0.24 for ROCm 5.7.0 + +### Optimizations +- Improved performance of complex forward/inverse 1D FFTs (2049 <= length <= 131071) that use Bluestein's algorithm. + +### Added +- Implemented a solution map version converter and finish the first conversion from ver.0 to ver.1. Where version 1 removes some incorrect kernels (sbrc/sbcr using half_lds) + +### Changed + +- Moved rocfft_rtc_helper executable to lib/rocFFT directory on Linux. +- Moved library kernel cache to lib/rocFFT directory. + +## rocFFT 1.0.23 for ROCm 5.6.0 + +### Added +- Implemented half-precision transforms, which can be requested by passing rocfft_precision_half to rocfft_plan_create. +- Implemented a hierarchical solution map which saves how to decompose a problem and the kernels to be used. +- Implemented a first version of offline-tuner to support tuning kernels for C2C/Z2Z problems. + +### Changed +- Replaced std::complex with hipComplex data types for data generator. +- FFT plan dimensions are now sorted to be row-major internally where possible, which produces better plans if the dimensions were accidentally specified in a different order (column-major, for example). +- Added --precision argument to benchmark/test clients. --double is still accepted but is deprecated as a method to request a double-precision transform. +- Improved performance test suite statistical framework. + +### Fixed +- Fixed over-allocation of LDS in some real-complex kernels, which was resulting in kernel launch failure. + ## rocFFT 1.0.22 for ROCm 5.5.0 ### Optimizations @@ -43,8 +72,8 @@ - Added gfx1100 and gfx1102 to default AMDGPU_TARGETS. ### Changed -- Moved runtime compilation cache to in-memory by default. 
A default on-disk cache can encounter contention problems -on multi-node clusters with a shared filesystem. rocFFT can still be told to use an on-disk cache by setting the +- Moved runtime compilation cache to in-memory by default. A default on-disk cache can encounter contention problems +on multi-node clusters with a shared filesystem. rocFFT can still be told to use an on-disk cache by setting the ROCFFT_RTC_CACHE_PATH environment variable. ## rocFFT 1.0.18 for ROCm 5.3.0 diff -Nru rocfft-5.5.0/CMakeLists.txt rocfft-5.7.1/CMakeLists.txt --- rocfft-5.5.0/CMakeLists.txt 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/CMakeLists.txt 2023-08-09 16:19:51.000000000 +0000 @@ -1,5 +1,5 @@ # ############################################################################# -# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -22,6 +22,9 @@ cmake_minimum_required( VERSION 3.16 ) +# We use C++17 features, this will add compile option: -std=c++17 +set( CMAKE_CXX_STANDARD 17 ) + # This should appear before the project command, because it does not # use FORCE if( WIN32 ) @@ -46,16 +49,6 @@ project( rocfft LANGUAGES CXX C ) -# Control hip-clang use: -set( USE_HIP_CLANG OFF CACHE BOOL "Use hip-clang to build for amdgpu" ) -if( USE_HIP_CLANG ) - message( STATUS "Use hip-clang to build for amdgpu backend" ) - set( HIP_PLATFORM "hip-clang" ) - set( HIP_COMPILER "clang" ) -else() - set( HIP_PLATFORM "hcc" ) -endif() - # This finds the rocm-cmake project, and installs it if not found # rocm-cmake contains common cmake code for rocm projects to help setup and install set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern ) @@ -98,7 +91,7 @@ include( ROCMHeaderWrapper ) # Using standardized versioning from rocm-cmake 
-set ( VERSION_STRING "1.0.21" ) +set ( VERSION_STRING "1.0.23" ) rocm_setup_version( VERSION ${VERSION_STRING} ) # Append our library helper cmake path and the cmake path for hip (for @@ -123,6 +116,10 @@ option(ROCFFT_RUNTIME_COMPILE "Enable runtime compilation of kernels" ON) option(ROCFFT_RUNTIME_COMPILE_DEFAULT "Compile kernels at runtime by default" OFF) +# Using -DROCFFT_BUILD_OFFLINE_TUNER=ON to compile an executable, +# Set default to OFF since users are not likely to tune +option(ROCFFT_BUILD_OFFLINE_TUNER "Build with offline tuner executable rocfft_offline_tuner" OFF) + if(BUILD_ADDRESS_SANITIZER) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -shared-libasan") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -shared-libasan") @@ -147,7 +144,7 @@ # Use target ID syntax if supported for AMDGPU_TARGETS rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS - TARGETS "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx1102") + TARGETS "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102") set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") list(LENGTH AMDGPU_TARGETS AMDGPU_TARGETS_LENGTH) @@ -206,40 +203,40 @@ if(BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_SELFTEST OR BUILD_CLIENTS_RIDER) find_package( Boost COMPONENTS program_options REQUIRED) set(BOOST_DEB "libboost-program-options${Boost_VERSION_MAJOR}.${Boost_VERSION_MINOR}.${Boost_VERSION_PATCH}") - set(BOOST_RPM "boost-devel = ${Boost_VERSION}") + set(BOOST_RPM "boost-program-options = ${Boost_VERSION_MAJOR}.${Boost_VERSION_MINOR}.${Boost_VERSION_PATCH}") endif() if( NOT CLIENTS_OS ) rocm_set_os_id( CLIENTS_OS ) endif() if(BUILD_CLIENTS_TESTS AND (NOT DEFINED BUILD_CLIENTS_TESTS_OPENMP OR BUILD_CLIENTS_TESTS_OPENMP)) set(OPENMP_DEB "libgomp1") + set(FFTW_DEB "libfftw3-bin") if(CLIENTS_OS STREQUAL "sles") set(OPENMP_RPM "libgomp1") + set(FFTW_RPM "libfftw3-3") else() set(OPENMP_RPM "libgomp") 
+ set(FFTW_RPM "fftw-libs") endif() endif() if(CLIENTS_OS STREQUAL "sles") - set(BOOST_RPM RPM "libboost_program_options${Boost_VERSION_MAJOR}_${Boost_VERSION_MINOR}_${Boost_VERSION_PATCH}-devel") + set(BOOST_RPM RPM "libboost_program_options${Boost_VERSION_MAJOR}_${Boost_VERSION_MINOR}_${Boost_VERSION_PATCH}") endif() rocm_package_setup_component(clients) - rocm_package_setup_client_component(clients-common) if(BUILD_CLIENTS_TESTS) rocm_package_setup_client_component( tests DEPENDS - COMPONENT clients-common - DEB ${BOOST_DEB} ${OPENMP_DEB} - RPM ${BOOST_RPM} ${OPENMP_RPM} + DEB ${BOOST_DEB} ${OPENMP_DEB} ${FFTW_DEB} rocrand + RPM ${BOOST_RPM} ${OPENMP_RPM} ${FFTW_RPM} rocrand ) endif() if(BUILD_CLIENTS_RIDER) rocm_package_setup_client_component( benchmarks DEPENDS - COMPONENT clients-common - DEB ${BOOST_DEB} - RPM ${BOOST_RPM} + DEB ${BOOST_DEB} rocrand + RPM ${BOOST_RPM} rocrand ) rocm_install( DIRECTORY scripts/perf diff -Nru rocfft-5.5.0/LICENSE.md rocfft-5.7.1/LICENSE.md --- rocfft-5.5.0/LICENSE.md 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/LICENSE.md 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. +Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: diff -Nru rocfft-5.5.0/README.md rocfft-5.7.1/README.md --- rocfft-5.5.0/README.md 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/README.md 2023-08-09 16:19:51.000000000 +0000 @@ -34,12 +34,11 @@ A static library can be compiled by using the option `-DBUILD_SHARED_LIBS=off` -To use the [hip-clang compiler][3], one must specify -`-DUSE_HIP_CLANG=ON -DHIP_COMPILER=clang` to cmake. rocFFT enables -use of indirect function calls by default and requires ROCm 4.3 or -higher to build successfully. `-DROCFFT_CALLBACKS_ENABLED=off` -may be specified to cmake to disable those calls on older ROCm -compilers, though callbacks will not work correctly in this configuration. +rocFFT enables use of indirect function calls by default and requires +ROCm 4.3 or higher to build successfully. +`-DROCFFT_CALLBACKS_ENABLED=off` may be specified to cmake to disable +those calls on older ROCm compilers, though callbacks will not work +correctly in this configuration. There are several clients included with rocFFT: 1. rocfft-rider runs general transforms and is useful for performance analysis; @@ -83,6 +82,18 @@ Please refer to the [library documentation][4] for current documentation. +### How to build documentation + +Please follow the steps below to build the documentation. + +``` +cd docs + +pip3 install -r .sphinx/requirements.txt + +python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html +``` + ## Examples Examples may be found in the [clients/samples][5] subdirectory. 
diff -Nru rocfft-5.5.0/clients/data_gen.h rocfft-5.7.1/clients/data_gen.h --- rocfft-5.5.0/clients/data_gen.h 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/data_gen.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,1153 +0,0 @@ -// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -#ifndef DATA_GEN_H -#define DATA_GEN_H - -#include "../shared/arithmetic.h" -#include "../shared/gpubuf.h" -#include -#include -#include -#include -#include -#include -#include - -static const unsigned int DATA_GEN_THREADS = 32; - -template -struct input_val_1D -{ - T val1; -}; - -template -struct input_val_2D -{ - T val1; - T val2; -}; - -template -struct input_val_3D -{ - T val1; - T val2; - T val3; -}; - -template -static input_val_1D get_input_val(const T& val) -{ - return input_val_1D{val}; -} - -template -static input_val_2D get_input_val(const std::tuple& val) -{ - return input_val_2D{std::get<0>(val), std::get<1>(val)}; -} - -template -static input_val_3D get_input_val(const std::tuple& val) -{ - return input_val_3D{std::get<0>(val), std::get<1>(val), std::get<2>(val)}; -} - -template -__device__ static size_t - compute_index(const input_val_1D& length, const input_val_1D& stride, size_t base) -{ - return (length.val1 * stride.val1) + base; -} - -template -__device__ static size_t - compute_index(const input_val_2D& length, const input_val_2D& stride, size_t base) -{ - return (length.val1 * stride.val1) + (length.val2 * stride.val2) + base; -} - -template -__device__ static size_t - compute_index(const input_val_3D& length, const input_val_3D& stride, size_t base) -{ - return (length.val1 * stride.val1) + (length.val2 * stride.val2) + (length.val3 * stride.val3) - + base; -} - -template -static inline input_val_1D make_zero_length(const input_val_1D& whole_length) -{ - return input_val_1D{0}; -} - -template -static inline input_val_2D make_zero_length(const input_val_2D& whole_length) -{ - return input_val_2D{0, 0}; -} - -template -static inline input_val_3D make_zero_length(const input_val_3D& whole_length) -{ - return input_val_3D{0, 0, 0}; -} - -template -__device__ static input_val_1D get_length(const size_t i, const input_val_1D& whole_length) -{ - auto xlen = whole_length.val1; - - auto xidx = i % xlen; - - return input_val_1D{xidx}; -} - -template 
-__device__ static size_t get_batch(const size_t i, const input_val_1D& whole_length) -{ - auto xlen = whole_length.val1; - - auto yidx = i / xlen; - - return yidx; -} - -template -__device__ static input_val_2D get_length(const size_t i, const input_val_2D& whole_length) -{ - auto xlen = whole_length.val1; - auto ylen = whole_length.val2; - - auto xidx = i % xlen; - auto yidx = i / xlen % ylen; - - return input_val_2D{xidx, yidx}; -} - -template -__device__ static size_t get_batch(const size_t i, const input_val_2D& whole_length) -{ - auto xlen = whole_length.val1; - auto ylen = whole_length.val2; - - auto zidx = i / xlen / ylen; - - return zidx; -} - -template -__device__ static input_val_3D get_length(const size_t i, const input_val_3D& whole_length) -{ - auto xlen = whole_length.val1; - auto ylen = whole_length.val2; - auto zlen = whole_length.val3; - - auto xidx = i % xlen; - auto yidx = i / xlen % ylen; - auto zidx = i / xlen / ylen % zlen; - - return input_val_3D{xidx, yidx, zidx}; -} - -template -__device__ static size_t get_batch(const size_t i, const input_val_3D& length) -{ - auto xlen = length.val1; - auto ylen = length.val2; - auto zlen = length.val3; - - auto widx = i / xlen / ylen / zlen; - - return widx; -} - -template -__global__ static void __launch_bounds__(DATA_GEN_THREADS) - generate_float_interleaved_data_kernel(const T1 whole_length, - const T1 zero_length, - size_t idist, - size_t isize, - const T1 istride, - std::complex* data) -{ - auto const i = threadIdx.x + blockIdx.x * blockDim.x; - if(i < isize) - { - auto i_length = get_length(i, whole_length); - auto i_batch = get_batch(i, whole_length); - auto i_base = i_batch * idist; - - auto seed = compute_index(zero_length, istride, i_base); - auto idx = compute_index(i_length, istride, i_base); - - rocrand_state_philox4x32_10 gen_state; - rocrand_init(seed, idx, 0, &gen_state); - - auto item = rocrand_uniform2(&gen_state); - - data[idx] = std::complex(item.x, item.y); - } -} - -template 
-__global__ static void __launch_bounds__(DATA_GEN_THREADS) - generate_double_interleaved_data_kernel(const T1 whole_length, - const T1 zero_length, - size_t idist, - size_t isize, - const T1 istride, - std::complex* data) -{ - auto const i = threadIdx.x + blockIdx.x * blockDim.x; - if(i < isize) - { - auto i_length = get_length(i, whole_length); - auto i_batch = get_batch(i, whole_length); - auto i_base = i_batch * idist; - - auto seed = compute_index(zero_length, istride, i_base); - auto idx = compute_index(i_length, istride, i_base); - - rocrand_state_philox4x32_10 gen_state; - rocrand_init(seed, idx, 0, &gen_state); - - auto item = rocrand_uniform_double2(&gen_state); - - data[idx] = std::complex(item.x, item.y); - } -} - -template -__global__ static void __launch_bounds__(DATA_GEN_THREADS) - generate_float_planar_data_kernel(const T1 whole_length, - const T1 zero_length, - size_t idist, - size_t isize, - const T1 istride, - float* real_data, - float* imag_data) -{ - auto const i = threadIdx.x + blockIdx.x * blockDim.x; - if(i < isize) - { - auto i_length = get_length(i, whole_length); - auto i_batch = get_batch(i, whole_length); - auto i_base = i_batch * idist; - - auto seed = compute_index(zero_length, istride, i_base); - auto idx = compute_index(i_length, istride, i_base); - - rocrand_state_philox4x32_10 gen_state; - rocrand_init(seed, idx, 0, &gen_state); - - auto item = rocrand_uniform2(&gen_state); - - real_data[idx] = item.x; - imag_data[idx] = item.y; - } -} - -template -__global__ static void __launch_bounds__(DATA_GEN_THREADS) - generate_double_planar_data_kernel(const T1 whole_length, - const T1 zero_length, - size_t idist, - size_t isize, - const T1 istride, - double* real_data, - double* imag_data) -{ - auto const i = threadIdx.x + blockIdx.x * blockDim.x; - if(i < isize) - { - auto i_length = get_length(i, whole_length); - auto i_batch = get_batch(i, whole_length); - auto i_base = i_batch * idist; - - auto seed = compute_index(zero_length, 
istride, i_base); - auto idx = compute_index(i_length, istride, i_base); - - rocrand_state_philox4x32_10 gen_state; - rocrand_init(seed, idx, 0, &gen_state); - - auto item = rocrand_uniform_double2(&gen_state); - - real_data[idx] = item.x; - imag_data[idx] = item.y; - } -} - -template -__global__ static void __launch_bounds__(DATA_GEN_THREADS) - generate_float_real_data_kernel(const T1 whole_length, - const T1 zero_length, - size_t idist, - size_t isize, - const T1 istride, - float* data) -{ - auto const i = threadIdx.x + blockIdx.x * blockDim.x; - if(i < isize) - { - auto i_length = get_length(i, whole_length); - auto i_batch = get_batch(i, whole_length); - auto i_base = i_batch * idist; - - auto seed = compute_index(zero_length, istride, i_base); - auto idx = compute_index(i_length, istride, i_base); - - rocrand_state_philox4x32_10 gen_state; - rocrand_init(seed, idx, 0, &gen_state); - - data[idx] = rocrand_uniform(&gen_state); - } -} - -template -__global__ static void __launch_bounds__(DATA_GEN_THREADS) - generate_double_real_data_kernel(const T1 whole_length, - const T1 zero_length, - size_t idist, - size_t isize, - const T1 istride, - double* data) -{ - auto const i = threadIdx.x + blockIdx.x * blockDim.x; - if(i < isize) - { - auto i_length = get_length(i, whole_length); - auto i_batch = get_batch(i, whole_length); - auto i_base = i_batch * idist; - - auto seed = compute_index(zero_length, istride, i_base); - auto idx = compute_index(i_length, istride, i_base); - - rocrand_state_philox4x32_10 gen_state; - rocrand_init(seed, idx, 0, &gen_state); - - data[idx] = rocrand_uniform_double(&gen_state); - } -} - -// For complex-to-real transforms, the input data must be Hermitiam-symmetric. -// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier -// space. For multi-dimensional data, this means that we only need to store a bit more -// than half of the complex values; the rest are redundant. 
However, there are still -// some restrictions: -// * the origin and Nyquist value(s) must be real-valued -// * some of the remaining values are still redundant, and you might get different results -// than you expect if the values don't agree. -// Below are some example kernels which impose Hermitian symmetry on a complex array -// of the given dimensions. - -// Kernels for imposing Hermitian symmetry on 1D -// complex (interleaved/planar) data on the GPU. - -template -__global__ static void __launch_bounds__(DATA_GEN_THREADS) - impose_hermitian_symmetry_interleaved_1(std::complex* x, - const size_t Nx, - const size_t xstride, - const size_t dist, - const size_t nbatch, - const bool Nxeven) -{ - auto idx = blockIdx.x * blockDim.x + threadIdx.x; - - if(idx < nbatch) - { - idx *= dist; - - // The DC mode must be real-valued. - x[idx].imag(0); - - if(Nxeven) - { - // Nyquist mode - auto pos = idx + (Nx / 2) * xstride; - x[pos].imag(0); - } - } -} - -template -__global__ static void __launch_bounds__(DATA_GEN_THREADS) - impose_hermitian_symmetry_planar_1(Tfloat* xreal, - Tfloat* ximag, - const size_t Nx, - const size_t xstride, - const size_t dist, - const size_t nbatch, - const bool Nxeven) -{ - auto idx = blockIdx.x * blockDim.x + threadIdx.x; - - if(idx < nbatch) - { - idx *= dist; - - // The DC mode must be real-valued. - ximag[idx] = 0; - - if(Nxeven) - { - // Nyquist mode - auto pos = idx + (Nx / 2) * xstride; - ximag[pos] = 0; - } - } -} - -// Kernels for imposing Hermitian symmetry on 2D -// complex (interleaved/planar) data on the GPU. 
- -template -__global__ static void __launch_bounds__(DATA_GEN_THREADS* DATA_GEN_THREADS) - impose_hermitian_symmetry_interleaved_2(std::complex* x, - const size_t Nx, - const size_t Ny, - const size_t xstride, - const size_t ystride, - const size_t dist, - const size_t nbatch, - const bool Nxeven, - const bool Nyeven) -{ - auto idx = blockIdx.y * blockDim.y + threadIdx.y; - const auto idy = blockIdx.x * blockDim.x + threadIdx.x; - - if(idy < (Ny / 2 + 1) && idx < nbatch) - { - idx *= dist; - - auto pos = idx + idy * ystride; - auto cpos = idx + ((Ny - idy) % Ny) * ystride; - - auto val = x[pos]; - - // DC mode: - if(idy == 0) - val.imag(0); - - // Axes need to be symmetrized: - if(idy > 0 && idy < (Ny + 1) / 2) - val = std::conj(val); - - // y-Nyquist - if(Nyeven && idy == Ny / 2) - val.imag(0); - - x[cpos] = val; - - if(Nxeven) - { - pos += (Nx / 2) * xstride; - cpos += (Nx / 2) * xstride; - - val = x[pos]; - - // DC mode: - if(idy == 0) - val.imag(0); - - // Axes need to be symmetrized: - if(idy > 0 && idy < (Ny + 1) / 2) - val = std::conj(val); - - // y-Nyquist - if(Nyeven && idy == Ny / 2) - val.imag(0); - - x[cpos] = val; - } - } -} - -template -__global__ static void __launch_bounds__(DATA_GEN_THREADS* DATA_GEN_THREADS) - impose_hermitian_symmetry_planar_2(Tfloat* xreal, - Tfloat* ximag, - const size_t Nx, - const size_t Ny, - const size_t xstride, - const size_t ystride, - const size_t dist, - const size_t nbatch, - const bool Nxeven, - const bool Nyeven) -{ - auto idx = blockIdx.y * blockDim.y + threadIdx.y; - const auto idy = blockIdx.x * blockDim.x + threadIdx.x; - - if(idy < (Ny / 2 + 1) && idx < nbatch) - { - idx *= dist; - - auto pos = idx + idy * ystride; - auto cpos = idx + ((Ny - idy) % Ny) * ystride; - - auto valreal = xreal[pos]; - auto valimag = ximag[pos]; - - // DC mode: - if(idy == 0) - valimag = 0; - - // Axes need to be symmetrized: - if(idy > 0 && idy < (Ny + 1) / 2) - valimag = -valimag; - - // y-Nyquist - if(Nyeven && idy == Ny / 2) - 
valimag = 0; - - xreal[cpos] = valreal; - ximag[cpos] = valimag; - - if(Nxeven) - { - pos += (Nx / 2) * xstride; - cpos += (Nx / 2) * xstride; - - valreal = xreal[pos]; - valimag = ximag[pos]; - - // DC mode: - if(idy == 0) - valimag = 0; - - // Axes need to be symmetrized: - if(idy > 0 && idy < (Ny + 1) / 2) - valimag = -valimag; - - // y-Nyquist - if(Nyeven && idy == Ny / 2) - valimag = 0; - - xreal[cpos] = valreal; - ximag[cpos] = valimag; - } - } -} - -// Kernels for imposing Hermitian symmetry on 3D -// complex (interleaved/planar) data on the GPU. - -template -__global__ static void __launch_bounds__(DATA_GEN_THREADS* DATA_GEN_THREADS* DATA_GEN_THREADS) - impose_hermitian_symmetry_interleaved_3(std::complex* x, - const size_t Nx, - const size_t Ny, - const size_t Nz, - const size_t xstride, - const size_t ystride, - const size_t zstride, - const size_t dist, - const size_t nbatch, - const bool Nxeven, - const bool Nyeven, - const bool Nzeven) -{ - const auto idy = blockIdx.x * blockDim.x + threadIdx.x; - const auto idz = blockIdx.y * blockDim.y + threadIdx.y; - auto idx = blockIdx.z * blockDim.z + threadIdx.z; - - if(idy < Ny && idz < Nz && idx < nbatch) - { - idx *= dist; - - auto pos = idx + idy * ystride + idz * zstride; - auto cpos = idx + ((Ny - idy) % Ny) * ystride + ((Nz - idz) % Nz) * zstride; - - // Origin - if(idy == 0 && idz == 0) - { - x[pos].imag(0); - } - - // y-Nyquist - if(Nyeven && idy == Ny / 2 && idz == 0) - { - x[pos].imag(0); - } - - // z-Nyquist - if(Nzeven && idz == Nz / 2 && idy == 0) - { - x[pos].imag(0); - } - - // yz-Nyquist - if(Nyeven && Nzeven && idy == Ny / 2 && idz == Nz / 2) - { - x[pos].imag(0); - } - - // z-axis - if(idy == 0 && idz > 0 && idz < (Nz + 1) / 2) - x[cpos] = std::conj(x[pos]); - - // y-Nyquist axis - if(Nyeven && idy == Ny / 2 && idz > 0 && idz < (Nz + 1) / 2) - x[cpos] = std::conj(x[pos]); - - // y-axis - if(idy > 0 && idy < (Ny + 1) / 2 && idz == 0) - x[cpos] = std::conj(x[pos]); - - // z-Nyquist axis - 
if(Nzeven && idz == Nz / 2 && idy > 0 && idy < (Ny + 1) / 2) - x[cpos] = std::conj(x[pos]); - - // yz plane - if(idy > 0 && idy < (Ny + 1) / 2 && idz > 0 && idz < Nz) - x[cpos] = std::conj(x[pos]); - - if(Nxeven) - { - pos += (Nx / 2) * xstride; - cpos += (Nx / 2) * xstride; - // Origin - if(idy == 0 && idz == 0) - x[pos].imag(0); - - // y-Nyquist - if(Nyeven && idy == Ny / 2 && idz == 0) - x[pos].imag(0); - - // z-Nyquist - if(Nzeven && idz == Nz / 2 && idy == 0) - x[pos].imag(0); - - // yz-Nyquist - if(Nyeven && Nzeven && idy == Ny / 2 && idz == Nz / 2) - x[pos].imag(0); - - // z-axis - if(idy == 0 && idz > 0 && idz < (Nz + 1) / 2) - x[cpos] = std::conj(x[pos]); - - // y-Nyquist axis - if(Nyeven && idy == Ny / 2 && idz > 0 && idz < (Nz + 1) / 2) - x[cpos] = std::conj(x[pos]); - - // y-axis - if(idy > 0 && idy < (Ny + 1) / 2 && idz == 0) - x[cpos] = std::conj(x[pos]); - - // z-Nyquist axis - if(Nzeven && idz == Nz / 2 && idy > 0 && idy < (Ny + 1) / 2) - x[cpos] = std::conj(x[pos]); - - // yz plane - if(idy > 0 && idy < (Ny + 1) / 2 && idz > 0 && idz < Nz) - x[cpos] = std::conj(x[pos]); - } - } -} - -template -__global__ static void __launch_bounds__(DATA_GEN_THREADS* DATA_GEN_THREADS* DATA_GEN_THREADS) - impose_hermitian_symmetry_planar_3(Tfloat* xreal, - Tfloat* ximag, - const size_t Nx, - const size_t Ny, - const size_t Nz, - const size_t xstride, - const size_t ystride, - const size_t zstride, - const size_t dist, - const size_t nbatch, - const bool Nxeven, - const bool Nyeven, - const bool Nzeven) -{ - const auto idy = blockIdx.x * blockDim.x + threadIdx.x; - const auto idz = blockIdx.y * blockDim.y + threadIdx.y; - auto idx = blockIdx.z * blockDim.z + threadIdx.z; - - if(idy < Ny && idz < Nz && idx < nbatch) - { - idx *= dist; - - auto pos = idx + idy * ystride + idz * zstride; - auto cpos = idx + ((Ny - idy) % Ny) * ystride + ((Nz - idz) % Nz) * zstride; - - // Origin - if(idy == 0 && idz == 0) - { - ximag[pos] = 0; - } - - // y-Nyquist - if(Nyeven && idy == 
Ny / 2 && idz == 0) - { - ximag[pos] = 0; - } - - // z-Nyquist - if(Nzeven && idz == Nz / 2 && idy == 0) - { - ximag[pos] = 0; - } - - // yz-Nyquist - if(Nyeven && Nzeven && idy == Ny / 2 && idz == Nz / 2) - { - ximag[pos] = 0; - } - - // z-axis - if(idy == 0 && idz > 0 && idz < (Nz + 1) / 2) - { - xreal[cpos] = xreal[pos]; - ximag[cpos] = -ximag[pos]; - } - - // y-Nyquist axis - if(Nyeven && idy == Ny / 2 && idz > 0 && idz < (Nz + 1) / 2) - { - xreal[cpos] = xreal[pos]; - ximag[cpos] = -ximag[pos]; - } - - // y-axis - if(idy > 0 && idy < (Ny + 1) / 2 && idz == 0) - { - xreal[cpos] = xreal[pos]; - ximag[cpos] = -ximag[pos]; - } - - // z-Nyquist axis - if(Nzeven && idz == Nz / 2 && idy > 0 && idy < (Ny + 1) / 2) - { - xreal[cpos] = xreal[pos]; - ximag[cpos] = -ximag[pos]; - } - - // yz plane - if(idy > 0 && idy < (Ny + 1) / 2 && idz > 0 && idz < Nz) - { - xreal[cpos] = xreal[pos]; - ximag[cpos] = -ximag[pos]; - } - - if(Nxeven) - { - pos += (Nx / 2) * xstride; - cpos += (Nx / 2) * xstride; - // Origin - if(idy == 0 && idz == 0) - ximag[pos] = 0; - - // y-Nyquist - if(Nyeven && idy == Ny / 2 && idz == 0) - ximag[pos] = 0; - - // z-Nyquist - if(Nzeven && idz == Nz / 2 && idy == 0) - ximag[pos] = 0; - - // yz-Nyquist - if(Nyeven && Nzeven && idy == Ny / 2 && idz == Nz / 2) - ximag[pos] = 0; - - // z-axis - if(idy == 0 && idz > 0 && idz < (Nz + 1) / 2) - { - xreal[cpos] = xreal[pos]; - ximag[cpos] = -ximag[pos]; - } - - // y-Nyquist axis - if(Nyeven && idy == Ny / 2 && idz > 0 && idz < (Nz + 1) / 2) - { - xreal[cpos] = xreal[pos]; - ximag[cpos] = -ximag[pos]; - } - - // y-axis - if(idy > 0 && idy < (Ny + 1) / 2 && idz == 0) - { - xreal[cpos] = xreal[pos]; - ximag[cpos] = -ximag[pos]; - } - - // z-Nyquist axis - if(Nzeven && idz == Nz / 2 && idy > 0 && idy < (Ny + 1) / 2) - { - xreal[cpos] = xreal[pos]; - ximag[cpos] = -ximag[pos]; - } - - // yz plane - if(idy > 0 && idy < (Ny + 1) / 2 && idz > 0 && idz < Nz) - { - xreal[cpos] = xreal[pos]; - ximag[cpos] = -ximag[pos]; - 
} - } - } -} - -template -inline void generate_interleaved_data(const Tint& whole_length, - const size_t idist, - const size_t isize, - const Tint& istride, - std::complex* input_data) -{ - auto blockSize = DATA_GEN_THREADS; - auto numBlocks_setup = DivRoundingUp(isize, blockSize); - - auto input_length = get_input_val(whole_length); - auto zero_length = make_zero_length(input_length); - auto input_stride = get_input_val(istride); - - hipLaunchKernelGGL(generate_float_interleaved_data_kernel, - dim3(numBlocks_setup), - dim3(blockSize), - 0, // sharedMemBytes - 0, // stream - input_length, - zero_length, - idist, - isize, - input_stride, - input_data); -} - -template -inline void generate_interleaved_data(const Tint& whole_length, - const size_t idist, - const size_t isize, - const Tint& istride, - std::complex* input_data) -{ - auto blockSize = DATA_GEN_THREADS; - auto numBlocks_setup = DivRoundingUp(isize, blockSize); - - auto input_length = get_input_val(whole_length); - auto zero_length = make_zero_length(input_length); - auto input_stride = get_input_val(istride); - - hipLaunchKernelGGL(generate_double_interleaved_data_kernel, - dim3(numBlocks_setup), - dim3(blockSize), - 0, // sharedMemBytes - 0, // stream - input_length, - zero_length, - idist, - isize, - input_stride, - input_data); -} - -template -inline void generate_planar_data(const Tint& whole_length, - const size_t idist, - const size_t isize, - const Tint& istride, - float* real_data, - float* imag_data) -{ - auto blockSize = DATA_GEN_THREADS; - auto numBlocks_setup = DivRoundingUp(isize, blockSize); - - auto input_length = get_input_val(whole_length); - auto zero_length = make_zero_length(input_length); - auto input_stride = get_input_val(istride); - - hipLaunchKernelGGL(generate_float_planar_data_kernel, - dim3(numBlocks_setup), - dim3(blockSize), - 0, // sharedMemBytes - 0, // stream - input_length, - zero_length, - idist, - isize, - input_stride, - real_data, - imag_data); -} - -template -inline 
void generate_planar_data(const Tint& whole_length, - const size_t idist, - const size_t isize, - const Tint& istride, - double* real_data, - double* imag_data) -{ - auto blockSize = DATA_GEN_THREADS; - auto numBlocks_setup = DivRoundingUp(isize, blockSize); - - auto input_length = get_input_val(whole_length); - auto zero_length = make_zero_length(input_length); - auto input_stride = get_input_val(istride); - - hipLaunchKernelGGL(generate_double_planar_data_kernel, - dim3(numBlocks_setup), - dim3(blockSize), - 0, // sharedMemBytes - 0, // stream - input_length, - zero_length, - idist, - isize, - input_stride, - real_data, - imag_data); -} - -template -inline void generate_real_data(const Tint& whole_length, - const size_t idist, - const size_t isize, - const Tint& istride, - float* input_data) -{ - auto blockSize = DATA_GEN_THREADS; - auto numBlocks_setup = DivRoundingUp(isize, blockSize); - - auto input_length = get_input_val(whole_length); - auto zero_length = make_zero_length(input_length); - auto input_stride = get_input_val(istride); - - hipLaunchKernelGGL(generate_float_real_data_kernel, - dim3(numBlocks_setup), - dim3(blockSize), - 0, // sharedMemBytes - 0, // stream - input_length, - zero_length, - idist, - isize, - input_stride, - input_data); -} - -template -inline void generate_real_data(const Tint& whole_length, - const size_t idist, - const size_t isize, - const Tint& istride, - double* input_data) -{ - auto blockSize = DATA_GEN_THREADS; - auto numBlocks_setup = DivRoundingUp(isize, blockSize); - - auto input_length = get_input_val(whole_length); - auto zero_length = make_zero_length(input_length); - auto input_stride = get_input_val(istride); - - hipLaunchKernelGGL(generate_double_real_data_kernel, - dim3(numBlocks_setup), - dim3(blockSize), - 0, // sharedMemBytes - 0, // stream - input_length, - zero_length, - idist, - isize, - input_stride, - input_data); -} - -template -void impose_hermitian_symmetry_interleaved(const std::vector& length, - const 
std::vector& ilength, - const std::vector& stride, - size_t dist, - size_t batch, - std::complex* input_data) -{ - auto blockSize = DATA_GEN_THREADS; - - switch(length.size()) - { - case 1: - { - const auto gridDim = dim3(blockSize); - const auto blockDim = dim3(DivRoundingUp(batch, blockSize)); - - hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1, - gridDim, - blockDim, - 0, - 0, - input_data, - length[0], - stride[0], - dist, - batch, - length[0] % 2 == 0); - - break; - } - case 2: - { - const auto gridDim = dim3(blockSize, blockSize); - const auto blockDim = dim3(DivRoundingUp(ilength[0], blockSize), - DivRoundingUp(batch, blockSize)); - - hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2, - gridDim, - blockDim, - 0, - 0, - input_data, - length[1], - length[0], - stride[1], - stride[0], - dist, - batch, - length[1] % 2 == 0, - length[0] % 2 == 0); - - break; - } - case 3: - { - const auto gridDim = dim3(blockSize, blockSize, blockSize); - const auto blockDim = dim3(DivRoundingUp(ilength[0], blockSize), - DivRoundingUp(ilength[1], blockSize), - DivRoundingUp(batch, blockSize)); - - hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3, - gridDim, - blockDim, - 0, - 0, - input_data, - length[2], - length[0], - length[1], - stride[2], - stride[0], - stride[1], - dist, - batch, - length[2] % 2 == 0, - length[0] % 2 == 0, - length[1] % 2 == 0); - break; - } - default: - throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); - } -} - -template -void impose_hermitian_symmetry_planar(const std::vector& length, - const std::vector& ilength, - const std::vector& stride, - size_t dist, - size_t batch, - Tfloat* input_data_real, - Tfloat* input_data_imag) -{ - auto blockSize = DATA_GEN_THREADS; - - switch(length.size()) - { - case 1: - { - const auto gridDim = dim3(blockSize); - const auto blockDim = dim3(DivRoundingUp(batch, blockSize)); - - hipLaunchKernelGGL(impose_hermitian_symmetry_planar_1, - gridDim, - blockDim, - 0, - 0, 
- input_data_real, - input_data_imag, - length[0], - stride[0], - dist, - batch, - length[0] % 2 == 0); - - break; - } - case 2: - { - const auto gridDim = dim3(blockSize, blockSize); - const auto blockDim = dim3(DivRoundingUp(ilength[0], blockSize), - DivRoundingUp(batch, blockSize)); - - hipLaunchKernelGGL(impose_hermitian_symmetry_planar_2, - gridDim, - blockDim, - 0, - 0, - input_data_real, - input_data_imag, - length[1], - length[0], - stride[1], - stride[0], - dist, - batch, - length[1] % 2 == 0, - length[0] % 2 == 0); - - break; - } - case 3: - { - const auto gridDim = dim3(blockSize, blockSize, blockSize); - const auto blockDim = dim3(DivRoundingUp(ilength[0], blockSize), - DivRoundingUp(ilength[1], blockSize), - DivRoundingUp(batch, blockSize)); - - hipLaunchKernelGGL(impose_hermitian_symmetry_planar_3, - gridDim, - blockDim, - 0, - 0, - input_data_real, - input_data_imag, - length[2], - length[0], - length[1], - stride[2], - stride[0], - stride[1], - dist, - batch, - length[2] % 2 == 0, - length[0] % 2 == 0, - length[1] % 2 == 0); - break; - } - default: - throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); - } -} - -#endif // DATA_GEN_H \ No newline at end of file diff -Nru rocfft-5.5.0/clients/fft_params.h rocfft-5.7.1/clients/fft_params.h --- rocfft-5.5.0/clients/fft_params.h 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/fft_params.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,2730 +0,0 @@ -// Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -#ifndef FFT_PARAMS_H -#define FFT_PARAMS_H - -#include "data_gen.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../shared/array_validator.h" -#include "../shared/printbuffer.h" -#include "../shared/ptrdiff.h" - -enum fft_status -{ - fft_status_success, - fft_status_failure, - fft_status_invalid_arg_value, - fft_status_invalid_dimensions, - fft_status_invalid_array_type, - fft_status_invalid_strides, - fft_status_invalid_distance, - fft_status_invalid_offset, - fft_status_invalid_work_buffer, -}; - -enum fft_transform_type -{ - fft_transform_type_complex_forward, - fft_transform_type_complex_inverse, - fft_transform_type_real_forward, - fft_transform_type_real_inverse, -}; - -enum fft_precision -{ - fft_precision_single, - fft_precision_double, -}; - -enum fft_array_type -{ - fft_array_type_complex_interleaved, - fft_array_type_complex_planar, - fft_array_type_real, - fft_array_type_hermitian_interleaved, - fft_array_type_hermitian_planar, - fft_array_type_unset, -}; - -enum fft_result_placement -{ - fft_placement_inplace, - fft_placement_notinplace, -}; - -// Determine the size of the data type given the precision and type. -template -inline Tsize var_size(const fft_precision precision, const fft_array_type type) -{ - size_t var_size = 0; - switch(precision) - { - case fft_precision_single: - var_size = sizeof(float); - break; - case fft_precision_double: - var_size = sizeof(double); - break; - } - switch(type) - { - case fft_array_type_complex_interleaved: - case fft_array_type_hermitian_interleaved: - var_size *= 2; - break; - default: - break; - } - return var_size; -} - -// Container class for test parameters. -class fft_params -{ -public: - // All parameters are row-major. 
- std::vector length; - std::vector istride; - std::vector ostride; - size_t nbatch = 1; - fft_precision precision = fft_precision_double; - fft_transform_type transform_type = fft_transform_type_complex_forward; - fft_result_placement placement = fft_placement_inplace; - size_t idist = 0; - size_t odist = 0; - fft_array_type itype = fft_array_type_unset; - fft_array_type otype = fft_array_type_unset; - std::vector ioffset = {0, 0}; - std::vector ooffset = {0, 0}; - - std::vector isize; - std::vector osize; - - size_t workbuffersize = 0; - - // run testing load/store callbacks - bool run_callbacks = false; - static constexpr double load_cb_scalar = 0.457813941; - static constexpr double store_cb_scalar = 0.391504938; - - // Check that data outside of output strides is not overwritten. - // This is only set explicitly on some tests where there's space - // between dimensions, but the dimensions are still in-order. - // We're not trying to generically find holes in arbitrary data - // layouts. - // - // NOTE: this flag is not included in tokens, since it doesn't - // affect how the FFT library behaves. - bool check_output_strides = false; - - // scaling factor - we do a pointwise multiplication of outputs by - // this factor - double scale_factor = 1.0; - - fft_params(){}; - virtual ~fft_params(){}; - - // Given an array type, return the name as a string. - static std::string array_type_name(const fft_array_type type, bool verbose = true) - { - switch(type) - { - case fft_array_type_complex_interleaved: - return verbose ? "fft_array_type_complex_interleaved" : "CI"; - case fft_array_type_complex_planar: - return verbose ? "fft_array_type_complex_planar" : "CP"; - case fft_array_type_real: - return verbose ? "fft_array_type_real" : "R"; - case fft_array_type_hermitian_interleaved: - return verbose ? "fft_array_type_hermitian_interleaved" : "HI"; - case fft_array_type_hermitian_planar: - return verbose ? 
"fft_array_type_hermitian_planar" : "HP"; - case fft_array_type_unset: - return verbose ? "fft_array_type_unset" : "UN"; - } - return ""; - } - - std::string transform_type_name() const - { - switch(transform_type) - { - case fft_transform_type_complex_forward: - return "fft_transform_type_complex_forward"; - case fft_transform_type_complex_inverse: - return "fft_transform_type_complex_inverse"; - case fft_transform_type_real_forward: - return "fft_transform_type_real_forward"; - case fft_transform_type_real_inverse: - return "fft_transform_type_real_inverse"; - default: - throw std::runtime_error("Invalid transform type"); - } - } - - // Convert to string for output. - std::string str(const std::string& separator = ", ") const - { - std::stringstream ss; - ss << "length:"; - for(auto i : length) - ss << " " << i; - ss << separator; - ss << "istride:"; - for(auto i : istride) - ss << " " << i; - ss << separator; - ss << "idist: " << idist << separator; - - ss << "ostride:"; - for(auto i : ostride) - ss << " " << i; - ss << separator; - ss << "odist: " << odist << separator; - - ss << "batch: " << nbatch << separator; - ss << "isize:"; - for(auto i : isize) - ss << " " << i; - ss << separator; - ss << "osize:"; - for(auto i : osize) - ss << " " << i; - ss << separator; - - ss << "ioffset:"; - for(auto i : ioffset) - ss << " " << i; - ss << separator; - ss << "ooffset:"; - for(auto i : ooffset) - ss << " " << i; - ss << separator; - - if(placement == fft_placement_inplace) - ss << "in-place"; - else - ss << "out-of-place"; - ss << separator; - ss << "transform_type: " << transform_type_name() << separator; - ss << array_type_name(itype) << " -> " << array_type_name(otype) << separator; - if(precision == fft_precision_single) - ss << "single-precision"; - else - ss << "double-precision"; - ss << separator; - - ss << "ilength:"; - for(const auto i : ilength()) - ss << " " << i; - ss << separator; - ss << "olength:"; - for(const auto i : olength()) - ss << " " << i; - 
ss << separator; - - ss << "ibuffer_size:"; - for(const auto i : ibuffer_sizes()) - ss << " " << i; - ss << separator; - - ss << "obuffer_size:"; - for(const auto i : obuffer_sizes()) - ss << " " << i; - ss << separator; - - if(scale_factor != 1.0) - ss << "scale factor: " << scale_factor << separator; - - return ss.str(); - } - - // Produce a stringified token of the test fft params. - std::string token() const - { - std::string ret; - - switch(transform_type) - { - case fft_transform_type_complex_forward: - ret += "complex_forward_"; - break; - case fft_transform_type_complex_inverse: - ret += "complex_inverse_"; - break; - case fft_transform_type_real_forward: - ret += "real_forward_"; - break; - case fft_transform_type_real_inverse: - ret += "real_inverse_"; - break; - } - - ret += "len_"; - - for(auto n : length) - { - ret += std::to_string(n); - ret += "_"; - } - switch(precision) - { - case fft_precision_single: - ret += "single_"; - break; - case fft_precision_double: - ret += "double_"; - break; - } - - switch(placement) - { - case fft_placement_inplace: - ret += "ip_"; - break; - case fft_placement_notinplace: - ret += "op_"; - break; - } - - ret += "batch_"; - ret += std::to_string(nbatch); - - auto append_array_info = [&ret](const std::vector& stride, fft_array_type type) { - for(auto s : stride) - { - ret += std::to_string(s); - ret += "_"; - } - - switch(type) - { - case fft_array_type_complex_interleaved: - ret += "CI"; - break; - case fft_array_type_complex_planar: - ret += "CP"; - break; - case fft_array_type_real: - ret += "R"; - break; - case fft_array_type_hermitian_interleaved: - ret += "HI"; - break; - case fft_array_type_hermitian_planar: - ret += "HP"; - break; - default: - ret += "UN"; - break; - } - }; - - ret += "_istride_"; - append_array_info(istride, itype); - - ret += "_ostride_"; - append_array_info(ostride, otype); - - ret += "_idist_"; - ret += std::to_string(idist); - ret += "_odist_"; - ret += std::to_string(odist); - - ret += 
"_ioffset"; - for(auto n : ioffset) - { - ret += "_"; - ret += std::to_string(n); - } - - ret += "_ooffset"; - for(auto n : ooffset) - { - ret += "_"; - ret += std::to_string(n); - } - - if(run_callbacks) - ret += "_CB"; - - if(scale_factor != 1.0) - ret += "_scale"; - - return ret; - } - - // Set all params from a stringified token. - void from_token(std::string token) - { - std::vector vals; - - std::string delimiter = "_"; - { - size_t pos = 0; - while((pos = token.find(delimiter)) != std::string::npos) - { - auto val = token.substr(0, pos); - vals.push_back(val); - token.erase(0, pos + delimiter.length()); - } - vals.push_back(token); - } - - auto vector_parser - = [](const std::vector& vals, const std::string token, size_t& pos) { - if(vals[pos++] != token) - throw std::runtime_error("Unable to parse token"); - std::vector vec; - - while(pos < vals.size()) - { - if(std::all_of(vals[pos].begin(), vals[pos].end(), ::isdigit)) - { - vec.push_back(std::stoull(vals[pos++])); - } - else - { - break; - } - } - return vec; - }; - - auto type_parser = [](const std::string& val) { - if(val == "CI") - return fft_array_type_complex_interleaved; - else if(val == "CP") - return fft_array_type_complex_planar; - else if(val == "R") - return fft_array_type_real; - else if(val == "HI") - return fft_array_type_hermitian_interleaved; - else if(val == "HP") - return fft_array_type_hermitian_planar; - return fft_array_type_unset; - }; - - size_t pos = 0; - - bool complex = vals[pos++] == "complex"; - bool forward = vals[pos++] == "forward"; - - if(complex && forward) - transform_type = fft_transform_type_complex_forward; - if(complex && !forward) - transform_type = fft_transform_type_complex_inverse; - if(!complex && forward) - transform_type = fft_transform_type_real_forward; - if(!complex && !forward) - transform_type = fft_transform_type_real_inverse; - - length = vector_parser(vals, "len", pos); - - if(vals[pos] == "single") - precision = fft_precision_single; - else 
if(vals[pos] == "double") - precision = fft_precision_double; - pos++; - - placement = (vals[pos++] == "ip") ? fft_placement_inplace : fft_placement_notinplace; - - if(vals[pos++] != "batch") - throw std::runtime_error("Unable to parse token"); - nbatch = std::stoull(vals[pos++]); - - istride = vector_parser(vals, "istride", pos); - - itype = type_parser(vals[pos]); - pos++; - - ostride = vector_parser(vals, "ostride", pos); - - otype = type_parser(vals[pos]); - pos++; - - if(vals[pos++] != "idist") - throw std::runtime_error("Unable to parse token"); - idist = std::stoull(vals[pos++]); - - if(vals[pos++] != "odist") - throw std::runtime_error("Unable to parse token"); - odist = std::stoull(vals[pos++]); - - ioffset = vector_parser(vals, "ioffset", pos); - - ooffset = vector_parser(vals, "ooffset", pos); - - if(pos < vals.size() && vals[pos] == "CB") - { - run_callbacks = true; - ++pos; - } - - if(pos < vals.size() && vals[pos] == "scale") - { - // just pick some factor that's not zero or one - scale_factor = 0.1239; - ++pos; - } - } - - // Stream output operator (for gtest, etc). - friend std::ostream& operator<<(std::ostream& stream, const fft_params& params) - { - stream << params.str(); - return stream; - } - - // Dimension of the transform. 
- size_t dim() const - { - return length.size(); - } - - virtual std::vector ilength() const - { - auto ilength = length; - if(transform_type == fft_transform_type_real_inverse) - ilength[dim() - 1] = ilength[dim() - 1] / 2 + 1; - return ilength; - } - - virtual std::vector olength() const - { - auto olength = length; - if(transform_type == fft_transform_type_real_forward) - olength[dim() - 1] = olength[dim() - 1] / 2 + 1; - return olength; - } - - static size_t nbuffer(const fft_array_type type) - { - switch(type) - { - case fft_array_type_real: - case fft_array_type_complex_interleaved: - case fft_array_type_hermitian_interleaved: - return 1; - case fft_array_type_complex_planar: - case fft_array_type_hermitian_planar: - return 2; - case fft_array_type_unset: - return 0; - } - return 0; - } - - // Number of input buffers - size_t nibuffer() const - { - return nbuffer(itype); - } - - // Number of output buffers - size_t nobuffer() const - { - return nbuffer(otype); - } - - void set_iotypes() - { - if(itype == fft_array_type_unset) - { - switch(transform_type) - { - case fft_transform_type_complex_forward: - case fft_transform_type_complex_inverse: - itype = fft_array_type_complex_interleaved; - break; - case fft_transform_type_real_forward: - itype = fft_array_type_real; - break; - case fft_transform_type_real_inverse: - itype = fft_array_type_hermitian_interleaved; - break; - default: - throw std::runtime_error("Invalid transform type"); - } - } - if(otype == fft_array_type_unset) - { - switch(transform_type) - { - case fft_transform_type_complex_forward: - case fft_transform_type_complex_inverse: - otype = fft_array_type_complex_interleaved; - break; - case fft_transform_type_real_forward: - otype = fft_array_type_hermitian_interleaved; - break; - case fft_transform_type_real_inverse: - otype = fft_array_type_real; - break; - default: - throw std::runtime_error("Invalid transform type"); - } - } - } - - // Check that the input and output types are consistent. 
- bool check_iotypes() const - { - switch(itype) - { - case fft_array_type_complex_interleaved: - case fft_array_type_complex_planar: - case fft_array_type_hermitian_interleaved: - case fft_array_type_hermitian_planar: - case fft_array_type_real: - break; - default: - throw std::runtime_error("Invalid Input array type format"); - } - - switch(otype) - { - case fft_array_type_complex_interleaved: - case fft_array_type_complex_planar: - case fft_array_type_hermitian_interleaved: - case fft_array_type_hermitian_planar: - case fft_array_type_real: - break; - default: - throw std::runtime_error("Invalid Input array type format"); - } - - // Check that format choices are supported - if(transform_type != fft_transform_type_real_forward - && transform_type != fft_transform_type_real_inverse) - { - if(placement == fft_placement_inplace && itype != otype) - { - throw std::runtime_error( - "In-place transforms must have identical input and output types"); - } - } - - bool okformat = true; - switch(itype) - { - case fft_array_type_complex_interleaved: - case fft_array_type_complex_planar: - okformat = (otype == fft_array_type_complex_interleaved - || otype == fft_array_type_complex_planar); - break; - case fft_array_type_hermitian_interleaved: - case fft_array_type_hermitian_planar: - okformat = otype == fft_array_type_real; - break; - case fft_array_type_real: - okformat = (otype == fft_array_type_hermitian_interleaved - || otype == fft_array_type_hermitian_planar); - break; - default: - throw std::runtime_error("Invalid Input array type format"); - } - - return okformat; - } - - // Given a length vector, set the rest of the strides. - // The optional argument stride0 sets the stride for the contiguous dimension. - // The optional rcpadding argument sets the stride correctly for in-place - // multi-dimensional real/complex transforms. - // Format is row-major. 
- template - std::vector compute_stride(const std::vector& length, - const std::vector& stride0 = std::vector(), - const bool rcpadding = false) const - { - std::vector stride(dim()); - - size_t dimoffset = 0; - - if(stride0.size() == 0) - { - // Set the contiguous stride: - stride[dim() - 1] = 1; - dimoffset = 1; - } - else - { - // Copy the input values to the end of the stride array: - for(size_t i = 0; i < stride0.size(); ++i) - { - stride[dim() - stride0.size() + i] = stride0[i]; - } - } - - if(stride0.size() < dim()) - { - // Compute any remaining values via recursion. - for(size_t i = dim() - dimoffset - stride0.size(); i-- > 0;) - { - auto lengthip1 = length[i + 1]; - if(rcpadding && i == dim() - 2) - { - lengthip1 = 2 * (lengthip1 / 2 + 1); - } - stride[i] = stride[i + 1] * lengthip1; - } - } - - return stride; - } - - void compute_istride() - { - istride = compute_stride(ilength(), - istride, - placement == fft_placement_inplace - && transform_type == fft_transform_type_real_forward); - } - - void compute_ostride() - { - ostride = compute_stride(olength(), - ostride, - placement == fft_placement_inplace - && transform_type == fft_transform_type_real_inverse); - } - - virtual void compute_isize() - { - auto il = ilength(); - size_t val = compute_ptrdiff(il, istride, nbatch, idist); - isize.resize(nibuffer()); - for(unsigned int i = 0; i < isize.size(); ++i) - { - isize[i] = val + ioffset[i]; - } - } - - virtual void compute_osize() - { - auto ol = olength(); - size_t val = compute_ptrdiff(ol, ostride, nbatch, odist); - osize.resize(nobuffer()); - for(unsigned int i = 0; i < osize.size(); ++i) - { - osize[i] = val + ooffset[i]; - } - } - - std::vector ibuffer_sizes() const - { - std::vector ibuffer_sizes; - - // In-place real-to-complex transforms need to have enough space in the input buffer to - // accomadate the output, which is slightly larger. 
- if(placement == fft_placement_inplace && transform_type == fft_transform_type_real_forward) - { - return obuffer_sizes(); - } - - if(isize.empty()) - return ibuffer_sizes; - - switch(itype) - { - case fft_array_type_complex_planar: - case fft_array_type_hermitian_planar: - ibuffer_sizes.resize(2); - break; - default: - ibuffer_sizes.resize(1); - } - for(unsigned i = 0; i < ibuffer_sizes.size(); i++) - { - ibuffer_sizes[i] = isize[i] * var_size(precision, itype); - } - return ibuffer_sizes; - } - - virtual std::vector obuffer_sizes() const - { - std::vector obuffer_sizes; - - if(osize.empty()) - return obuffer_sizes; - - switch(otype) - { - case fft_array_type_complex_planar: - case fft_array_type_hermitian_planar: - obuffer_sizes.resize(2); - break; - default: - obuffer_sizes.resize(1); - } - for(unsigned i = 0; i < obuffer_sizes.size(); i++) - { - obuffer_sizes[i] = osize[i] * var_size(precision, otype); - } - return obuffer_sizes; - } - - // Compute the idist for a given transform based on the placeness, transform type, and data - // layout. - void set_idist() - { - if(idist != 0) - return; - - // In-place 1D transforms need extra dist. - if(transform_type == fft_transform_type_real_forward && dim() == 1 - && placement == fft_placement_inplace) - { - idist = 2 * (length[0] / 2 + 1) * istride[0]; - return; - } - - if(transform_type == fft_transform_type_real_inverse && dim() == 1) - { - idist = (length[0] / 2 + 1) * istride[0]; - return; - } - - idist = (transform_type == fft_transform_type_real_inverse) - ? (length[dim() - 1] / 2 + 1) * istride[dim() - 1] - : length[dim() - 1] * istride[dim() - 1]; - for(unsigned int i = 0; i < dim() - 1; ++i) - { - idist = std::max(length[i] * istride[i], idist); - } - } - - // Compute the odist for a given transform based on the placeness, transform type, and data - // layout. Row-major. - void set_odist() - { - if(odist != 0) - return; - - // In-place 1D transforms need extra dist. 
- if(transform_type == fft_transform_type_real_inverse && dim() == 1 - && placement == fft_placement_inplace) - { - odist = 2 * (length[0] / 2 + 1) * ostride[0]; - return; - } - - if(transform_type == fft_transform_type_real_forward && dim() == 1) - { - odist = (length[0] / 2 + 1) * ostride[0]; - return; - } - - odist = (transform_type == fft_transform_type_real_forward) - ? (length[dim() - 1] / 2 + 1) * ostride[dim() - 1] - : length[dim() - 1] * ostride[dim() - 1]; - for(unsigned int i = 0; i < dim() - 1; ++i) - { - odist = std::max(length[i] * ostride[i], odist); - } - } - - // Put the length, stride, batch, and dist into a single length/stride array and pass off to the - // validity checker. - bool valid_length_stride_batch_dist(const std::vector& l0, - const std::vector& s0, - const size_t n, - const size_t dist, - const int verbose = 0) const - { - if(l0.size() != s0.size()) - return false; - - // Length and stride vectors, including bathes: - std::vector l{}, s{}; - for(unsigned int i = 0; i < l0.size(); ++i) - { - if(l0[i] > 1) - { - if(s0[i] == 0) - return false; - l.push_back(l0[i]); - s.push_back(s0[i]); - } - } - if(n > 1) - { - if(dist == 0) - return false; - l.push_back(n); - s.push_back(dist); - } - - return array_valid(l, s, verbose); - } - - // Return true if the given GPU parameters would produce a valid transform. 
- bool valid(const int verbose) const - { - if(ioffset.size() < nibuffer() || ooffset.size() < nobuffer()) - return false; - - // Check that in-place transforms have the same input and output stride: - if(placement == fft_placement_inplace) - { - const auto stridesize = std::min(istride.size(), ostride.size()); - bool samestride = true; - for(unsigned int i = 0; i < stridesize; ++i) - { - if(istride[i] != ostride[i]) - samestride = false; - } - if((transform_type == fft_transform_type_complex_forward - || transform_type == fft_transform_type_complex_inverse) - && !samestride) - { - // In-place transforms require identical input and output strides. - if(verbose) - { - std::cout << "istride:"; - for(const auto& i : istride) - std::cout << " " << i; - std::cout << " ostride0:"; - for(const auto& i : ostride) - std::cout << " " << i; - std::cout << " differ; skipped for in-place transforms: skipping test" - << std::endl; - } - return false; - } - - if((transform_type == fft_transform_type_complex_forward - || transform_type == fft_transform_type_complex_inverse) - && (idist != odist)) - { - // In-place transforms require identical distance - if(verbose) - { - std::cout << "idist:" << idist << " odist:" << odist - << " differ; skipped for in-place transforms: skipping test" - << std::endl; - } - return false; - } - - if((transform_type == fft_transform_type_real_forward - || transform_type == fft_transform_type_real_inverse) - && (istride.back() != 1 || ostride.back() != 1)) - { - // In-place real/complex transforms require unit strides. 
- if(verbose) - { - std::cout - << "istride.back(): " << istride.back() - << " ostride.back(): " << ostride.back() - << " must be unitary for in-place real/complex transforms: skipping test" - << std::endl; - } - return false; - } - - if((itype == fft_array_type_complex_interleaved - && otype == fft_array_type_complex_planar) - || (itype == fft_array_type_complex_planar - && otype == fft_array_type_complex_interleaved)) - { - if(verbose) - { - std::cout << "In-place c2c transforms require identical io types; skipped.\n"; - } - return false; - } - - // Check offsets - switch(transform_type) - { - case fft_transform_type_complex_forward: - case fft_transform_type_complex_inverse: - for(unsigned int i = 0; i < nibuffer(); ++i) - { - if(ioffset[i] != ooffset[i]) - return false; - } - break; - case fft_transform_type_real_forward: - if(ioffset[0] != 2 * ooffset[0]) - return false; - break; - case fft_transform_type_real_inverse: - if(2 * ioffset[0] != ooffset[0]) - return false; - break; - } - } - - if(!check_iotypes()) - return false; - - // we can only check output strides on out-of-place - // transforms, since we need to initialize output to a known - // pattern - if(placement == fft_placement_inplace && check_output_strides) - return false; - - // Check input and output strides - if(valid_length_stride_batch_dist(ilength(), istride, nbatch, idist, verbose) != true) - { - if(verbose) - std::cout << "Invalid input data format.\n"; - return false; - } - if(!(ilength() == olength() && istride == ostride && idist == odist)) - { - // Only check if different - if(valid_length_stride_batch_dist(olength(), ostride, nbatch, odist, verbose) != true) - { - if(verbose) - std::cout << "Invalid output data format.\n"; - return false; - } - } - - // The parameters are valid. - return true; - } - - // Fill in any missing parameters. 
- void validate() - { - set_iotypes(); - compute_istride(); - compute_ostride(); - set_idist(); - set_odist(); - compute_isize(); - compute_osize(); - } - - // Column-major getters: - std::vector length_cm() const - { - auto length_cm = length; - std::reverse(std::begin(length_cm), std::end(length_cm)); - return length_cm; - } - std::vector ilength_cm() const - { - auto ilength_cm = ilength(); - std::reverse(std::begin(ilength_cm), std::end(ilength_cm)); - return ilength_cm; - } - std::vector olength_cm() const - { - auto olength_cm = olength(); - std::reverse(std::begin(olength_cm), std::end(olength_cm)); - return olength_cm; - } - std::vector istride_cm() const - { - auto istride_cm = istride; - std::reverse(std::begin(istride_cm), std::end(istride_cm)); - return istride_cm; - } - std::vector ostride_cm() const - { - auto ostride_cm = ostride; - std::reverse(std::begin(ostride_cm), std::end(ostride_cm)); - return ostride_cm; - } - - template - void print_ibuffer(const std::vector>& buf, - Tstream& stream = std::cout) const - { - switch(itype) - { - case fft_array_type_complex_interleaved: - case fft_array_type_hermitian_interleaved: - { - switch(precision) - { - case fft_precision_single: - { - buffer_printer> s; - s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); - break; - } - case fft_precision_double: - { - buffer_printer> s; - s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); - break; - } - } - break; - } - case fft_array_type_complex_planar: - case fft_array_type_hermitian_planar: - case fft_array_type_real: - { - switch(precision) - { - case fft_precision_single: - { - buffer_printer s; - s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); - break; - } - case fft_precision_double: - { - buffer_printer s; - s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); - break; - } - } - break; - } - default: - throw std::runtime_error("Invalid itype in print_ibuffer"); - } - } - - template - void 
print_obuffer(const std::vector>& buf, - Tstream& stream = std::cout) const - { - switch(otype) - { - case fft_array_type_complex_interleaved: - case fft_array_type_hermitian_interleaved: - { - switch(precision) - { - case fft_precision_single: - { - buffer_printer> s; - s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); - break; - } - case fft_precision_double: - buffer_printer> s; - s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); - break; - } - break; - } - case fft_array_type_complex_planar: - case fft_array_type_hermitian_planar: - case fft_array_type_real: - { - switch(precision) - { - case fft_precision_single: - { - buffer_printer s; - s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); - break; - } - case fft_precision_double: - { - buffer_printer s; - s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); - break; - } - } - break; - } - - default: - throw std::runtime_error("Invalid itype in print_obuffer"); - } - } - - template - void print_ibuffer_flat(const std::vector>& buf) const - { - switch(itype) - { - case fft_array_type_complex_interleaved: - case fft_array_type_hermitian_interleaved: - { - switch(precision) - { - case fft_precision_single: - { - buffer_printer> s; - s.print_buffer_flat(buf, osize, ooffset); - break; - } - case fft_precision_double: - buffer_printer> s; - s.print_buffer_flat(buf, osize, ooffset); - break; - } - break; - } - case fft_array_type_complex_planar: - case fft_array_type_hermitian_planar: - case fft_array_type_real: - { - switch(precision) - { - case fft_precision_single: - { - buffer_printer s; - s.print_buffer_flat(buf, osize, ooffset); - break; - } - case fft_precision_double: - { - buffer_printer s; - s.print_buffer_flat(buf, osize, ooffset); - break; - } - } - break; - default: - throw std::runtime_error("Invalid itype in print_ibuffer_flat"); - } - } - } - - template - void print_obuffer_flat(const std::vector>& buf) const - { - switch(otype) - { - case 
fft_array_type_complex_interleaved: - case fft_array_type_hermitian_interleaved: - { - switch(precision) - { - case fft_precision_single: - { - buffer_printer> s; - s.print_buffer_flat(buf, osize, ooffset); - break; - } - case fft_precision_double: - buffer_printer> s; - s.print_buffer_flat(buf, osize, ooffset); - break; - } - break; - } - case fft_array_type_complex_planar: - case fft_array_type_hermitian_planar: - case fft_array_type_real: - { - switch(precision) - { - case fft_precision_single: - { - buffer_printer s; - s.print_buffer_flat(buf, osize, ooffset); - break; - } - - case fft_precision_double: - { - buffer_printer s; - s.print_buffer_flat(buf, osize, ooffset); - break; - } - } - break; - default: - throw std::runtime_error("Invalid itype in print_ibuffer_flat"); - } - } - } - - virtual fft_status set_callbacks(void* load_cb_host, - void* load_cb_data, - void* store_cb_host, - void* store_cb_data) - { - return fft_status_success; - } - - virtual fft_status execute(void** in, void** out) - { - return fft_status_success; - }; - - size_t fft_params_vram_footprint() - { - return fft_params::vram_footprint(); - } - - virtual size_t vram_footprint() - { - const auto ibuf_size = ibuffer_sizes(); - size_t val = std::accumulate(ibuf_size.begin(), ibuf_size.end(), (size_t)1); - if(placement == fft_placement_notinplace) - { - const auto obuf_size = obuffer_sizes(); - val += std::accumulate(obuf_size.begin(), obuf_size.end(), (size_t)1); - } - return val; - } - - // Specific exception type for work buffer allocation failure. - // Tests that hit this can't fit on the GPU and should be skipped. 
- struct work_buffer_alloc_failure : public std::runtime_error - { - work_buffer_alloc_failure(const std::string& s) - : std::runtime_error(s) - { - } - }; - - virtual fft_status create_plan() - { - return fft_status_success; - } -}; - -// This is used with the program_options class so that the user can type an integer on the -// command line and we store into an enum varaible -template -std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream, - fft_array_type& atype) -{ - unsigned tmp; - stream >> tmp; - atype = fft_array_type(tmp); - return stream; -} - -// similarly for transform type -template -std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream, - fft_transform_type& ttype) -{ - unsigned tmp; - stream >> tmp; - ttype = fft_transform_type(tmp); - return stream; -} - -// count the number of total iterations for 1-, 2-, and 3-D dimensions -template -size_t count_iters(const T1& i) -{ - return i; -} - -template -size_t count_iters(const std::tuple& i) -{ - return std::get<0>(i) * std::get<1>(i); -} - -template -size_t count_iters(const std::tuple& i) -{ - return std::get<0>(i) * std::get<1>(i) * std::get<2>(i); -} - -// Work out how many partitions to break our iteration problem into -template -static size_t compute_partition_count(T1 length) -{ -#ifdef BUILD_CLIENTS_TESTS_OPENMP - // we seem to get contention from too many threads, which slows - // things down. particularly noticeable with mix_3D tests - static const size_t MAX_PARTITIONS = 8; - size_t iters = count_iters(length); - size_t hw_threads = std::min(MAX_PARTITIONS, static_cast(omp_get_num_procs())); - if(!hw_threads) - return 1; - - // don't bother threading problem sizes that are too small. 
pick - // an arbitrary number of iterations and ensure that each thread - // has at least that many iterations to process - static const size_t MIN_ITERS_PER_THREAD = 2048; - - // either use the whole CPU, or use ceil(iters/iters_per_thread) - return std::min(hw_threads, (iters + MIN_ITERS_PER_THREAD + 1) / MIN_ITERS_PER_THREAD); -#else - return 1; -#endif -} - -// Break a scalar length into some number of pieces, returning -// [(start0, end0), (start1, end1), ...] -template -std::vector> partition_base(const T1& length, size_t num_parts) -{ - static_assert(std::is_integral::value, "Integral required."); - - // make sure we don't exceed the length - num_parts = std::min(length, num_parts); - - std::vector> ret(num_parts); - auto partition_size = length / num_parts; - T1 cur_partition = 0; - for(size_t i = 0; i < num_parts; ++i, cur_partition += partition_size) - { - ret[i].first = cur_partition; - ret[i].second = cur_partition + partition_size; - } - // last partition might not divide evenly, fix it up - ret.back().second = length; - return ret; -} - -// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths -template -std::vector> partition_rowmajor(const T1& length) -{ - return partition_base(length, compute_partition_count(length)); -} - -// Partition on the leftmost part of the tuple, for row-major indexing -template -std::vector, std::tuple>> - partition_rowmajor(const std::tuple& length) -{ - auto partitions = partition_base(std::get<0>(length), compute_partition_count(length)); - std::vector, std::tuple>> ret(partitions.size()); - for(size_t i = 0; i < partitions.size(); ++i) - { - std::get<0>(ret[i].first) = partitions[i].first; - std::get<1>(ret[i].first) = 0; - std::get<0>(ret[i].second) = partitions[i].second; - std::get<1>(ret[i].second) = std::get<1>(length); - } - return ret; -} -template -std::vector, std::tuple>> - partition_rowmajor(const std::tuple& length) -{ - auto partitions = partition_base(std::get<0>(length), 
compute_partition_count(length)); - std::vector, std::tuple>> ret(partitions.size()); - for(size_t i = 0; i < partitions.size(); ++i) - { - std::get<0>(ret[i].first) = partitions[i].first; - std::get<1>(ret[i].first) = 0; - std::get<2>(ret[i].first) = 0; - std::get<0>(ret[i].second) = partitions[i].second; - std::get<1>(ret[i].second) = std::get<1>(length); - std::get<2>(ret[i].second) = std::get<2>(length); - } - return ret; -} - -// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths -template -std::vector> partition_colmajor(const T1& length) -{ - return partition_base(length, compute_partition_count(length)); -} - -// Partition on the rightmost part of the tuple, for col-major indexing -template -std::vector, std::tuple>> - partition_colmajor(const std::tuple& length) -{ - auto partitions = partition_base(std::get<1>(length), compute_partition_count(length)); - std::vector, std::tuple>> ret(partitions.size()); - for(size_t i = 0; i < partitions.size(); ++i) - { - std::get<1>(ret[i].first) = partitions[i].first; - std::get<0>(ret[i].first) = 0; - std::get<1>(ret[i].second) = partitions[i].second; - std::get<0>(ret[i].second) = std::get<0>(length); - } - return ret; -} -template -std::vector, std::tuple>> - partition_colmajor(const std::tuple& length) -{ - auto partitions = partition_base(std::get<2>(length), compute_partition_count(length)); - std::vector, std::tuple>> ret(partitions.size()); - for(size_t i = 0; i < partitions.size(); ++i) - { - std::get<2>(ret[i].first) = partitions[i].first; - std::get<1>(ret[i].first) = 0; - std::get<0>(ret[i].first) = 0; - std::get<2>(ret[i].second) = partitions[i].second; - std::get<1>(ret[i].second) = std::get<1>(length); - std::get<0>(ret[i].second) = std::get<0>(length); - } - return ret; -} - -// Specialized computation of index given 1-, 2-, 3- dimension length + stride -template -size_t compute_index(T1 length, T2 stride, size_t base) -{ - return (length * stride) + base; -} - -template -size_t - 
compute_index(const std::tuple& length, const std::tuple& stride, size_t base) -{ - static_assert(std::is_integral::value, "Integral required."); - static_assert(std::is_integral::value, "Integral required."); - return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride)) - + base; -} - -template -size_t compute_index(const std::tuple& length, - const std::tuple& stride, - size_t base) -{ - static_assert(std::is_integral::value, "Integral required."); - static_assert(std::is_integral::value, "Integral required."); - return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride)) - + (std::get<2>(length) * std::get<2>(stride)) + base; -} - -// Copy data of dimensions length with strides istride and length idist between batches to -// a buffer with strides ostride and length odist between batches. The input and output -// types are identical. -template -inline void copy_buffers_1to1(const Tval* input, - Tval* output, - const Tint1& whole_length, - const size_t nbatch, - const Tint2& istride, - const size_t idist, - const Tint3& ostride, - const size_t odist, - const std::vector& ioffset, - const std::vector& ooffset) -{ - const bool idx_equals_odx = istride == ostride && idist == odist; - size_t idx_base = 0; - size_t odx_base = 0; - auto partitions = partition_rowmajor(whole_length); - for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) - { -#pragma omp parallel for num_threads(partitions.size()) - for(size_t part = 0; part < partitions.size(); ++part) - { - auto index = partitions[part].first; - const auto length = partitions[part].second; - do - { - const auto idx = compute_index(index, istride, idx_base); - const auto odx = idx_equals_odx ? 
idx : compute_index(index, ostride, odx_base); - output[odx + ooffset[0]] = input[idx + ioffset[0]]; - } while(increment_rowmajor(index, length)); - } - } -} - -// Copy data of dimensions length with strides istride and length idist between batches to -// a buffer with strides ostride and length odist between batches. The input type is -// planar and the output type is complex interleaved. -template -inline void copy_buffers_2to1(const Tval* input0, - const Tval* input1, - std::complex* output, - const Tint1& whole_length, - const size_t nbatch, - const Tint2& istride, - const size_t idist, - const Tint3& ostride, - const size_t odist, - const std::vector& ioffset, - const std::vector& ooffset) -{ - const bool idx_equals_odx = istride == ostride && idist == odist; - size_t idx_base = 0; - size_t odx_base = 0; - auto partitions = partition_rowmajor(whole_length); - for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) - { -#pragma omp parallel for num_threads(partitions.size()) - for(size_t part = 0; part < partitions.size(); ++part) - { - auto index = partitions[part].first; - const auto length = partitions[part].second; - do - { - const auto idx = compute_index(index, istride, idx_base); - const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); - output[odx + ooffset[0]] - = std::complex(input0[idx + ioffset[0]], input1[idx + ioffset[1]]); - } while(increment_rowmajor(index, length)); - } - } -} - -// Copy data of dimensions length with strides istride and length idist between batches to -// a buffer with strides ostride and length odist between batches. The input type is -// complex interleaved and the output type is planar. 
-template -inline void copy_buffers_1to2(const std::complex* input, - Tval* output0, - Tval* output1, - const Tint1& whole_length, - const size_t nbatch, - const Tint2& istride, - const size_t idist, - const Tint3& ostride, - const size_t odist, - const std::vector& ioffset, - const std::vector& ooffset) -{ - const bool idx_equals_odx = istride == ostride && idist == odist; - size_t idx_base = 0; - size_t odx_base = 0; - auto partitions = partition_rowmajor(whole_length); - for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) - { -#pragma omp parallel for num_threads(partitions.size()) - for(size_t part = 0; part < partitions.size(); ++part) - { - auto index = partitions[part].first; - const auto length = partitions[part].second; - do - { - const auto idx = compute_index(index, istride, idx_base); - const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); - output0[odx + ooffset[0]] = input[idx + ioffset[0]].real(); - output1[odx + ooffset[1]] = input[idx + ioffset[0]].imag(); - } while(increment_rowmajor(index, length)); - } - } -} - -// Copy data of dimensions length with strides istride and length idist between batches to -// a buffer with strides ostride and length odist between batches. The input type given -// by itype, and the output type is given by otype. 
-template -inline void copy_buffers(const std::vector>& input, - std::vector>& output, - const Tint1& length, - const size_t nbatch, - const fft_precision precision, - const fft_array_type itype, - const Tint2& istride, - const size_t idist, - const fft_array_type otype, - const Tint3& ostride, - const size_t odist, - const std::vector& ioffset, - const std::vector& ooffset) -{ - if(itype == otype) - { - switch(itype) - { - case fft_array_type_complex_interleaved: - case fft_array_type_hermitian_interleaved: - switch(precision) - { - case fft_precision_single: - copy_buffers_1to1(reinterpret_cast*>(input[0].data()), - reinterpret_cast*>(output[0].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - ioffset, - ooffset); - break; - case fft_precision_double: - copy_buffers_1to1(reinterpret_cast*>(input[0].data()), - reinterpret_cast*>(output[0].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - ioffset, - ooffset); - break; - } - break; - case fft_array_type_real: - case fft_array_type_complex_planar: - case fft_array_type_hermitian_planar: - for(unsigned int idx = 0; idx < input.size(); ++idx) - { - switch(precision) - { - case fft_precision_single: - copy_buffers_1to1(reinterpret_cast(input[idx].data()), - reinterpret_cast(output[idx].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - ioffset, - ooffset); - break; - case fft_precision_double: - copy_buffers_1to1(reinterpret_cast(input[idx].data()), - reinterpret_cast(output[idx].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - ioffset, - ooffset); - break; - } - } - break; - default: - throw std::runtime_error("Invalid data type"); - } - } - else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar) - || (itype == fft_array_type_hermitian_interleaved - && otype == fft_array_type_hermitian_planar)) - { - // copy 1to2 - switch(precision) - { - case fft_precision_single: - 
copy_buffers_1to2(reinterpret_cast*>(input[0].data()), - reinterpret_cast(output[0].data()), - reinterpret_cast(output[1].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - ioffset, - ooffset); - break; - case fft_precision_double: - copy_buffers_1to2(reinterpret_cast*>(input[0].data()), - reinterpret_cast(output[0].data()), - reinterpret_cast(output[1].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - ioffset, - ooffset); - break; - } - } - else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved) - || (itype == fft_array_type_hermitian_planar - && otype == fft_array_type_hermitian_interleaved)) - { - // copy 2 to 1 - switch(precision) - { - case fft_precision_single: - copy_buffers_2to1(reinterpret_cast(input[0].data()), - reinterpret_cast(input[1].data()), - reinterpret_cast*>(output[0].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - ioffset, - ooffset); - break; - case fft_precision_double: - copy_buffers_2to1(reinterpret_cast(input[0].data()), - reinterpret_cast(input[1].data()), - reinterpret_cast*>(output[0].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - ioffset, - ooffset); - break; - } - } - else - { - throw std::runtime_error("Invalid input and output types."); - } -} - -// unroll arbitrary-dimension copy_buffers into specializations for 1-, 2-, 3-dimensions -template -inline void copy_buffers(const std::vector>& input, - std::vector>& output, - const std::vector& length, - const size_t nbatch, - const fft_precision precision, - const fft_array_type itype, - const std::vector& istride, - const size_t idist, - const fft_array_type otype, - const std::vector& ostride, - const size_t odist, - const std::vector& ioffset, - const std::vector& ooffset) -{ - switch(length.size()) - { - case 1: - return copy_buffers(input, - output, - length[0], - nbatch, - precision, - itype, - istride[0], - idist, - otype, - ostride[0], - odist, - 
ioffset, - ooffset); - case 2: - return copy_buffers(input, - output, - std::make_tuple(length[0], length[1]), - nbatch, - precision, - itype, - std::make_tuple(istride[0], istride[1]), - idist, - otype, - std::make_tuple(ostride[0], ostride[1]), - odist, - ioffset, - ooffset); - case 3: - return copy_buffers(input, - output, - std::make_tuple(length[0], length[1], length[2]), - nbatch, - precision, - itype, - std::make_tuple(istride[0], istride[1], istride[2]), - idist, - otype, - std::make_tuple(ostride[0], ostride[1], ostride[2]), - odist, - ioffset, - ooffset); - default: - abort(); - } -} - -// Compute the L-infinity and L-2 distance between two buffers with strides istride and -// length idist between batches to a buffer with strides ostride and length odist between -// batches. Both buffers are of complex type. - -struct VectorNorms -{ - double l_2 = 0.0, l_inf = 0.0; -}; - -template -inline VectorNorms distance_1to1_complex(const Tcomplex* input, - const Tcomplex* output, - const Tint1& whole_length, - const size_t nbatch, - const Tint2& istride, - const size_t idist, - const Tint3& ostride, - const size_t odist, - std::vector>& linf_failures, - const double linf_cutoff, - const std::vector& ioffset, - const std::vector& ooffset) -{ - double linf = 0.0; - double l2 = 0.0; - - std::mutex linf_failure_lock; - - const bool idx_equals_odx = istride == ostride && idist == odist; - size_t idx_base = 0; - size_t odx_base = 0; - auto partitions = partition_colmajor(whole_length); - for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) - { -#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) - for(size_t part = 0; part < partitions.size(); ++part) - { - double cur_linf = 0.0; - double cur_l2 = 0.0; - auto index = partitions[part].first; - const auto length = partitions[part].second; - - do - { - const auto idx = compute_index(index, istride, idx_base); - const auto odx = idx_equals_odx ? 
idx : compute_index(index, ostride, odx_base); - const double rdiff - = std::abs(output[odx + ooffset[0]].real() - input[idx + ioffset[0]].real()); - cur_linf = std::max(rdiff, cur_linf); - if(cur_linf > linf_cutoff) - { - std::pair fval(b, idx); - linf_failure_lock.lock(); - linf_failures.push_back(fval); - linf_failure_lock.unlock(); - } - cur_l2 += rdiff * rdiff; - - const double idiff - = std::abs(output[odx + ooffset[0]].imag() - input[idx + ioffset[0]].imag()); - cur_linf = std::max(idiff, cur_linf); - if(cur_linf > linf_cutoff) - { - std::pair fval(b, idx); - linf_failure_lock.lock(); - linf_failures.push_back(fval); - linf_failure_lock.unlock(); - } - cur_l2 += idiff * idiff; - - } while(increment_rowmajor(index, length)); - linf = std::max(linf, cur_linf); - l2 += cur_l2; - } - } - return {.l_2 = sqrt(l2), .l_inf = linf}; -} - -// Compute the L-infinity and L-2 distance between two buffers with strides istride and -// length idist between batches to a buffer with strides ostride and length odist between -// batches. Both buffers are of real type. 
-template -inline VectorNorms distance_1to1_real(const Tfloat* input, - const Tfloat* output, - const Tint1& whole_length, - const size_t nbatch, - const Tint2& istride, - const size_t idist, - const Tint3& ostride, - const size_t odist, - std::vector>& linf_failures, - const double linf_cutoff, - const std::vector& ioffset, - const std::vector& ooffset) -{ - double linf = 0.0; - double l2 = 0.0; - - std::mutex linf_failure_lock; - - const bool idx_equals_odx = istride == ostride && idist == odist; - size_t idx_base = 0; - size_t odx_base = 0; - auto partitions = partition_rowmajor(whole_length); - for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) - { -#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) - for(size_t part = 0; part < partitions.size(); ++part) - { - double cur_linf = 0.0; - double cur_l2 = 0.0; - auto index = partitions[part].first; - const auto length = partitions[part].second; - do - { - const auto idx = compute_index(index, istride, idx_base); - const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); - const double diff = std::abs(output[odx + ooffset[0]] - input[idx + ioffset[0]]); - cur_linf = std::max(diff, cur_linf); - if(cur_linf > linf_cutoff) - { - std::pair fval(b, idx); - linf_failure_lock.lock(); - linf_failures.push_back(fval); - linf_failure_lock.unlock(); - } - cur_l2 += diff * diff; - - } while(increment_rowmajor(index, length)); - linf = std::max(linf, cur_linf); - l2 += cur_l2; - } - } - return {.l_2 = sqrt(l2), .l_inf = linf}; -} - -// Compute the L-infinity and L-2 distance between two buffers with strides istride and -// length idist between batches to a buffer with strides ostride and length odist between -// batches. input is complex-interleaved, output is complex-planar. 
-template -inline VectorNorms distance_1to2(const std::complex* input, - const Tval* output0, - const Tval* output1, - const Tint1& whole_length, - const size_t nbatch, - const T2& istride, - const size_t idist, - const T3& ostride, - const size_t odist, - std::vector>& linf_failures, - const double linf_cutoff, - const std::vector& ioffset, - const std::vector& ooffset) -{ - double linf = 0.0; - double l2 = 0.0; - - std::mutex linf_failure_lock; - - const bool idx_equals_odx = istride == ostride && idist == odist; - size_t idx_base = 0; - size_t odx_base = 0; - auto partitions = partition_rowmajor(whole_length); - for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) - { -#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) - for(size_t part = 0; part < partitions.size(); ++part) - { - double cur_linf = 0.0; - double cur_l2 = 0.0; - auto index = partitions[part].first; - const auto length = partitions[part].second; - do - { - const auto idx = compute_index(index, istride, idx_base); - const auto odx = idx_equals_odx ? 
idx : compute_index(index, ostride, odx_base); - const double rdiff - = std::abs(output0[odx + ooffset[0]] - input[idx + ioffset[0]].real()); - cur_linf = std::max(rdiff, cur_linf); - if(cur_linf > linf_cutoff) - { - std::pair fval(b, idx); - linf_failure_lock.lock(); - linf_failures.push_back(fval); - linf_failure_lock.unlock(); - } - cur_l2 += rdiff * rdiff; - - const double idiff - = std::abs(output1[odx + ooffset[1]] - input[idx + ioffset[0]].imag()); - cur_linf = std::max(idiff, cur_linf); - if(cur_linf > linf_cutoff) - { - std::pair fval(b, idx); - linf_failure_lock.lock(); - linf_failures.push_back(fval); - linf_failure_lock.unlock(); - } - cur_l2 += idiff * idiff; - - } while(increment_rowmajor(index, length)); - linf = std::max(linf, cur_linf); - l2 += cur_l2; - } - } - return {.l_2 = sqrt(l2), .l_inf = linf}; -} - -// Compute the L-inifnity and L-2 distance between two buffers of dimension length and -// with types given by itype, otype, and precision. -template -inline VectorNorms distance(const std::vector>& input, - const std::vector>& output, - const Tint1& length, - const size_t nbatch, - const fft_precision precision, - const fft_array_type itype, - const Tint2& istride, - const size_t idist, - const fft_array_type otype, - const Tint3& ostride, - const size_t odist, - std::vector>& linf_failures, - const double linf_cutoff, - const std::vector& ioffset, - const std::vector& ooffset) -{ - VectorNorms dist; - - if(itype == otype) - { - switch(itype) - { - case fft_array_type_complex_interleaved: - case fft_array_type_hermitian_interleaved: - switch(precision) - { - case fft_precision_single: - dist = distance_1to1_complex( - reinterpret_cast*>(input[0].data()), - reinterpret_cast*>(output[0].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - linf_failures, - linf_cutoff, - ioffset, - ooffset); - break; - case fft_precision_double: - dist = distance_1to1_complex( - reinterpret_cast*>(input[0].data()), - 
reinterpret_cast*>(output[0].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - linf_failures, - linf_cutoff, - ioffset, - ooffset); - break; - } - dist.l_2 *= dist.l_2; - break; - case fft_array_type_real: - case fft_array_type_complex_planar: - case fft_array_type_hermitian_planar: - for(unsigned int idx = 0; idx < input.size(); ++idx) - { - VectorNorms d; - switch(precision) - { - case fft_precision_single: - d = distance_1to1_real(reinterpret_cast(input[idx].data()), - reinterpret_cast(output[idx].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - linf_failures, - linf_cutoff, - ioffset, - ooffset); - break; - case fft_precision_double: - d = distance_1to1_real(reinterpret_cast(input[idx].data()), - reinterpret_cast(output[idx].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - linf_failures, - linf_cutoff, - ioffset, - ooffset); - break; - } - dist.l_inf = std::max(d.l_inf, dist.l_inf); - dist.l_2 += d.l_2 * d.l_2; - } - break; - default: - throw std::runtime_error("Invalid input and output types."); - } - } - else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar) - || (itype == fft_array_type_hermitian_interleaved - && otype == fft_array_type_hermitian_planar)) - { - switch(precision) - { - case fft_precision_single: - dist = distance_1to2(reinterpret_cast*>(input[0].data()), - reinterpret_cast(output[0].data()), - reinterpret_cast(output[1].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - linf_failures, - linf_cutoff, - ioffset, - ooffset); - break; - case fft_precision_double: - dist = distance_1to2(reinterpret_cast*>(input[0].data()), - reinterpret_cast(output[0].data()), - reinterpret_cast(output[1].data()), - length, - nbatch, - istride, - idist, - ostride, - odist, - linf_failures, - linf_cutoff, - ioffset, - ooffset); - break; - } - dist.l_2 *= dist.l_2; - } - else if((itype == fft_array_type_complex_planar && otype == 
fft_array_type_complex_interleaved) - || (itype == fft_array_type_hermitian_planar - && otype == fft_array_type_hermitian_interleaved)) - { - switch(precision) - { - case fft_precision_single: - dist = distance_1to2(reinterpret_cast*>(output[0].data()), - reinterpret_cast(input[0].data()), - reinterpret_cast(input[1].data()), - length, - nbatch, - ostride, - odist, - istride, - idist, - linf_failures, - linf_cutoff, - ioffset, - ooffset); - break; - case fft_precision_double: - dist = distance_1to2(reinterpret_cast*>(output[0].data()), - reinterpret_cast(input[0].data()), - reinterpret_cast(input[1].data()), - length, - nbatch, - ostride, - odist, - istride, - idist, - linf_failures, - linf_cutoff, - ioffset, - ooffset); - break; - } - dist.l_2 *= dist.l_2; - } - else - { - throw std::runtime_error("Invalid input and output types."); - } - dist.l_2 = sqrt(dist.l_2); - return dist; -} - -// Unroll arbitrary-dimension distance into specializations for 1-, 2-, 3-dimensions -template -inline VectorNorms distance(const std::vector>& input, - const std::vector>& output, - const std::vector& length, - const size_t nbatch, - const fft_precision precision, - const fft_array_type itype, - const std::vector& istride, - const size_t idist, - const fft_array_type otype, - const std::vector& ostride, - const size_t odist, - std::vector>& linf_failures, - const double linf_cutoff, - const std::vector& ioffset, - const std::vector& ooffset) -{ - switch(length.size()) - { - case 1: - return distance(input, - output, - length[0], - nbatch, - precision, - itype, - istride[0], - idist, - otype, - ostride[0], - odist, - linf_failures, - linf_cutoff, - ioffset, - ooffset); - case 2: - return distance(input, - output, - std::make_tuple(length[0], length[1]), - nbatch, - precision, - itype, - std::make_tuple(istride[0], istride[1]), - idist, - otype, - std::make_tuple(ostride[0], ostride[1]), - odist, - linf_failures, - linf_cutoff, - ioffset, - ooffset); - case 3: - return 
distance(input, - output, - std::make_tuple(length[0], length[1], length[2]), - nbatch, - precision, - itype, - std::make_tuple(istride[0], istride[1], istride[2]), - idist, - otype, - std::make_tuple(ostride[0], ostride[1], ostride[2]), - odist, - linf_failures, - linf_cutoff, - ioffset, - ooffset); - default: - abort(); - } -} - -// Compute the L-infinity and L-2 norm of a buffer with strides istride and -// length idist. Data is std::complex. -template -inline VectorNorms norm_complex(const Tcomplex* input, - const T1& whole_length, - const size_t nbatch, - const T2& istride, - const size_t idist, - const std::vector& offset) -{ - double linf = 0.0; - double l2 = 0.0; - - size_t idx_base = 0; - auto partitions = partition_rowmajor(whole_length); - for(size_t b = 0; b < nbatch; b++, idx_base += idist) - { -#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) - for(size_t part = 0; part < partitions.size(); ++part) - { - double cur_linf = 0.0; - double cur_l2 = 0.0; - auto index = partitions[part].first; - const auto length = partitions[part].second; - do - { - const auto idx = compute_index(index, istride, idx_base); - - const double rval = std::abs(input[idx + offset[0]].real()); - cur_linf = std::max(rval, cur_linf); - cur_l2 += rval * rval; - - const double ival = std::abs(input[idx + offset[0]].imag()); - cur_linf = std::max(ival, cur_linf); - cur_l2 += ival * ival; - - } while(increment_rowmajor(index, length)); - linf = std::max(linf, cur_linf); - l2 += cur_l2; - } - } - return {.l_2 = sqrt(l2), .l_inf = linf}; -} - -// Compute the L-infinity and L-2 norm of abuffer with strides istride and -// length idist. Data is real-valued. 
-template -inline VectorNorms norm_real(const Tfloat* input, - const T1& whole_length, - const size_t nbatch, - const T2& istride, - const size_t idist, - const std::vector& offset) -{ - double linf = 0.0; - double l2 = 0.0; - - size_t idx_base = 0; - auto partitions = partition_rowmajor(whole_length); - for(size_t b = 0; b < nbatch; b++, idx_base += idist) - { -#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) - for(size_t part = 0; part < partitions.size(); ++part) - { - double cur_linf = 0.0; - double cur_l2 = 0.0; - auto index = partitions[part].first; - const auto length = partitions[part].second; - do - { - const auto idx = compute_index(index, istride, idx_base); - const double val = std::abs(input[idx + offset[0]]); - cur_linf = std::max(val, cur_linf); - cur_l2 += val * val; - - } while(increment_rowmajor(index, length)); - linf = std::max(linf, cur_linf); - l2 += cur_l2; - } - } - return {.l_2 = sqrt(l2), .l_inf = linf}; -} - -// Compute the L-infinity and L-2 norm of abuffer with strides istride and -// length idist. Data format is given by precision and itype. 
-template -inline VectorNorms norm(const std::vector>& input, - const T1& length, - const size_t nbatch, - const fft_precision precision, - const fft_array_type itype, - const T2& istride, - const size_t idist, - const std::vector& offset) -{ - VectorNorms norm; - - switch(itype) - { - case fft_array_type_complex_interleaved: - case fft_array_type_hermitian_interleaved: - switch(precision) - { - case fft_precision_single: - norm = norm_complex(reinterpret_cast*>(input[0].data()), - length, - nbatch, - istride, - idist, - offset); - break; - case fft_precision_double: - norm = norm_complex(reinterpret_cast*>(input[0].data()), - length, - nbatch, - istride, - idist, - offset); - break; - } - norm.l_2 *= norm.l_2; - break; - case fft_array_type_real: - case fft_array_type_complex_planar: - case fft_array_type_hermitian_planar: - for(unsigned int idx = 0; idx < input.size(); ++idx) - { - VectorNorms n; - switch(precision) - { - case fft_precision_single: - n = norm_real(reinterpret_cast(input[idx].data()), - length, - nbatch, - istride, - idist, - offset); - break; - case fft_precision_double: - n = norm_real(reinterpret_cast(input[idx].data()), - length, - nbatch, - istride, - idist, - offset); - break; - } - norm.l_inf = std::max(n.l_inf, norm.l_inf); - norm.l_2 += n.l_2 * n.l_2; - } - break; - default: - throw std::runtime_error("Invalid data type"); - } - - norm.l_2 = sqrt(norm.l_2); - return norm; -} - -// Unroll arbitrary-dimension norm into specializations for 1-, 2-, 3-dimensions -template -inline VectorNorms norm(const std::vector>& input, - const std::vector& length, - const size_t nbatch, - const fft_precision precision, - const fft_array_type type, - const std::vector& stride, - const size_t dist, - const std::vector& offset) -{ - switch(length.size()) - { - case 1: - return norm(input, length[0], nbatch, precision, type, stride[0], dist, offset); - case 2: - return norm(input, - std::make_tuple(length[0], length[1]), - nbatch, - precision, - type, - 
std::make_tuple(stride[0], stride[1]), - dist, - offset); - case 3: - return norm(input, - std::make_tuple(length[0], length[1], length[2]), - nbatch, - precision, - type, - std::make_tuple(stride[0], stride[1], stride[2]), - dist, - offset); - default: - abort(); - } -} - -// Given an array type and transform length, strides, etc, load random floats in [0,1] -// into the input array of floats/doubles or complex floats/doubles gpu buffers. -template -inline void set_input(std::vector& input, - const fft_array_type itype, - const std::vector& length, - const std::vector& ilength, - const std::vector& stride, - const Tint1& whole_length, - const Tint1& istride, - const size_t idist, - const size_t nbatch) -{ - auto isize = count_iters(whole_length) * nbatch; - - switch(itype) - { - case fft_array_type_complex_interleaved: - case fft_array_type_hermitian_interleaved: - { - auto ibuffer = (std::complex*)input[0].data(); - - generate_interleaved_data(whole_length, idist, isize, istride, ibuffer); - - if(itype == fft_array_type_hermitian_interleaved) - impose_hermitian_symmetry_interleaved(length, ilength, stride, idist, nbatch, ibuffer); - - break; - } - case fft_array_type_complex_planar: - case fft_array_type_hermitian_planar: - { - auto ibuffer_real = (Tfloat*)input[0].data(); - auto ibuffer_imag = (Tfloat*)input[1].data(); - - generate_planar_data(whole_length, idist, isize, istride, ibuffer_real, ibuffer_imag); - - if(itype == fft_array_type_hermitian_planar) - impose_hermitian_symmetry_planar( - length, ilength, stride, idist, nbatch, ibuffer_real, ibuffer_imag); - - break; - } - case fft_array_type_real: - { - auto ibuffer = (Tfloat*)input[0].data(); - - generate_real_data(whole_length, idist, isize, istride, ibuffer); - - break; - } - default: - throw std::runtime_error("Input layout format not yet supported"); - } -} - -// unroll set_input for dimension 1, 2, 3 -template -inline void set_input(std::vector& input, - const fft_array_type itype, - const 
std::vector& length, - const std::vector& ilength, - const std::vector& istride, - const size_t idist, - const size_t nbatch) -{ - switch(length.size()) - { - case 1: - set_input( - input, itype, length, ilength, istride, ilength[0], istride[0], idist, nbatch); - break; - case 2: - set_input(input, - itype, - length, - ilength, - istride, - std::make_tuple(ilength[0], ilength[1]), - std::make_tuple(istride[0], istride[1]), - idist, - nbatch); - break; - case 3: - set_input(input, - itype, - length, - ilength, - istride, - std::make_tuple(ilength[0], ilength[1], ilength[2]), - std::make_tuple(istride[0], istride[1], istride[2]), - idist, - nbatch); - break; - default: - abort(); - } -} - -// Given a data type and precision, the distance between batches, and -// the batch size, allocate the required host buffer(s). -template > -inline std::vector> allocate_host_buffer( - const fft_precision precision, const fft_array_type type, const std::vector& size) -{ - std::vector> buffers(size.size()); - for(unsigned int i = 0; i < size.size(); ++i) - { - buffers[i].resize(size[i] * var_size(precision, type)); - } - return buffers; -} - -// Given a data type and dimensions, fill the buffer, imposing Hermitian symmetry if -// necessary. -inline void compute_input(const fft_params& params, std::vector& input) -{ - switch(params.precision) - { - case fft_precision_double: - set_input(input, - params.itype, - params.length, - params.ilength(), - params.istride, - params.idist, - params.nbatch); - break; - case fft_precision_single: - set_input(input, - params.itype, - params.length, - params.ilength(), - params.istride, - params.idist, - params.nbatch); - break; - } -} - -// Check if the required buffers fit in the device vram. 
-inline bool vram_fits_problem(const size_t prob_size, int deviceId = 0) -{ - // We keep a small margin of error for fitting the problem into vram: - const size_t extra = 1 << 20; - - // Check free and total available memory: - size_t free = 0; - size_t total = 0; - auto retval = hipMemGetInfo(&free, &total); - - if(retval != hipSuccess) - throw std::runtime_error("Failure in hipMemGetInfo"); - - if(total < prob_size + extra) - return false; - - if(free < prob_size + extra) - return false; - - return true; -} - -// Computes the twiddle table VRAM footprint for r2c/c2r transforms. -// This function will return 0 for the other transform types, since -// the VRAM footprint in rocFFT is negligible for the other cases. -inline size_t twiddle_table_vram_footprint(const fft_params& params) -{ - size_t vram_footprint = 0; - - // Add vram footprint from real/complex even twiddle buffer size. - if(params.transform_type == fft_transform_type_real_forward - || params.transform_type == fft_transform_type_real_inverse) - { - const auto realdim = params.length.back(); - if(realdim % 2 == 0) - { - const auto complex_size = params.precision == fft_precision_single ? 
8 : 16; - // even length twiddle size is 1/4 of the real size, but - // in complex elements - vram_footprint += realdim * complex_size / 4; - } - } - - return vram_footprint; -} - -#endif diff -Nru rocfft-5.5.0/clients/rider/CMakeLists.txt rocfft-5.7.1/clients/rider/CMakeLists.txt --- rocfft-5.5.0/clients/rider/CMakeLists.txt 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/rider/CMakeLists.txt 2023-08-09 16:19:51.000000000 +0000 @@ -60,8 +60,8 @@ find_package( ROCM 0.7.3 REQUIRED ) endif() -if( NOT rocrand_FOUND ) - find_package( rocrand REQUIRED ) +if( NOT hiprand_FOUND ) + find_package( hiprand REQUIRED ) endif() include( ROCMInstallTargets ) @@ -71,7 +71,7 @@ set( rider_list rocfft-rider dyna-rocfft-rider ) foreach( rider ${rider_list}) - + if(${rider} STREQUAL "rocfft-rider") add_executable( ${rider} ../../shared/array_validator.cpp rider.cpp rider.h ) else() @@ -82,7 +82,7 @@ # NB: hip-clang includes omp.h, so we need to specify the location # of ROCM_CLANG_ROOT at cmake config time if we are using clang++. 
- + target_include_directories( ${rider} PRIVATE $ @@ -96,16 +96,16 @@ PRIVATE hip::device roc::rocfft - roc::rocrand + hip::hiprand Boost::program_options ) else() - target_link_libraries( ${rider} + target_link_libraries( ${rider} PRIVATE ${CMAKE_DL_LIBS} hip::device - roc::rocrand - ${Boost_LIBRARIES} + hip::hiprand + ${Boost_LIBRARIES} ) # We need to include both rocfft.h and rocfft-export.h @@ -136,10 +136,10 @@ endif() string( CONCAT RIDER_OUT_DIR "${PROJECT_BINARY_DIR}" ${RIDER_OUT_DIR} ) - set_target_properties(${rider} - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY + set_target_properties(${rider} + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${RIDER_OUT_DIR} ) - + rocm_install(TARGETS ${rider} COMPONENT benchmarks) endforeach() diff -Nru rocfft-5.5.0/clients/rider/dyna-rider.cpp rocfft-5.7.1/clients/rider/dyna-rider.cpp --- rocfft-5.5.0/clients/rider/dyna-rider.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/rider/dyna-rider.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -22,7 +22,7 @@ // This allows one to randomize the execution order for better a better experimental setup // which produces fewer type 1 errors where one incorrectly rejects the null hypothesis. 
-#include +#include #include #include #include @@ -38,7 +38,7 @@ #endif #include "../../shared/gpubuf.h" -#include "../rocfft_params.h" +#include "../../shared/rocfft_params.h" #include "rider.h" #include "rocfft.h" @@ -340,9 +340,12 @@ // hip Device number for running tests: int deviceId{}; - // Number of performance trial samples + // Number of performance trial samples: int ntrial{}; + // Test sequence choice: + int test_sequence{}; + // Vector of test target libraries std::vector libs; @@ -362,8 +365,11 @@ ("device", po::value(&deviceId)->default_value(0), "Select a specific device id") ("verbose", po::value(&verbose)->default_value(0), "Control output verbosity") ("ntrial,N", po::value(&ntrial)->default_value(1), "Trial size for the problem") + ("sequence", po::value(&test_sequence)->default_value(0), + "Test sequence: random(0), alternating(1) sequential(2)") ("notInPlace,o", "Not in-place FFT transform (default: in-place)") - ("double", "Double precision transform (default: single)") + ("double", "Double precision transform (deprecated: use --precision double)") + ("precision", po::value(¶ms.precision), "Transform precision: single (default), double, half") ("transformType,t", po::value(¶ms.transform_type) ->default_value(fft_transform_type_complex_forward), "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " @@ -394,7 +400,7 @@ ("ioffset", po::value>(¶ms.ioffset)->multitoken(), "Input offsets.") ("ooffset", po::value>(¶ms.ooffset)->multitoken(), "Output offsets.") ("scalefactor", po::value(¶ms.scale_factor), "Scale factor to apply to output.") - ("token", po::value(&token));; + ("token", po::value(&token)); // clang-format on po::variables_map vm; @@ -446,7 +452,8 @@ params.placement = vm.count("notInPlace") ? fft_placement_notinplace : fft_placement_inplace; - params.precision = vm.count("double") ? 
fft_precision_double : fft_precision_single; + if(vm.count("double")) + params.precision = fft_precision_double; if(vm.count("notInPlace")) { @@ -524,9 +531,14 @@ std::cout << params.str() << std::endl; } + // Check free and total available memory: + size_t free = 0; + size_t total = 0; + HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed"); + const auto raw_vram_footprint = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params); - if(!vram_fits_problem(raw_vram_footprint)) + if(!vram_fits_problem(raw_vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << raw_vram_footprint << ") raw data too large for device.\n"; @@ -534,7 +546,7 @@ } const auto vram_footprint = params.vram_footprint(); - if(!vram_fits_problem(vram_footprint)) + if(!vram_fits_problem(vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << vram_footprint << ") raw data too large for device.\n"; @@ -618,7 +630,7 @@ } // Input data: - compute_input(params, ibuffer); + params.compute_input(ibuffer); if(verbose > 1) { @@ -671,21 +683,64 @@ // Execution times for loaded libraries: std::vector> time(libs.size()); + std::vector testcase(ntrial * libs.size()); + switch(test_sequence) + { + case 0: + { + // Random order: + for(int itrial = 0; itrial < ntrial; ++itrial) + { + for(size_t ilib = 0; ilib < libs.size(); ++ilib) + { + testcase[libs.size() * itrial + ilib] = ilib; + } + } + + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(testcase.begin(), testcase.end(), g); + break; + } + case 1: + // Alternating order: + for(int itrial = 0; itrial < ntrial; ++itrial) + { + for(size_t ilib = 0; ilib < libs.size(); ++ilib) + { + testcase[libs.size() * itrial + ilib] = ilib; + } + } + break; + case 2: + // Sequential order: + for(int itrial = 0; itrial < ntrial; ++itrial) + { + for(size_t ilib = 0; ilib < libs.size(); ++ilib) + { + testcase[ilib * ntrial + itrial] = ilib; + } + } + + break; + default: + throw std::runtime_error("Invalid test 
sequence choice."); + } + + std::cout << "test case:"; + for(const auto i : testcase) + std::cout << " " << i; + std::cout << "\n"; + // Run the FFTs from the different libraries in random order until they all have at // least ntrial times. std::vector ndone(libs.size()); std::fill(ndone.begin(), ndone.end(), 0); - while(!std::all_of(ndone.begin(), ndone.end(), [&ntrial](int i) { return (i >= ntrial); })) + for(size_t itest = 0; itest < testcase.size(); ++itest) { - const int idx = rand() % ndone.size(); - ndone[idx]++; - - // We can optionally require that all runs have exactly ntrial, but it may be more - // iid to just let things run: - // if(ndone[idx] > ntrial) - // continue; + const int idx = testcase[itest]; - compute_input(params, ibuffer); + params.compute_input(ibuffer); // Run the plan using its associated rocFFT library: time[idx].push_back( diff -Nru rocfft-5.5.0/clients/rider/rider.cpp rocfft-5.7.1/clients/rider/rider.cpp --- rocfft-5.5.0/clients/rider/rider.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/rider/rider.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,7 +24,7 @@ #include #include "../../shared/gpubuf.h" -#include "../rocfft_params.h" +#include "../../shared/rocfft_params.h" #include "rider.h" #include "rocfft.h" #include @@ -61,7 +61,8 @@ ("verbose", po::value(&verbose)->default_value(0), "Control output verbosity") ("ntrial,N", po::value(&ntrial)->default_value(1), "Trial size for the problem") ("notInPlace,o", "Not in-place FFT transform (default: in-place)") - ("double", "Double precision transform (default: single)") + ("double", "Double precision transform (deprecated: use --precision double)") + ("precision", po::value(&params.precision), "Transform precision: single (default), double, half") ("transformType,t", po::value(&params.transform_type) ->default_value(fft_transform_type_complex_forward), "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " @@ -141,7 +142,8 @@ params.placement = vm.count("notInPlace") ? fft_placement_notinplace : fft_placement_inplace; - params.precision = vm.count("double") ? 
fft_precision_double : fft_precision_single; + if(vm.count("double")) + params.precision = fft_precision_double; if(vm.count("notInPlace")) { @@ -221,9 +223,13 @@ std::cout << params.str(" ") << std::endl; } + // Check free and total available memory: + size_t free = 0; + size_t total = 0; + HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed"); const auto raw_vram_footprint = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params); - if(!vram_fits_problem(raw_vram_footprint)) + if(!vram_fits_problem(raw_vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << raw_vram_footprint << ") raw data too large for device.\n"; @@ -231,7 +237,7 @@ } const auto vram_footprint = params.vram_footprint(); - if(!vram_fits_problem(vram_footprint)) + if(!vram_fits_problem(vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << vram_footprint << ") raw data too large for device.\n"; @@ -253,7 +259,7 @@ } // Input data: - compute_input(params, ibuffer); + params.compute_input(ibuffer); if(verbose > 1) { @@ -304,7 +310,7 @@ HIP_V_THROW(hipEventCreate(&stop), "hipEventCreate failed"); for(unsigned int itrial = 0; itrial < gpu_time.size(); ++itrial) { - compute_input(params, ibuffer); + params.compute_input(ibuffer); HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed"); diff -Nru rocfft-5.5.0/clients/rocfft_params.h rocfft-5.7.1/clients/rocfft_params.h --- rocfft-5.5.0/clients/rocfft_params.h 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/rocfft_params.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,313 +0,0 @@ -// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -#ifndef ROCFFT_PARAMS_H -#define ROCFFT_PARAMS_H - -#include "../shared/gpubuf.h" -#include "fft_params.h" -#include "rocfft.h" - -inline fft_status fft_status_from_rocfftparams(const rocfft_status val) -{ - switch(val) - { - case rocfft_status_success: - return fft_status_success; - case rocfft_status_failure: - return fft_status_failure; - case rocfft_status_invalid_arg_value: - return fft_status_invalid_arg_value; - case rocfft_status_invalid_dimensions: - return fft_status_invalid_dimensions; - case rocfft_status_invalid_array_type: - return fft_status_invalid_array_type; - case rocfft_status_invalid_strides: - return fft_status_invalid_strides; - case rocfft_status_invalid_distance: - return fft_status_invalid_distance; - case rocfft_status_invalid_offset: - return fft_status_invalid_offset; - case rocfft_status_invalid_work_buffer: - return fft_status_invalid_work_buffer; - default: - throw std::runtime_error("Invalid status"); - } -} - -inline rocfft_precision rocfft_precision_from_fftparams(const fft_precision val) -{ - switch(val) - { - case fft_precision_single: - return rocfft_precision_single; - case fft_precision_double: - return rocfft_precision_double; - default: - throw std::runtime_error("Invalid precision"); - } -} - -inline rocfft_array_type rocfft_array_type_from_fftparams(const fft_array_type val) -{ - switch(val) - { - case fft_array_type_complex_interleaved: - return rocfft_array_type_complex_interleaved; - case fft_array_type_complex_planar: - return rocfft_array_type_complex_planar; - case fft_array_type_real: - return rocfft_array_type_real; - case fft_array_type_hermitian_interleaved: - return rocfft_array_type_hermitian_interleaved; - case fft_array_type_hermitian_planar: - return rocfft_array_type_hermitian_planar; - case fft_array_type_unset: - return rocfft_array_type_unset; - } - return rocfft_array_type_unset; -} - -inline rocfft_transform_type rocfft_transform_type_from_fftparams(const fft_transform_type val) -{ - switch(val) - 
{ - case fft_transform_type_complex_forward: - return rocfft_transform_type_complex_forward; - case fft_transform_type_complex_inverse: - return rocfft_transform_type_complex_inverse; - case fft_transform_type_real_forward: - return rocfft_transform_type_real_forward; - case fft_transform_type_real_inverse: - return rocfft_transform_type_real_inverse; - default: - throw std::runtime_error("Invalid transform type"); - } -} - -inline rocfft_result_placement - rocfft_result_placement_from_fftparams(const fft_result_placement val) -{ - switch(val) - { - case fft_placement_inplace: - return rocfft_placement_inplace; - case fft_placement_notinplace: - return rocfft_placement_notinplace; - default: - throw std::runtime_error("Invalid result placement"); - } -} - -class rocfft_params : public fft_params -{ -public: - rocfft_plan plan = nullptr; - rocfft_execution_info info = nullptr; - rocfft_plan_description desc = nullptr; - gpubuf_t wbuffer; - - explicit rocfft_params(){}; - - explicit rocfft_params(const fft_params& p) - : fft_params(p){}; - - rocfft_params(const rocfft_params&) = delete; - rocfft_params& operator=(const rocfft_params&) = delete; - - ~rocfft_params() - { - free(); - }; - - void free() - { - if(plan != nullptr) - { - rocfft_plan_destroy(plan); - plan = nullptr; - } - if(info != nullptr) - { - rocfft_execution_info_destroy(info); - info = nullptr; - } - if(desc != nullptr) - { - rocfft_plan_description_destroy(desc); - desc = nullptr; - } - } - - rocfft_precision get_rocfft_precision() - { - return rocfft_precision_from_fftparams(precision); - } - - size_t vram_footprint() override - { - size_t val = fft_params::vram_footprint(); - if(setup_structs() != fft_status_success) - { - throw std::runtime_error("Struct setup failed"); - } - val += workbuffersize; - - return val; - } - - fft_status setup_structs() - { - rocfft_status fft_status = rocfft_status_success; - if(desc == nullptr) - { - rocfft_plan_description_create(&desc); - if(fft_status != 
rocfft_status_success) - return fft_status_from_rocfftparams(fft_status); - - fft_status - = rocfft_plan_description_set_data_layout(desc, - rocfft_array_type_from_fftparams(itype), - rocfft_array_type_from_fftparams(otype), - ioffset.data(), - ooffset.data(), - istride_cm().size(), - istride_cm().data(), - idist, - ostride_cm().size(), - ostride_cm().data(), - odist); - if(fft_status != rocfft_status_success) - { - throw std::runtime_error("rocfft_plan_description_set_data_layout failed"); - } - - if(scale_factor != 1.0) - { - fft_status = rocfft_plan_description_set_scale_factor(desc, scale_factor); - if(fft_status != rocfft_status_success) - { - throw std::runtime_error("rocfft_plan_description_set_scale_factor failed"); - } - } - } - - if(plan == nullptr) - { - fft_status = rocfft_plan_create(&plan, - rocfft_result_placement_from_fftparams(placement), - rocfft_transform_type_from_fftparams(transform_type), - get_rocfft_precision(), - length_cm().size(), - length_cm().data(), - nbatch, - desc); - if(fft_status != rocfft_status_success) - { - throw std::runtime_error("rocfft_plan_create failed"); - } - } - - if(info == nullptr) - { - fft_status = rocfft_execution_info_create(&info); - if(fft_status != rocfft_status_success) - { - throw std::runtime_error("rocfft_execution_info_create failed"); - } - } - - fft_status = rocfft_plan_get_work_buffer_size(plan, &workbuffersize); - if(fft_status != rocfft_status_success) - { - throw std::runtime_error("rocfft_plan_get_work_buffer_size failed"); - } - - return fft_status_from_rocfftparams(fft_status); - } - - fft_status create_plan() override - { - fft_status ret = setup_structs(); - if(ret != fft_status_success) - { - return ret; - } - if(workbuffersize > 0) - { - hipError_t hip_status = hipSuccess; - hip_status = wbuffer.alloc(workbuffersize); - if(hip_status != hipSuccess) - { - std::ostringstream oss; - oss << "work buffer allocation failed (" << workbuffersize << " requested)"; - size_t mem_free = 0; - size_t 
mem_total = 0; - hip_status = hipMemGetInfo(&mem_free, &mem_total); - if(hip_status == hipSuccess) - { - oss << "free vram: " << mem_free << " total vram: " << mem_total; - } - else - { - oss << "hipMemGetInfo also failed"; - } - throw work_buffer_alloc_failure(oss.str()); - } - - auto rocret - = rocfft_execution_info_set_work_buffer(info, wbuffer.data(), workbuffersize); - if(rocret != rocfft_status_success) - { - throw std::runtime_error("rocfft_execution_info_set_work_buffer failed"); - } - } - - return ret; - } - - fft_status set_callbacks(void* load_cb_host, - void* load_cb_data, - void* store_cb_host, - void* store_cb_data) override - { - if(run_callbacks) - { - auto roc_status - = rocfft_execution_info_set_load_callback(info, &load_cb_host, &load_cb_data, 0); - if(roc_status != rocfft_status_success) - return fft_status_from_rocfftparams(roc_status); - - roc_status - = rocfft_execution_info_set_store_callback(info, &store_cb_host, &store_cb_data, 0); - if(roc_status != rocfft_status_success) - return fft_status_from_rocfftparams(roc_status); - } - return fft_status_success; - } - - fft_status execute(void** in, void** out) override - { - auto ret = rocfft_execute(plan, in, out, info); - return fft_status_from_rocfftparams(ret); - } -}; - -#endif diff -Nru rocfft-5.5.0/clients/samples/fixed-16/CMakeLists.txt rocfft-5.7.1/clients/samples/fixed-16/CMakeLists.txt --- rocfft-5.5.0/clients/samples/fixed-16/CMakeLists.txt 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/samples/fixed-16/CMakeLists.txt 2023-08-09 16:19:51.000000000 +0000 @@ -1,5 +1,5 @@ # ############################################################################# -# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -52,7 +52,7 @@ find_package( HIP REQUIRED ) endif() -set( sample_list fixed-16-float fixed-16-double ) +set( sample_list fixed-16-float fixed-16-double fixed-16-half ) foreach( sample ${sample_list} ) @@ -63,13 +63,13 @@ $ ) - target_link_libraries( ${sample} PRIVATE roc::rocfft ${FFTW_LIBRARIES} ) + target_link_libraries( ${sample} PRIVATE roc::rocfft hip::device ${FFTW_LIBRARIES} ) target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} ) set_target_properties( ${sample} PROPERTIES DEBUG_POSTFIX "-d" - CXX_STANDARD 14 + CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON ) diff -Nru rocfft-5.5.0/clients/samples/fixed-16/fixed-16-double.cpp rocfft-5.7.1/clients/samples/fixed-16/fixed-16-double.cpp --- rocfft-5.5.0/clients/samples/fixed-16/fixed-16-double.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/samples/fixed-16/fixed-16-double.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -24,7 +24,6 @@ #include #include #include -#include #include int main() @@ -43,57 +42,76 @@ // rocfft gpu compute // ======================================== - rocfft_setup(); + if(rocfft_setup() != rocfft_status_success) + throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(double2); // Create HIP device object. 
double2* x; - hipMalloc(&x, Nbytes); + if(hipMalloc(&x, Nbytes) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); // Copy data to device - hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice); + if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = NULL; size_t length = N; - rocfft_plan_create(&plan, - rocfft_placement_inplace, - rocfft_transform_type_complex_forward, - rocfft_precision_double, - 1, - &length, - 1, - NULL); + if(rocfft_plan_create(&plan, + rocfft_placement_inplace, + rocfft_transform_type_complex_forward, + rocfft_precision_double, + 1, + &length, + 1, + NULL) + != rocfft_status_success) + throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; - rocfft_plan_get_work_buffer_size(plan, &work_buf_size); + if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(work_buf_size) { - rocfft_execution_info_create(&info); - hipMalloc(&work_buf, work_buf_size); - rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size); + if(rocfft_execution_info_create(&info) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_create failed."); + if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); + if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) + != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan - rocfft_execute(plan, (void**)&x, NULL, info); + if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success) + throw std::runtime_error("rocfft_execute failed."); + if(hipDeviceSynchronize() != hipSuccess) + throw 
std::runtime_error("hipDeviceSynchronize failed."); // Clean up work buffer if(work_buf_size) { - hipFree(work_buf); - rocfft_execution_info_destroy(info); + if(hipFree(work_buf) != hipSuccess) + throw std::runtime_error("hipFree failed."); + if(rocfft_execution_info_destroy(info) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_destroy failed."); + info = nullptr; } // Destroy plan - rocfft_plan_destroy(plan); + if(rocfft_plan_destroy(plan) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_destroy failed."); + plan = nullptr; // Copy result back to host std::vector y(N); - hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost); + if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { @@ -101,9 +119,11 @@ << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } - hipFree(x); + if(hipFree(x) != hipSuccess) + throw std::runtime_error("hipFree failed."); - rocfft_cleanup(); + if(rocfft_cleanup() != rocfft_status_success) + throw std::runtime_error("rocfft_cleanup failed."); return 0; } diff -Nru rocfft-5.5.0/clients/samples/fixed-16/fixed-16-float.cpp rocfft-5.7.1/clients/samples/fixed-16/fixed-16-float.cpp --- rocfft-5.5.0/clients/samples/fixed-16/fixed-16-float.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/samples/fixed-16/fixed-16-float.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -19,19 +19,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*******************************************************************************/ + #include "rocfft.h" #include #include #include -#include #include int main() { - // For size N <= 4096 + const size_t N = 16; std::vector cx(N); + for(size_t i = 0; i < N; i++) { cx[i].x = i + (i % 3) - (i % 7); @@ -41,57 +42,76 @@ // rocfft gpu compute // ======================================== - rocfft_setup(); + if(rocfft_setup() != rocfft_status_success) + throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(float2); // Create HIP device object. float2* x; - hipMalloc(&x, Nbytes); + if(hipMalloc(&x, Nbytes) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); // Copy data to device - hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice); + if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = NULL; size_t length = N; - rocfft_plan_create(&plan, - rocfft_placement_inplace, - rocfft_transform_type_complex_forward, - rocfft_precision_single, - 1, - &length, - 1, - NULL); + if(rocfft_plan_create(&plan, + rocfft_placement_inplace, + rocfft_transform_type_complex_forward, + rocfft_precision_single, + 1, + &length, + 1, + NULL) + != rocfft_status_success) + throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; - rocfft_plan_get_work_buffer_size(plan, &work_buf_size); + if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(work_buf_size) { - rocfft_execution_info_create(&info); - hipMalloc(&work_buf, work_buf_size); - rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size); + if(rocfft_execution_info_create(&info) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_create failed."); + 
if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); + if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) + != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan - rocfft_execute(plan, (void**)&x, NULL, NULL); + if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success) + throw std::runtime_error("rocfft_execute failed."); + if(hipDeviceSynchronize() != hipSuccess) + throw std::runtime_error("hipDeviceSynchronize failed."); // Clean up work buffer if(work_buf_size) { - hipFree(work_buf); - rocfft_execution_info_destroy(info); + if(hipFree(work_buf) != hipSuccess) + throw std::runtime_error("hipFree failed."); + if(rocfft_execution_info_destroy(info) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_destroy failed."); + info = nullptr; } // Destroy plan - rocfft_plan_destroy(plan); + if(rocfft_plan_destroy(plan) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_destroy failed."); + plan = nullptr; // Copy result back to host std::vector y(N); - hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost); + if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { @@ -99,9 +119,11 @@ << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } - hipFree(x); + if(hipFree(x) != hipSuccess) + throw std::runtime_error("hipFree failed."); - rocfft_cleanup(); + if(rocfft_cleanup() != rocfft_status_success) + throw std::runtime_error("rocfft_cleanup failed."); return 0; } diff -Nru rocfft-5.5.0/clients/samples/fixed-16/fixed-16-half.cpp rocfft-5.7.1/clients/samples/fixed-16/fixed-16-half.cpp --- rocfft-5.5.0/clients/samples/fixed-16/fixed-16-half.cpp 1970-01-01 00:00:00.000000000 +0000 +++ rocfft-5.7.1/clients/samples/fixed-16/fixed-16-half.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -0,0 +1,131 
@@ +/****************************************************************************** +* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +* THE SOFTWARE. +*******************************************************************************/ + +#include "rocfft.h" +#include +#include +#include +#include + +int main() +{ + + const size_t N = 16; + + std::vector<_Float16_2> cx(N); + + for(size_t i = 0; i < N; i++) + { + cx[i].x = static_cast<_Float16>(i + (i % 3) - (i % 7)); + cx[i].y = 0; + } + + // rocfft gpu compute + // ======================================== + + if(rocfft_setup() != rocfft_status_success) + throw std::runtime_error("rocfft_setup failed."); + + size_t Nbytes = N * sizeof(_Float16_2); + + // Create HIP device object. 
+ _Float16_2* x = nullptr; + if(hipMalloc(&x, Nbytes) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); + + // Copy data to device + if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); + + // Create plan + rocfft_plan plan = NULL; + size_t length = N; + if(rocfft_plan_create(&plan, + rocfft_placement_inplace, + rocfft_transform_type_complex_forward, + rocfft_precision_half, + 1, + &length, + 1, + NULL) + != rocfft_status_success) + throw std::runtime_error("rocfft_plan_create failed."); + + // Check if the plan requires a work buffer + size_t work_buf_size = 0; + if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); + void* work_buf = nullptr; + rocfft_execution_info info = nullptr; + if(work_buf_size) + { + if(rocfft_execution_info_create(&info) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_create failed."); + if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); + if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) + != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); + } + + // Execute plan + if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success) + throw std::runtime_error("rocfft_execute failed."); + if(hipDeviceSynchronize() != hipSuccess) + throw std::runtime_error("hipDeviceSynchronize failed."); + + // Clean up work buffer + if(work_buf_size) + { + if(hipFree(work_buf) != hipSuccess) + throw std::runtime_error("hipFree failed."); + if(rocfft_execution_info_destroy(info) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_destroy failed."); + info = nullptr; + } + + // Destroy plan + if(rocfft_plan_destroy(plan) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_destroy 
failed."); + plan = nullptr; + + // Copy result back to host + std::vector<_Float16_2> y(N); + if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); + + for(size_t i = 0; i < N; i++) + { + std::cout << "element " << i << " input: (" << static_cast(cx[i].x) << "," + << static_cast(cx[i].y) << ")" + << " output: (" << static_cast(y[i].x) << "," + << static_cast(y[i].y) << ")" << std::endl; + } + + if(hipFree(x) != hipSuccess) + throw std::runtime_error("hipFree failed."); + + if(rocfft_cleanup() != rocfft_status_success) + throw std::runtime_error("rocfft_cleanup failed."); + + return 0; +} diff -Nru rocfft-5.5.0/clients/samples/fixed-large/CMakeLists.txt rocfft-5.7.1/clients/samples/fixed-large/CMakeLists.txt --- rocfft-5.5.0/clients/samples/fixed-large/CMakeLists.txt 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/samples/fixed-large/CMakeLists.txt 2023-08-09 16:19:51.000000000 +0000 @@ -1,5 +1,5 @@ # ############################################################################# -# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -68,7 +68,7 @@ set_target_properties( ${sample} PROPERTIES DEBUG_POSTFIX "-d" - CXX_STANDARD 14 + CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON ) diff -Nru rocfft-5.5.0/clients/samples/fixed-large/fixed-large-double.cpp rocfft-5.7.1/clients/samples/fixed-large/fixed-large-double.cpp --- rocfft-5.5.0/clients/samples/fixed-large/fixed-large-double.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/samples/fixed-large/fixed-large-double.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -44,60 +44,78 @@ // rocfft gpu compute // ======================================== - rocfft_setup(); + if(rocfft_setup() != rocfft_status_success) + throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(double2); // Create HIP device object. double2* x; - hipMalloc(&x, Nbytes); + if(hipMalloc(&x, Nbytes) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); // Copy data to device - hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice); + if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); // Create plan - rocfft_plan plan = NULL; + rocfft_plan plan = nullptr; size_t length = N; - rocfft_plan_create(&plan, - rocfft_placement_inplace, - rocfft_transform_type_complex_forward, - rocfft_precision_double, - 1, - &length, - 1, - NULL); + if(rocfft_plan_create(&plan, + rocfft_placement_inplace, + rocfft_transform_type_complex_forward, + rocfft_precision_double, + 1, + &length, + 1, + nullptr) + != rocfft_status_success) + throw std::runtime_error("rocfft_plan_create failed."); // Setup work buffer void* workBuffer = nullptr; size_t workBufferSize = 0; - rocfft_plan_get_work_buffer_size(plan, &workBufferSize); + if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success) + throw 
std::runtime_error("rocfft_plan_get_work_buffer_size failed."); // Setup exec info to pass work buffer to the library rocfft_execution_info info = nullptr; - rocfft_execution_info_create(&info); + if(rocfft_execution_info_create(&info) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_create failed."); if(workBufferSize > 0) { printf("size of workbuffer=%d\n", (int)workBufferSize); - hipMalloc(&workBuffer, workBufferSize); - rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize); + if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); + if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize) + != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan - rocfft_execute(plan, (void**)&x, NULL, info); - hipDeviceSynchronize(); + if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success) + throw std::runtime_error("rocfft_execute failed."); + if(hipDeviceSynchronize() != hipSuccess) + throw std::runtime_error("hipDeviceSynchronize failed."); // Destroy plan - rocfft_plan_destroy(plan); + if(rocfft_plan_destroy(plan) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_destroy failed."); + plan = nullptr; if(workBuffer) - hipFree(workBuffer); + if(hipFree(workBuffer) != hipSuccess) + throw std::runtime_error("hipFree failed."); - rocfft_execution_info_destroy(info); + if(rocfft_execution_info_destroy(info) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_destroy failed."); + info = nullptr; // Copy result back to host std::vector y(N); - hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost); + if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { @@ -105,7 +123,11 @@ << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } - 
rocfft_cleanup(); + if(hipFree(x) != hipSuccess) + throw std::runtime_error("hipFree failed."); + + if(rocfft_cleanup() != rocfft_status_success) + throw std::runtime_error("rocfft_cleanup failed."); return 0; } diff -Nru rocfft-5.5.0/clients/samples/fixed-large/fixed-large-float.cpp rocfft-5.7.1/clients/samples/fixed-large/fixed-large-float.cpp --- rocfft-5.5.0/clients/samples/fixed-large/fixed-large-float.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/samples/fixed-large/fixed-large-float.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -44,60 +44,78 @@ // rocfft gpu compute // ======================================== - rocfft_setup(); + if(rocfft_setup() != rocfft_status_success) + throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(float2); // Create HIP device object. float2* x; - hipMalloc(&x, Nbytes); + if(hipMalloc(&x, Nbytes) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); // Copy data to device - hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice); + if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); // Create plan - rocfft_plan plan = NULL; + rocfft_plan plan = nullptr; size_t length = N; - rocfft_plan_create(&plan, - rocfft_placement_inplace, - rocfft_transform_type_complex_forward, - rocfft_precision_single, - 1, - &length, - 1, - NULL); + if(rocfft_plan_create(&plan, + rocfft_placement_inplace, + rocfft_transform_type_complex_forward, + rocfft_precision_single, + 1, + &length, + 1, + nullptr) + != rocfft_status_success) + throw std::runtime_error("rocfft_plan_create failed."); // Setup work buffer void* workBuffer = nullptr; size_t workBufferSize = 0; - rocfft_plan_get_work_buffer_size(plan, &workBufferSize); + if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); // Setup exec info to pass work buffer to the library 
rocfft_execution_info info = nullptr; - rocfft_execution_info_create(&info); + if(rocfft_execution_info_create(&info) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_create failed."); if(workBufferSize > 0) { printf("size of workbuffer=%d\n", (int)workBufferSize); - hipMalloc(&workBuffer, workBufferSize); - rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize); + if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); + if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize) + != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan - rocfft_execute(plan, (void**)&x, NULL, info); - hipDeviceSynchronize(); + if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success) + throw std::runtime_error("rocfft_execute failed."); + if(hipDeviceSynchronize() != hipSuccess) + throw std::runtime_error("hipDeviceSynchronize failed."); // Destroy plan - rocfft_plan_destroy(plan); + if(rocfft_plan_destroy(plan) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_destroy failed."); + plan = nullptr; if(workBuffer) - hipFree(workBuffer); + if(hipFree(workBuffer) != hipSuccess) + throw std::runtime_error("hipFree failed."); - rocfft_execution_info_destroy(info); + if(rocfft_execution_info_destroy(info) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_destroy failed."); + info = nullptr; // Copy result back to host std::vector y(N); - hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost); + if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { @@ -105,7 +123,11 @@ << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } - rocfft_cleanup(); + if(hipFree(x) != hipSuccess) + throw std::runtime_error("hipFree failed."); + + if(rocfft_cleanup() != 
rocfft_status_success) + throw std::runtime_error("rocfft_cleanup failed."); return 0; } diff -Nru rocfft-5.5.0/clients/samples/rocfft/CMakeLists.txt rocfft-5.7.1/clients/samples/rocfft/CMakeLists.txt --- rocfft-5.5.0/clients/samples/rocfft/CMakeLists.txt 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/samples/rocfft/CMakeLists.txt 2023-08-09 16:19:51.000000000 +0000 @@ -1,5 +1,5 @@ # ############################################################################# -# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -52,8 +52,8 @@ find_package( HIP REQUIRED ) endif() -if( NOT rocrand_FOUND ) - find_package( rocrand REQUIRED ) +if( NOT hiprand_FOUND ) + find_package( hiprand REQUIRED ) endif() find_package( Boost COMPONENTS program_options REQUIRED) @@ -82,7 +82,7 @@ target_link_libraries( ${sample} PRIVATE roc::rocfft - roc::rocrand + hip::hiprand ${Boost_LIBRARIES} ) @@ -90,7 +90,7 @@ set_target_properties( ${sample} PROPERTIES DEBUG_POSTFIX "-d" - CXX_STANDARD 11 + CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON ) diff -Nru rocfft-5.5.0/clients/samples/rocfft/examplekernels.h rocfft-5.7.1/clients/samples/rocfft/examplekernels.h --- rocfft-5.5.0/clients/samples/rocfft/examplekernels.h 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/samples/rocfft/examplekernels.h 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -18,10 +18,11 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. -#ifndef __EXAMPLEKERNELS_H__ -#define __EXAMPLEKERNELS_H__ +#ifndef EXAMPLEKERNELS_H +#define EXAMPLEKERNELS_H -#include "../../data_gen.h" +#include "../../../shared/data_gen.h" +#include #include #include @@ -69,42 +70,42 @@ } // Kernel for initializing 1D complex data on the GPU. -__global__ void initcdata1(std::complex* x, const size_t Nx, const size_t xstride) +__global__ void initcdata1(hipDoubleComplex* x, const size_t Nx, const size_t xstride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx < Nx) { const auto pos = idx * xstride; - x[pos].real(1 + idx); - x[pos].imag(1 + idx); + x[pos].x = 1 + idx; + x[pos].y = 1 + idx; } } // Kernel for initializing 2D complex input data on the GPU. -__global__ void initcdata2(std::complex* x, - const size_t Nx, - const size_t Ny, - const size_t xstride, - const size_t ystride) +__global__ void initcdata2(hipDoubleComplex* x, + const size_t Nx, + const size_t Ny, + const size_t xstride, + const size_t ystride) { const auto idx = blockIdx.x * blockDim.x + threadIdx.x; const auto idy = blockIdx.y * blockDim.y + threadIdx.y; if(idx < Nx && idy < Ny) { const auto pos = idx * xstride + idy * ystride; - x[pos].real(idx + 1); - x[pos].imag(idy + 1); + x[pos].x = idx + 1; + x[pos].y = idy + 1; } } // Kernel for initializing 3D complex input data on the GPU. 
-__global__ void initcdata3(std::complex* x, - const size_t Nx, - const size_t Ny, - const size_t Nz, - const size_t xstride, - const size_t ystride, - const size_t zstride) +__global__ void initcdata3(hipDoubleComplex* x, + const size_t Nx, + const size_t Ny, + const size_t Nz, + const size_t xstride, + const size_t ystride, + const size_t zstride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; const size_t idy = blockIdx.y * blockDim.y + threadIdx.y; @@ -112,8 +113,8 @@ if(idx < Nx && idy < Ny && idz < Nz) { const auto pos = idx * xstride + idy * ystride + idz * zstride; - x[pos].real(idx + 10.0 * idz + 1); - x[pos].imag(idy + 10); + x[pos].x = idx + 10.0 * idz + 1; + x[pos].y = idy + 10; } } @@ -141,7 +142,7 @@ griddim, 0, 0, - (std::complex*)gpu_in, + (hipDoubleComplex*)gpu_in, length_cm[0], stride_cm[0]); break; @@ -155,7 +156,7 @@ griddim, 0, 0, - (std::complex*)gpu_in, + (hipDoubleComplex*)gpu_in, length_cm[0], length_cm[1], stride_cm[0], @@ -173,7 +174,7 @@ griddim, 0, 0, - (std::complex*)gpu_in, + (hipDoubleComplex*)gpu_in, length_cm[0], length_cm[1], length_cm[2], @@ -255,12 +256,12 @@ { case 1: { - hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1, + hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1, dim3(1), dim3(1), 0, 0, - (std::complex*)gpu_in, + (hipDoubleComplex*)gpu_in, length[0], stride[0], 1, @@ -270,12 +271,12 @@ } case 2: { - hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2, + hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2, dim3(256), dim3(ceildiv(ceildiv(ilength[1], 2), 256)), 0, 0, - (std::complex*)gpu_in, + (hipDoubleComplex*)gpu_in, length[0], length[1], stride[0], @@ -288,12 +289,12 @@ } case 3: { - hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3, + hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3, dim3(64, 64), dim3(ceildiv(ilength[1], 64), ceildiv(ceildiv(ilength[2], 2), 64)), 0, 0, - (std::complex*)gpu_in, + (hipDoubleComplex*)gpu_in, length[0], length[1], 
length[2], @@ -326,20 +327,14 @@ { const dim3 blockdim(256); const dim3 griddim(ceildiv(ilength[0], blockdim.x)); - hipLaunchKernelGGL(initcdata1, - blockdim, - griddim, - 0, - 0, - (std::complex*)gpu_in, - ilength[0], - stride[0]); - hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1, + hipLaunchKernelGGL( + initcdata1, blockdim, griddim, 0, 0, (hipDoubleComplex*)gpu_in, ilength[0], stride[0]); + hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1, dim3(1), dim3(1), 0, 0, - (std::complex*)gpu_in, + (hipDoubleComplex*)gpu_in, length[0], stride[0], 1, @@ -356,17 +351,17 @@ griddim, 0, 0, - (std::complex*)gpu_in, + (hipDoubleComplex*)gpu_in, ilength[0], ilength[1], stride[0], stride[1]); - hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2, + hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2, dim3(256), dim3(ceildiv(ceildiv(ilength[1], 2), 256)), 0, 0, - (std::complex*)gpu_in, + (hipDoubleComplex*)gpu_in, length[0], length[1], stride[0], @@ -389,7 +384,7 @@ griddim, 0, 0, - (std::complex*)gpu_in, + (hipDoubleComplex*)gpu_in, ilength[0], ilength[1], ilength[2], @@ -397,12 +392,12 @@ stride[1], stride[2]); - hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3, + hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3, dim3(64, 64), dim3(ceildiv(ilength[1], 64), ceildiv(ceildiv(ilength[2], 2), 64)), 0, 0, - (std::complex*)gpu_in, + (hipDoubleComplex*)gpu_in, length[0], length[1], length[2], @@ -423,4 +418,4 @@ impose_hermitian_symmetry_cm(length, ilength, stride, gpu_in); } -#endif +#endif /* EXAMPLEKERNELS_H */ diff -Nru rocfft-5.5.0/clients/samples/rocfft/exampleutils.h rocfft-5.7.1/clients/samples/rocfft/exampleutils.h --- rocfft-5.5.0/clients/samples/rocfft/exampleutils.h 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/samples/rocfft/exampleutils.h 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -18,8 +18,14 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. -#ifndef __EXAMPLEUTILS_H__ -#define __EXAMPLEUTILS_H__ +#ifndef EXAMPLEUTILS_H +#define EXAMPLEUTILS_H + +std::ostream& operator<<(std::ostream& stream, hipDoubleComplex c) +{ + stream << "(" << c.x << "," << c.y << ")"; + return stream; +} // Increment the index (column-major) for looping over arbitrary dimensional loops with // dimensions length. @@ -60,7 +66,9 @@ const auto i = std::inner_product(index.begin(), index.end(), stride.begin(), b * dist); assert(i >= 0); assert(i < data.size()); + std::cout << data[i] << " "; + for(size_t idx = 0; idx < index.size(); ++idx) { if(index[idx] == (length[idx] - 1)) @@ -80,13 +88,13 @@ // Check that an multi-dimensional array of complex values with dimensions length // and straide stride, with nbatch copies separated by dist is Hermitian-symmetric. // Column-major version. 
-template -bool check_symmetry_cm(const std::vector>& data, - const std::vector& length_cm, - const std::vector& stride_cm, - const size_t nbatch, - const size_t dist, - const bool verbose = true) +template +bool check_symmetry_cm(const std::vector& data, + const std::vector& length_cm, + const std::vector& stride_cm, + const size_t nbatch, + const size_t dist, + const bool verbose = true) { bool issymmetric = true; for(size_t b = 0; b < nbatch; b++) @@ -118,7 +126,7 @@ = std::inner_product(index.begin(), index.end(), stride_cm.begin(), b * dist); const auto j = std::inner_product( negindex.begin(), negindex.end(), stride_cm.begin(), b * dist); - if(data[i] != std::conj(data[j])) + if((data[i].x != data[j].x) or (data[i].y != -data[j].y)) { if(verbose) { @@ -152,4 +160,4 @@ return issymmetric; } -#endif +#endif /* EXAMPLEUTILS_H */ diff -Nru rocfft-5.5.0/clients/samples/rocfft/rocfft_example_callback.cpp rocfft-5.7.1/clients/samples/rocfft/rocfft_example_callback.cpp --- rocfft-5.5.0/clients/samples/rocfft/rocfft_example_callback.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/samples/rocfft/rocfft_example_callback.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -26,6 +26,7 @@ #include #include #include +#include #include // example of using load/store callbacks with rocfft @@ -63,77 +64,113 @@ } // rocfft gpu compute - // ======================================== + // ================== - rocfft_setup(); + if(rocfft_setup() != rocfft_status_success) + throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(double2); // Create HIP device object. 
double2 *x, *filter_dev; - hipMalloc(&x, Nbytes); - hipMalloc(&filter_dev, Nbytes); + + // create buffers + if(hipMalloc(&x, Nbytes) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); + + if(hipMalloc(&filter_dev, Nbytes) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); // Copy data to device - hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice); - hipMemcpy(filter_dev, filter.data(), Nbytes, hipMemcpyHostToDevice); + hipError_t hip_status = hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); + + hip_status = hipMemcpy(filter_dev, filter.data(), Nbytes, hipMemcpyHostToDevice); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); // Create plan - rocfft_plan plan = NULL; + rocfft_plan plan = nullptr; size_t length = N; - rocfft_plan_create(&plan, - rocfft_placement_inplace, - rocfft_transform_type_complex_forward, - rocfft_precision_double, - 1, - &length, - 1, - NULL); + if(rocfft_plan_create(&plan, + rocfft_placement_inplace, + rocfft_transform_type_complex_forward, + rocfft_precision_double, + 1, + &length, + 1, + nullptr) + != rocfft_status_success) + throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; - rocfft_plan_get_work_buffer_size(plan, &work_buf_size); + if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; - rocfft_execution_info_create(&info); + if(rocfft_execution_info_create(&info) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_create failed."); if(work_buf_size) { - hipMalloc(&work_buf, work_buf_size); - rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size); + if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) + throw 
std::runtime_error("hipMalloc failed."); + + if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) + != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } - // prepare callback + // Prepare callback load_cbdata cbdata_host; cbdata_host.filter = filter_dev; cbdata_host.scale = 1.0 / static_cast(N); + void* cbdata_dev; - hipMalloc(&cbdata_dev, sizeof(load_cbdata)); - hipMemcpy(cbdata_dev, &cbdata_host, sizeof(load_cbdata), hipMemcpyHostToDevice); + if(hipMalloc(&cbdata_dev, sizeof(load_cbdata)) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); + + hip_status = hipMemcpy(cbdata_dev, &cbdata_host, sizeof(load_cbdata), hipMemcpyHostToDevice); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); - // get a properly-typed host pointer to the device function, as + // Get a properly-typed host pointer to the device function, as // rocfft_execution_info_set_load_callback expects void*. void* cbptr_host = nullptr; - hipMemcpyFromSymbol(&cbptr_host, HIP_SYMBOL(load_callback_dev), sizeof(void*)); + hip_status = hipMemcpyFromSymbol(&cbptr_host, HIP_SYMBOL(load_callback_dev), sizeof(void*)); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpyFromSymbol failed."); // set callback - rocfft_execution_info_set_load_callback(info, &cbptr_host, &cbdata_dev, 0); + if(rocfft_execution_info_set_load_callback(info, &cbptr_host, &cbdata_dev, 0) + != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_set_load_callback failed."); // Execute plan - rocfft_execute(plan, (void**)&x, NULL, info); + if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success) + throw std::runtime_error("rocfft_execute failed."); // Clean up work buffer if(work_buf_size) { - hipFree(work_buf); - rocfft_execution_info_destroy(info); + if(hipFree(work_buf) != hipSuccess) + throw std::runtime_error("hipFree failed."); + 
if(rocfft_execution_info_destroy(info) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_destroy failed."); + info = nullptr; } // Destroy plan - rocfft_plan_destroy(plan); + if(rocfft_plan_destroy(plan) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_destroy failed."); + plan = nullptr; // Copy result back to host std::vector y(N); - hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost); + hip_status = hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { @@ -141,11 +178,15 @@ << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } - hipFree(cbdata_dev); - hipFree(filter_dev); - hipFree(x); + if(hipFree(cbdata_dev) != hipSuccess) + throw std::runtime_error("hipFree failed."); + if(hipFree(filter_dev) != hipSuccess) + throw std::runtime_error("hipFree failed."); + if(hipFree(x) != hipSuccess) + throw std::runtime_error("hipFree failed."); - rocfft_cleanup(); + if(rocfft_cleanup() != rocfft_status_success) + throw std::runtime_error("rocfft_cleanup failed."); return 0; } diff -Nru rocfft-5.5.0/clients/samples/rocfft/rocfft_example_complexcomplex.cpp rocfft-5.7.1/clients/samples/rocfft/rocfft_example_complexcomplex.cpp --- rocfft-5.5.0/clients/samples/rocfft/rocfft_example_complexcomplex.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/samples/rocfft/rocfft_example_complexcomplex.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -32,6 +32,7 @@ #include "examplekernels.h" #include "exampleutils.h" +#include int main(int argc, char* argv[]) { @@ -63,7 +64,8 @@ } // Placeness for the transform - rocfft_setup(); + if(rocfft_setup() != rocfft_status_success) + throw std::runtime_error("rocfft_setup failed."); const rocfft_result_placement place = vm.count("outofplace") ? rocfft_placement_notinplace : rocfft_placement_inplace; const bool inplace = place == rocfft_placement_inplace; @@ -117,26 +119,34 @@ std::cout << std::endl; // Set the device: - hipSetDevice(deviceId); + if(hipSetDevice(deviceId) != hipSuccess) + throw std::runtime_error("hipSetDevice failed."); - // Create HIP device object and copy data to device - double2* gpu_in = NULL; - hipMalloc(&gpu_in, isize * sizeof(std::complex)); + // Create HIP device object and allocate data + hipDoubleComplex* gpu_in = nullptr; + if(hipMalloc(&gpu_in, isize * sizeof(hipDoubleComplex)) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); // Inititalize the data on the device initcomplex_cm(length, istride, gpu_in); - hipDeviceSynchronize(); + if(hipDeviceSynchronize() != hipSuccess) + throw std::runtime_error("hipDeviceSynchronize failed."); + hipError_t hip_status = hipGetLastError(); if(hip_status != hipSuccess) throw std::runtime_error("device error"); std::cout << "input:\n"; - std::vector> idata(isize); - hipMemcpy(idata.data(), gpu_in, isize * sizeof(std::complex), hipMemcpyDefault); + std::vector idata(isize); + hip_status + = hipMemcpy(idata.data(), gpu_in, isize * sizeof(hipDoubleComplex), hipMemcpyDefault); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); + printbuffer_cm(idata, length, istride, 1, isize); // Create the a descrition struct to set data layout: - rocfft_plan_description gpu_description = NULL; + 
rocfft_plan_description gpu_description = nullptr; // rocfft_status can be used to capture API status info rocfft_status rc = rocfft_plan_description_create(&gpu_description); if(rc != rocfft_status_success) @@ -144,8 +154,8 @@ rc = rocfft_plan_description_set_data_layout(gpu_description, rocfft_array_type_complex_interleaved, rocfft_array_type_complex_interleaved, - NULL, - NULL, + nullptr, + nullptr, istride.size(), // input stride length istride.data(), // input stride data 0, // input batch distance @@ -154,12 +164,12 @@ 0); // ouptut batch distance if(rc != rocfft_status_success) throw std::runtime_error("failed to set data layout"); - // We can also pass "NULL" instead of a description; rocFFT will use reasonable + // We can also pass "nullptr" instead of a description; rocFFT will use reasonable // default parameters. If the data isn't contiguous, we need to set strides, etc, // using the description. // Create the plan - rocfft_plan gpu_plan = NULL; + rocfft_plan gpu_plan = nullptr; rc = rocfft_plan_create(&gpu_plan, place, direction, @@ -172,7 +182,7 @@ throw std::runtime_error("failed to create plan"); // Get the execution info for the fft plan (in particular, work memory requirements): - rocfft_execution_info planinfo = NULL; + rocfft_execution_info planinfo = nullptr; rc = rocfft_execution_info_create(&planinfo); if(rc != rocfft_status_success) throw std::runtime_error("failed to create execution info"); @@ -182,24 +192,24 @@ throw std::runtime_error("failed to get work buffer size"); // If the transform requires work memory, allocate a work buffer: - void* wbuffer = NULL; + void* wbuffer = nullptr; if(workbuffersize > 0) { hip_status = hipMalloc(&wbuffer, workbuffersize); if(hip_status != hipSuccess) - throw std::runtime_error("hipMalloc failed"); + throw std::runtime_error("hipMalloc failed."); rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize); if(rc != rocfft_status_success) - throw std::runtime_error("failed to set work 
buffer"); + throw std::runtime_error("failed to set work buffer."); } // If the transform is out-of-place, allocate the output buffer as well: - double2* gpu_out = inplace ? gpu_in : NULL; + double2* gpu_out = inplace ? gpu_in : nullptr; if(!inplace) { - hip_status = hipMalloc(&gpu_out, osize * sizeof(std::complex)); + hip_status = hipMalloc(&gpu_out, osize * sizeof(hipDoubleComplex)); if(hip_status != hipSuccess) - throw std::runtime_error("hipMalloc failed"); + throw std::runtime_error("hipMalloc failed."); } // Execute the GPU transform: @@ -208,30 +218,45 @@ (void**)&gpu_out, // out_buffer planinfo); // execution info if(rc != rocfft_status_success) - throw std::runtime_error("failed to execute"); + throw std::runtime_error("failed to execute."); // Get the output from the device and print to cout: std::cout << "output:\n"; - std::vector> odata(osize); - hipMemcpy(odata.data(), gpu_out, osize * sizeof(std::complex), hipMemcpyDeviceToHost); + std::vector odata(osize); + hip_status + = hipMemcpy(odata.data(), gpu_out, osize * sizeof(hipDoubleComplex), hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); + printbuffer_cm(odata, length, istride, 1, isize); // Clean up: free GPU memory: - hipFree(gpu_in); + if(hipFree(gpu_in) != hipSuccess) + throw std::runtime_error("hipFree failed."); + if(!inplace) { - hipFree(gpu_out); + if(hipFree(gpu_out) != hipSuccess) + throw std::runtime_error("hipFree failed."); } - if(wbuffer != NULL) + if(wbuffer != nullptr) { - hipFree(wbuffer); + if(hipFree(wbuffer) != hipSuccess) + throw std::runtime_error("hipFree failed."); } // Clean up: destroy plans: - rocfft_execution_info_destroy(planinfo); - rocfft_plan_description_destroy(gpu_description); - rocfft_plan_destroy(gpu_plan); + if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_destroy failed."); + planinfo = nullptr; + 
if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_description_destroy failed."); + gpu_description = nullptr; + if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_destroy failed."); + gpu_plan = nullptr; - rocfft_cleanup(); + if(rocfft_cleanup() != rocfft_status_success) + throw std::runtime_error("rocfft_cleanup failed."); return 0; } diff -Nru rocfft-5.5.0/clients/samples/rocfft/rocfft_example_realcomplex.cpp rocfft-5.7.1/clients/samples/rocfft/rocfft_example_realcomplex.cpp --- rocfft-5.5.0/clients/samples/rocfft/rocfft_example_realcomplex.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/samples/rocfft/rocfft_example_realcomplex.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -32,6 +32,7 @@ #include "examplekernels.h" #include "exampleutils.h" +#include int main(int argc, char* argv[]) { @@ -63,7 +64,8 @@ } // Placeness for the transform - rocfft_setup(); + if(rocfft_setup() != rocfft_status_success) + throw std::runtime_error("rocfft_setup failed."); const rocfft_result_placement place = vm.count("outofplace") ? rocfft_placement_notinplace : rocfft_placement_inplace; const bool inplace = place == rocfft_placement_inplace; @@ -97,16 +99,16 @@ cstride.push_back(clength[i - 1] * cstride[i - 1]); } const size_t complex_size = clength[clength.size() - 1] * cstride[cstride.size() - 1]; - std::vector> cdata(complex_size); // host storage + std::vector cdata(complex_size); // host storage // Based on the direction, we set the input and output parameters appropriately. const size_t isize = forward ? 
real_size : complex_size; - const size_t ibytes = isize * (forward ? sizeof(double) : sizeof(std::complex)); + const size_t ibytes = isize * (forward ? sizeof(double) : sizeof(hipDoubleComplex)); const std::vector ilength = forward ? length : clength; const std::vector istride = forward ? rstride : cstride; const size_t osize = forward ? complex_size : real_size; - const size_t obytes = osize * (forward ? sizeof(std::complex) : sizeof(double)); + const size_t obytes = osize * (forward ? sizeof(hipDoubleComplex) : sizeof(double)); const std::vector olength = forward ? clength : length; const std::vector ostride = forward ? cstride : rstride; @@ -147,11 +149,12 @@ std::cout << std::endl; // Set the device: - hipSetDevice(deviceId); + if(hipSetDevice(deviceId) != hipSuccess) + throw std::runtime_error("hipSetDevice failed."); // Create HIP device object and initialize data // Kernels are provided in examplekernels.h - void* gpu_in = NULL; + void* gpu_in = nullptr; hipError_t hip_status = hipMalloc(&gpu_in, inplace ? 
std::max(ibytes, obytes) : ibytes); if(hip_status != hipSuccess) throw std::runtime_error("device error"); @@ -169,12 +172,16 @@ std::cout << "input:\n"; if(forward) { - hipMemcpy(rdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); + hip_status = hipMemcpy(rdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(rdata, ilength, istride, 1, isize); } else { - hipMemcpy(cdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); + hip_status = hipMemcpy(cdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(cdata, ilength, istride, 1, isize); // Check that the buffer is Hermitian symmetric: @@ -185,7 +192,7 @@ rocfft_status rc = rocfft_status_success; // Create the a descrition struct to set data layout: - rocfft_plan_description gpu_description = NULL; + rocfft_plan_description gpu_description = nullptr; rc = rocfft_plan_description_create(&gpu_description); if(rc != rocfft_status_success) throw std::runtime_error("failed to create plan description"); @@ -196,8 +203,8 @@ forward ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved, // output data format: forward ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real, - NULL, - NULL, + nullptr, + nullptr, istride.size(), // input stride length istride.data(), // input stride data 0, // input batch distance @@ -207,12 +214,12 @@ if(rc != rocfft_status_success) throw std::runtime_error("failed to set data layout"); - // We can also pass "NULL" instead of a description; rocFFT will use reasonable + // We can also pass "nullptr" instead of a description; rocFFT will use reasonable // default parameters. If the data isn't contiguous, we need to set strides, etc, // using the description. 
// Create the FFT plan: - rocfft_plan gpu_plan = NULL; + rocfft_plan gpu_plan = nullptr; rc = rocfft_plan_create(&gpu_plan, place, direction, @@ -225,7 +232,7 @@ throw std::runtime_error("failed to create plan"); // Get the execution info for the fft plan (in particular, work memory requirements): - rocfft_execution_info planinfo = NULL; + rocfft_execution_info planinfo = nullptr; rc = rocfft_execution_info_create(&planinfo); if(rc != rocfft_status_success) throw std::runtime_error("failed to create execution info"); @@ -236,7 +243,7 @@ throw std::runtime_error("failed to get work buffer size"); // If the transform requires work memory, allocate a work buffer: - void* wbuffer = NULL; + void* wbuffer = nullptr; if(workbuffersize > 0) { hip_status = hipMalloc(&wbuffer, workbuffersize); @@ -249,7 +256,7 @@ } // If the transform is out-of-place, allocate the output buffer as well: - void* gpu_out = inplace ? gpu_in : NULL; + void* gpu_out = inplace ? gpu_in : nullptr; if(!inplace) { hip_status = hipMalloc(&gpu_out, obytes); @@ -269,30 +276,44 @@ std::cout << "output:\n"; if(forward) { - hipMemcpy(cdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); + hip_status = hipMemcpy(cdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(cdata, olength, ostride, 1, osize); } else { - hipMemcpy(rdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); + hip_status = hipMemcpy(rdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(rdata, olength, ostride, 1, osize); } // Clean up: free GPU memory: - hipFree(gpu_in); + if(hipFree(gpu_in) != hipSuccess) + throw std::runtime_error("hipFree failed."); + if(!inplace) { - hipFree(gpu_out); + if(hipFree(gpu_out) != hipSuccess) + throw std::runtime_error("hipFree failed."); } - if(wbuffer != NULL) + if(wbuffer != nullptr) { - hipFree(wbuffer); + 
if(hipFree(wbuffer) != hipSuccess) + throw std::runtime_error("hipFree failed."); } // Clean up: destroy plans: - rocfft_execution_info_destroy(planinfo); - rocfft_plan_description_destroy(gpu_description); - rocfft_plan_destroy(gpu_plan); + if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_destroy failed."); + planinfo = nullptr; + if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_description_destroy failed."); + gpu_description = nullptr; + if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_destroy failed."); + gpu_plan = nullptr; rocfft_cleanup(); return 0; diff -Nru rocfft-5.5.0/clients/samples/rocfft/rocfft_example_set_stream.cpp rocfft-5.7.1/clients/samples/rocfft/rocfft_example_set_stream.cpp --- rocfft-5.5.0/clients/samples/rocfft/rocfft_example_set_stream.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/samples/rocfft/rocfft_example_set_stream.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -22,103 +22,120 @@ #include #include #include - -#define CHECK_HIP_ERR(err) \ - if(err != hipSuccess) \ - { \ - std::cerr << "hip error code : " << err << std::endl; \ - exit(-1); \ - } - -#define CHECK_ROCFFT_ERR(err) \ - if(err != rocfft_status_success) \ - { \ - std::cerr << "rocFFT error code : " << err << std::endl; \ - exit(-1); \ - } +#include +#include struct fft_fixture_t { - double2* cpu_buf; - double2* gpu_buf; - hipStream_t stream; - rocfft_execution_info info; - rocfft_plan plan; + std::vector cpu_buf; + double2* gpu_buf = nullptr; + hipStream_t stream = nullptr; + rocfft_execution_info info = nullptr; + rocfft_plan plan = nullptr; }; int main(int argc, char* argv[]) { std::cout << "rocfft example of 2 inplace transforms with 2 streams.\n" << std::endl; - size_t length = 8; - size_t total_bytes = length * sizeof(double2); + size_t length = 8; + size_t total_bytes = length * sizeof(double2); + hipError_t hip_status; + rocfft_status fft_status; fft_fixture_t ffts[2]; /// preparation - rocfft_setup(); + if(rocfft_setup() != rocfft_status_success) + throw std::runtime_error("rocfft_setup failed."); for(auto& it : ffts) { // create cpu buffer - it.cpu_buf = new double2[length]; + it.cpu_buf.resize(length); // init cpu buffer... 
// create gpu buffer - CHECK_HIP_ERR(hipMalloc(&(it.gpu_buf), total_bytes)); + if(hipMalloc(&(it.gpu_buf), total_bytes) != hipSuccess) + throw std::runtime_error("hipMalloc failed."); // copy host to device - CHECK_HIP_ERR(hipMemcpy(it.gpu_buf, it.cpu_buf, total_bytes, hipMemcpyHostToDevice)); + if(hipMemcpy(it.gpu_buf, it.cpu_buf.data(), total_bytes, hipMemcpyHostToDevice) + != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); // create stream - CHECK_HIP_ERR(hipStreamCreate(&(it.stream))); + if(hipStreamCreate(&(it.stream)) != hipSuccess) + throw std::runtime_error("hipStreamCreate failed."); // create execution info - CHECK_ROCFFT_ERR(rocfft_execution_info_create(&(it.info))); + fft_status = rocfft_execution_info_create(&(it.info)); + if(fft_status != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_create failed."); // set stream // NOTE: The stream must be of type hipStream_t. // It is an error to pass the address of a hipStream_t object. - CHECK_ROCFFT_ERR(rocfft_execution_info_set_stream(it.info, it.stream)); + fft_status = rocfft_execution_info_set_stream(it.info, it.stream); + if(fft_status != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_set_stream failed."); // create plan - CHECK_ROCFFT_ERR(rocfft_plan_create(&it.plan, - rocfft_placement_inplace, - rocfft_transform_type_complex_forward, - rocfft_precision_double, - 1, - &length, - 1, - nullptr)); + fft_status = rocfft_plan_create(&it.plan, + rocfft_placement_inplace, + rocfft_transform_type_complex_forward, + rocfft_precision_double, + 1, + &length, + 1, + nullptr); + if(fft_status != rocfft_status_success) + throw std::runtime_error("rocfft_plan_create failed."); + size_t work_buf_size = 0; - CHECK_ROCFFT_ERR(rocfft_plan_get_work_buffer_size(it.plan, &work_buf_size)); + fft_status = rocfft_plan_get_work_buffer_size(it.plan, &work_buf_size); + if(fft_status != rocfft_status_success) + throw std::runtime_error("rocfft_plan_get_work_buffer_size 
failed."); + assert(work_buf_size == 0); // simple 1D inplace fft doesn't need extra working buffer } /// execution for(auto& it : ffts) { - CHECK_ROCFFT_ERR( - rocfft_execute(it.plan, (void**)&(it.gpu_buf), (void**)&(it.gpu_buf), nullptr)); + fft_status = rocfft_execute(it.plan, (void**)&(it.gpu_buf), (void**)&(it.gpu_buf), nullptr); + if(fft_status != rocfft_status_success) + throw std::runtime_error("rocfft_execute failed."); } /// wait and copy back for(auto& it : ffts) { - CHECK_HIP_ERR(hipStreamSynchronize(it.stream)); - CHECK_HIP_ERR(hipMemcpy(it.cpu_buf, it.gpu_buf, total_bytes, hipMemcpyDeviceToHost)); + if(hipStreamSynchronize(it.stream) != hipSuccess) + throw std::runtime_error("hipStreamSynchronize failed."); + hip_status = hipMemcpy(it.cpu_buf.data(), it.gpu_buf, total_bytes, hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); } /// clean up for(auto& it : ffts) { - CHECK_ROCFFT_ERR(rocfft_plan_destroy(it.plan)); - CHECK_ROCFFT_ERR(rocfft_execution_info_destroy(it.info)); - CHECK_HIP_ERR(hipStreamDestroy(it.stream)); - CHECK_HIP_ERR(hipFree(it.gpu_buf)); - delete[] it.cpu_buf; + fft_status = rocfft_plan_destroy(it.plan); + if(fft_status != rocfft_status_success) + throw std::runtime_error("rocfft_plan_destroy failed."); + + fft_status = rocfft_execution_info_destroy(it.info); + if(fft_status != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_destroy failed."); + + if(hipStreamDestroy(it.stream) != hipSuccess) + throw std::runtime_error("hipStreamDestroy failed."); + if(hipFree(it.gpu_buf) != hipSuccess) + throw std::runtime_error("hipFree failed."); } - rocfft_cleanup(); + if(rocfft_cleanup() != rocfft_status_success) + throw std::runtime_error("rocfft_cleanup failed."); return 0; } diff -Nru rocfft-5.5.0/clients/tests/CMakeLists.txt rocfft-5.7.1/clients/tests/CMakeLists.txt --- rocfft-5.5.0/clients/tests/CMakeLists.txt 2023-01-31 06:20:16.000000000 +0000 +++ 
rocfft-5.7.1/clients/tests/CMakeLists.txt 2023-08-09 16:19:51.000000000 +0000 @@ -60,8 +60,8 @@ find_package( ROCM 0.7.3 REQUIRED ) endif() -if( NOT rocrand_FOUND ) - find_package( rocrand REQUIRED ) +if( NOT hiprand_FOUND ) + find_package( hiprand REQUIRED ) endif() include( ROCMInstallTargets ) @@ -161,18 +161,15 @@ # FFTW we build is always threaded set( FFTW_MULTITHREAD TRUE ) -endif() -if( BUILD_FFTW OR NOT FFTW_FOUND ) add_dependencies( rocfft-test fftw_double fftw_single ) + rocm_install( + FILES ${FFTW_LIBRARIES} + DESTINATION ${CMAKE_INSTALL_LIBDIR}/fftw + COMPONENT clients-common + ) endif() -rocm_install( - FILES ${FFTW_LIBRARIES} - DESTINATION ${CMAKE_INSTALL_LIBDIR}/fftw - COMPONENT clients-common -) - set( rocfft-test_include_dirs $ $ @@ -216,7 +213,7 @@ PRIVATE hip::device roc::rocfft - roc::rocrand + hip::hiprand ${rocfft-test_link_libs} ) @@ -234,13 +231,13 @@ if( BUILD_CLIENTS_TESTS_OPENMP ) if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" ) - target_compile_options( rocfft-test PRIVATE -fopenmp -DBUILD_CLIENTS_TESTS_OPENMP ) + target_compile_options( rocfft-test PRIVATE -fopenmp ) target_link_libraries( rocfft-test PRIVATE -fopenmp -L${HIP_CLANG_ROOT}/lib -Wl,-rpath=${HIP_CLANG_ROOT}/lib ) target_include_directories( rocfft-test PRIVATE ${HIP_CLANG_ROOT}/include ) else() if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(OpenMP_CXX_FLAG "-fopenmp=libomp") - target_link_libraries(rocfft-test ${OpenMP_CXX_LIBRARIES}) + target_compile_options( rocfft-test PRIVATE -fopenmp=libomp ) + target_link_options( rocfft-test PRIVATE -fopenmp=libomp ) endif() endif() endif() @@ -289,6 +286,6 @@ C:/Windows/System32/libomp140*.dll ) foreach( file_i ${third_party_dlls}) - add_custom_command( TARGET rocfft-test POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} ${PROJECT_BINARY_DIR}/staging ) + add_custom_command( TARGET rocfft-test POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} $ ) endforeach( file_i ) endif() diff -Nru 
rocfft-5.5.0/clients/tests/accuracy_test.cpp rocfft-5.7.1/clients/tests/accuracy_test.cpp --- rocfft-5.5.0/clients/tests/accuracy_test.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/accuracy_test.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -19,53 +19,44 @@ // THE SOFTWARE. #include "accuracy_test.h" +#include "../../shared/rocfft_complex.h" -#include #include -__host__ __device__ float multiply_by_scalar(float a, double b) -{ - return a * b; -} -__host__ __device__ float2 multiply_by_scalar(float2 a, double b) -{ - return hipCmulf(a, make_float2(b, 0.0)); -} -__host__ __device__ double multiply_by_scalar(double a, double b) -{ - return a * b; -} -__host__ __device__ double2 multiply_by_scalar(double2 a, double b) +// load/store callbacks - cbdata in each is actually a scalar double +// with a number to apply to each element +template +__host__ __device__ Tdata load_callback(Tdata* input, size_t offset, void* cbdata, void* sharedMem) { - return hipCmul(a, make_double2(b, 0.0)); + auto testdata = static_cast(cbdata); + // multiply each element by scalar + if(input == testdata->base) + return input[offset] * testdata->scalar; + // wrong base address passed, return something obviously wrong + else + { + // wrong base address passed, return something obviously wrong + return input[0]; + } } -__host__ __device__ float add_scalar(float a, double b) -{ - return a + b; -} -__host__ __device__ float2 add_scalar(float2 a, double b) -{ - return hipCaddf(a, make_float2(b, 0.0)); -} -__host__ __device__ double add_scalar(double a, double b) -{ - return a + b; -} -__host__ __device__ double2 add_scalar(double2 a, double b) 
-{ - return hipCadd(a, make_double2(b, 0.0)); -} +__device__ auto load_callback_dev_half = load_callback<_Float16>; +__device__ auto load_callback_dev_complex_half = load_callback>; +__device__ auto load_callback_dev_float = load_callback; +__device__ auto load_callback_dev_complex_float = load_callback>; +__device__ auto load_callback_dev_double = load_callback; +__device__ auto load_callback_dev_complex_double = load_callback>; // load/store callbacks - cbdata in each is actually a scalar double // with a number to apply to each element template -__host__ __device__ Tdata load_callback(Tdata* input, size_t offset, void* cbdata, void* sharedMem) +__host__ __device__ Tdata + load_callback_round_trip_inverse(Tdata* input, size_t offset, void* cbdata, void* sharedMem) { auto testdata = static_cast(cbdata); - // multiply each element by scalar + // subtract each element by scalar if(input == testdata->base) - return multiply_by_scalar(input[offset], testdata->scalar); + return input[offset] - testdata->scalar; // wrong base address passed, return something obviously wrong else { @@ -74,12 +65,22 @@ } } -__device__ auto load_callback_dev_float = load_callback; -__device__ auto load_callback_dev_float2 = load_callback; -__device__ auto load_callback_dev_double = load_callback; -__device__ auto load_callback_dev_double2 = load_callback; - -void* get_load_callback_host(fft_array_type itype, fft_precision precision) +__device__ auto load_callback_round_trip_inverse_dev_half + = load_callback_round_trip_inverse<_Float16>; +__device__ auto load_callback_round_trip_inverse_dev_complex_half + = load_callback_round_trip_inverse>; +__device__ auto load_callback_round_trip_inverse_dev_float + = load_callback_round_trip_inverse; +__device__ auto load_callback_round_trip_inverse_dev_complex_float + = load_callback_round_trip_inverse>; +__device__ auto load_callback_round_trip_inverse_dev_double + = load_callback_round_trip_inverse; +__device__ auto 
load_callback_round_trip_inverse_dev_complex_double + = load_callback_round_trip_inverse>; + +void* get_load_callback_host(fft_array_type itype, + fft_precision precision, + bool round_trip_inverse = false) { void* load_callback_host = nullptr; switch(itype) @@ -89,16 +90,56 @@ { switch(precision) { + case fft_precision_half: + if(round_trip_inverse) + { + EXPECT_EQ(hipMemcpyFromSymbol( + &load_callback_host, + HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_half), + sizeof(void*)), + hipSuccess); + } + else + { + EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, + HIP_SYMBOL(load_callback_dev_complex_half), + sizeof(void*)), + hipSuccess); + } + return load_callback_host; case fft_precision_single: - EXPECT_EQ(hipMemcpyFromSymbol( - &load_callback_host, HIP_SYMBOL(load_callback_dev_float2), sizeof(void*)), - hipSuccess); + if(round_trip_inverse) + { + EXPECT_EQ(hipMemcpyFromSymbol( + &load_callback_host, + HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_float), + sizeof(void*)), + hipSuccess); + } + else + { + EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, + HIP_SYMBOL(load_callback_dev_complex_float), + sizeof(void*)), + hipSuccess); + } return load_callback_host; case fft_precision_double: - EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, - HIP_SYMBOL(load_callback_dev_double2), - sizeof(void*)), - hipSuccess); + if(round_trip_inverse) + { + EXPECT_EQ(hipMemcpyFromSymbol( + &load_callback_host, + HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_double), + sizeof(void*)), + hipSuccess); + } + else + { + EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, + HIP_SYMBOL(load_callback_dev_complex_double), + sizeof(void*)), + hipSuccess); + } return load_callback_host; } } @@ -106,15 +147,56 @@ { switch(precision) { + case fft_precision_half: + if(round_trip_inverse) + { + EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, + HIP_SYMBOL(load_callback_round_trip_inverse_dev_half), + sizeof(void*)), + hipSuccess); + } + else + { + 
EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, + HIP_SYMBOL(load_callback_dev_half), + sizeof(void*)), + hipSuccess); + } + return load_callback_host; case fft_precision_single: - EXPECT_EQ(hipMemcpyFromSymbol( - &load_callback_host, HIP_SYMBOL(load_callback_dev_float), sizeof(void*)), - hipSuccess); + if(round_trip_inverse) + { + EXPECT_EQ( + hipMemcpyFromSymbol(&load_callback_host, + HIP_SYMBOL(load_callback_round_trip_inverse_dev_float), + sizeof(void*)), + hipSuccess); + } + else + { + EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, + HIP_SYMBOL(load_callback_dev_float), + sizeof(void*)), + hipSuccess); + } return load_callback_host; case fft_precision_double: - EXPECT_EQ(hipMemcpyFromSymbol( - &load_callback_host, HIP_SYMBOL(load_callback_dev_double), sizeof(void*)), - hipSuccess); + if(round_trip_inverse) + { + EXPECT_EQ( + hipMemcpyFromSymbol(&load_callback_host, + HIP_SYMBOL(load_callback_round_trip_inverse_dev_double), + sizeof(void*)), + hipSuccess); + } + else + { + EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, + HIP_SYMBOL(load_callback_dev_double), + sizeof(void*)), + hipSuccess); + } + return load_callback_host; } } @@ -132,16 +214,45 @@ // add scalar to each element if(output == testdata->base) { - output[offset] = add_scalar(element, testdata->scalar); + output[offset] = element + testdata->scalar; } // otherwise, wrong base address passed, just don't write } -__device__ auto store_callback_dev_float = store_callback; -__device__ auto store_callback_dev_float2 = store_callback; -__device__ auto store_callback_dev_double = store_callback; -__device__ auto store_callback_dev_double2 = store_callback; +__device__ auto store_callback_dev_half = store_callback<_Float16>; +__device__ auto store_callback_dev_complex_half = store_callback>; +__device__ auto store_callback_dev_float = store_callback; +__device__ auto store_callback_dev_complex_float = store_callback>; +__device__ auto store_callback_dev_double = store_callback; +__device__ auto 
store_callback_dev_complex_double = store_callback>; -void* get_store_callback_host(fft_array_type otype, fft_precision precision) +template +__host__ __device__ static void store_callback_round_trip_inverse( + Tdata* output, size_t offset, Tdata element, void* cbdata, void* sharedMem) +{ + auto testdata = static_cast(cbdata); + // add scalar to each element + if(output == testdata->base) + { + output[offset] = element / testdata->scalar; + } + // otherwise, wrong base address passed, just don't write +} +__device__ auto store_callback_round_trip_inverse_dev_half + = store_callback_round_trip_inverse<_Float16>; +__device__ auto store_callback_round_trip_inverse_dev_complex_half + = store_callback_round_trip_inverse>; +__device__ auto store_callback_round_trip_inverse_dev_float + = store_callback_round_trip_inverse; +__device__ auto store_callback_round_trip_inverse_dev_complex_float + = store_callback_round_trip_inverse>; +__device__ auto store_callback_round_trip_inverse_dev_double + = store_callback_round_trip_inverse; +__device__ auto store_callback_round_trip_inverse_dev_complex_double + = store_callback_round_trip_inverse>; + +void* get_store_callback_host(fft_array_type otype, + fft_precision precision, + bool round_trip_inverse = false) { void* store_callback_host = nullptr; switch(otype) @@ -151,17 +262,56 @@ { switch(precision) { + case fft_precision_half: + if(round_trip_inverse) + { + EXPECT_EQ(hipMemcpyFromSymbol( + &store_callback_host, + HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_half), + sizeof(void*)), + hipSuccess); + } + else + { + EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, + HIP_SYMBOL(store_callback_dev_complex_half), + sizeof(void*)), + hipSuccess); + } + return store_callback_host; case fft_precision_single: - EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, - HIP_SYMBOL(store_callback_dev_float2), - sizeof(void*)), - hipSuccess); + if(round_trip_inverse) + { + EXPECT_EQ(hipMemcpyFromSymbol( + &store_callback_host, + 
HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_float), + sizeof(void*)), + hipSuccess); + } + else + { + EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, + HIP_SYMBOL(store_callback_dev_complex_float), + sizeof(void*)), + hipSuccess); + } return store_callback_host; case fft_precision_double: - EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, - HIP_SYMBOL(store_callback_dev_double2), - sizeof(void*)), - hipSuccess); + if(round_trip_inverse) + { + EXPECT_EQ(hipMemcpyFromSymbol( + &store_callback_host, + HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_double), + sizeof(void*)), + hipSuccess); + } + else + { + EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, + HIP_SYMBOL(store_callback_dev_complex_double), + sizeof(void*)), + hipSuccess); + } return store_callback_host; } } @@ -169,17 +319,56 @@ { switch(precision) { + case fft_precision_half: + if(round_trip_inverse) + { + EXPECT_EQ( + hipMemcpyFromSymbol(&store_callback_host, + HIP_SYMBOL(store_callback_round_trip_inverse_dev_half), + sizeof(void*)), + hipSuccess); + } + else + { + EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, + HIP_SYMBOL(store_callback_dev_half), + sizeof(void*)), + hipSuccess); + } + return store_callback_host; case fft_precision_single: - EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, - HIP_SYMBOL(store_callback_dev_float), - sizeof(void*)), - hipSuccess); + if(round_trip_inverse) + { + EXPECT_EQ( + hipMemcpyFromSymbol(&store_callback_host, + HIP_SYMBOL(store_callback_round_trip_inverse_dev_float), + sizeof(void*)), + hipSuccess); + } + else + { + EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, + HIP_SYMBOL(store_callback_dev_float), + sizeof(void*)), + hipSuccess); + } return store_callback_host; case fft_precision_double: - EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, - HIP_SYMBOL(store_callback_dev_double), - sizeof(void*)), - hipSuccess); + if(round_trip_inverse) + { + EXPECT_EQ( + hipMemcpyFromSymbol(&store_callback_host, + 
HIP_SYMBOL(store_callback_round_trip_inverse_dev_double), + sizeof(void*)), + hipSuccess); + } + else + { + EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, + HIP_SYMBOL(store_callback_dev_double), + sizeof(void*)), + hipSuccess); + } return store_callback_host; } } @@ -190,7 +379,7 @@ } // Apply store callback if necessary -void apply_store_callback(const fft_params& params, fftw_data_t& output) +void apply_store_callback(const fft_params& params, std::vector& output) { if(!params.run_callbacks && params.scale_factor == 1.0) return; @@ -206,12 +395,28 @@ { switch(params.precision) { + case fft_precision_half: + { + const size_t elem_size = sizeof(rocfft_complex<_Float16>); + const size_t num_elems = output.front().size() / elem_size; + + auto output_begin = reinterpret_cast*>(output.front().data()); + for(size_t i = 0; i < num_elems; ++i) + { + auto& element = output_begin[i]; + if(params.scale_factor != 1.0) + element = element * params.scale_factor; + if(params.run_callbacks) + store_callback(output_begin, i, element, &cbdata, nullptr); + } + break; + } case fft_precision_single: { - const size_t elem_size = sizeof(std::complex); + const size_t elem_size = sizeof(rocfft_complex); const size_t num_elems = output.front().size() / elem_size; - auto output_begin = reinterpret_cast(output.front().data()); + auto output_begin = reinterpret_cast*>(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; @@ -224,10 +429,10 @@ } case fft_precision_double: { - const size_t elem_size = sizeof(std::complex); + const size_t elem_size = sizeof(rocfft_complex); const size_t num_elems = output.front().size() / elem_size; - auto output_begin = reinterpret_cast(output.front().data()); + auto output_begin = reinterpret_cast*>(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; @@ -247,14 +452,31 @@ // planar wouldn't run callbacks, but we could still want scaling switch(params.precision) { + 
case fft_precision_half: + { + const size_t elem_size = sizeof(rocfft_complex<_Float16>); + for(auto& buf : output) + { + const size_t num_elems = buf.size() / elem_size; + + auto output_begin = reinterpret_cast*>(buf.data()); + for(size_t i = 0; i < num_elems; ++i) + { + auto& element = output_begin[i]; + if(params.scale_factor != 1.0) + element = element * params.scale_factor; + } + } + break; + } case fft_precision_single: { - const size_t elem_size = sizeof(std::complex); + const size_t elem_size = sizeof(rocfft_complex); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; - auto output_begin = reinterpret_cast(buf.data()); + auto output_begin = reinterpret_cast*>(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; @@ -266,12 +488,12 @@ } case fft_precision_double: { - const size_t elem_size = sizeof(std::complex); + const size_t elem_size = sizeof(rocfft_complex); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; - auto output_begin = reinterpret_cast(buf.data()); + auto output_begin = reinterpret_cast*>(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; @@ -288,6 +510,22 @@ { switch(params.precision) { + case fft_precision_half: + { + const size_t elem_size = sizeof(_Float16); + const size_t num_elems = output.front().size() / elem_size; + + auto output_begin = reinterpret_cast<_Float16*>(output.front().data()); + for(size_t i = 0; i < num_elems; ++i) + { + auto& element = output_begin[i]; + if(params.scale_factor != 1.0) + element = element * params.scale_factor; + if(params.run_callbacks) + store_callback(output_begin, i, element, &cbdata, nullptr); + } + break; + } case fft_precision_single: { const size_t elem_size = sizeof(float); @@ -330,7 +568,7 @@ } // apply load callback if necessary -void apply_load_callback(const fft_params& params, fftw_data_t& input) +void apply_load_callback(const fft_params& params, std::vector& input) { 
if(!params.run_callbacks) return; @@ -348,12 +586,24 @@ { switch(params.precision) { + case fft_precision_half: + { + const size_t elem_size = sizeof(rocfft_complex<_Float16>); + const size_t num_elems = input.front().size() / elem_size; + + auto input_begin = reinterpret_cast*>(input.front().data()); + for(size_t i = 0; i < num_elems; ++i) + { + input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); + } + break; + } case fft_precision_single: { - const size_t elem_size = sizeof(std::complex); + const size_t elem_size = sizeof(rocfft_complex); const size_t num_elems = input.front().size() / elem_size; - auto input_begin = reinterpret_cast(input.front().data()); + auto input_begin = reinterpret_cast*>(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); @@ -362,10 +612,10 @@ } case fft_precision_double: { - const size_t elem_size = sizeof(std::complex); + const size_t elem_size = sizeof(rocfft_complex); const size_t num_elems = input.front().size() / elem_size; - auto input_begin = reinterpret_cast(input.front().data()); + auto input_begin = reinterpret_cast*>(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); @@ -379,6 +629,18 @@ { switch(params.precision) { + case fft_precision_half: + { + const size_t elem_size = sizeof(_Float16); + const size_t num_elems = input.front().size() / elem_size; + + auto input_begin = reinterpret_cast<_Float16*>(input.front().data()); + for(size_t i = 0; i < num_elems; ++i) + { + input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); + } + break; + } case fft_precision_single: { const size_t elem_size = sizeof(float); diff -Nru rocfft-5.5.0/clients/tests/accuracy_test.h rocfft-5.7.1/clients/tests/accuracy_test.h --- rocfft-5.5.0/clients/tests/accuracy_test.h 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/accuracy_test.h 2023-08-09 
16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,12 +24,15 @@ #define ACCURACY_TEST #include +#include #include #include +#include #include +#include "../../shared/enum_to_string.h" +#include "../../shared/fft_params.h" #include "../../shared/gpubuf.h" -#include "../fft_params.h" #include "fftw_transform.h" #include "rocfft_against_fftw.h" #include "test_params.h" @@ -39,24 +42,48 @@ static const size_t ONE_GiB = 1 << 30; -typedef std::vector>> fftw_data_t; +inline size_t bytes_to_GiB(const size_t bytes) +{ + return bytes == 0 ? 0 : (bytes - 1 + ONE_GiB) / ONE_GiB; +} typedef std::tuple type_place_io_t; -// Estimate the amount of host memory needed. -inline size_t needed_ram(const fft_params& params, const int verbose) +// Remember the results of the last FFT we computed with FFTW. Tests +// are ordered so that later cases can often reuse this result. +struct last_cpu_fft_cache { - // We need at most 3 copies of the raw data: 2 are strictly - // required (input + output) but we keep a third copy around to - // save effort recomputing input for a smaller batch size or - // precision. - // - // This calculation is assuming contiguous data - noncontiguous - // temp buffers may be briefly required to mirror the data layout - // on the GPU, but they're assumed to require a close enough - // amount of space for the purposes of this estimate. 
- size_t needed_ram = 3 + // keys to the cache + std::vector length; + size_t nbatch = 0; + fft_transform_type transform_type = fft_transform_type_complex_forward; + bool run_callbacks = false; + fft_precision precision = fft_precision_single; + + // FFTW input/output + std::vector cpu_input; + std::vector cpu_output; +}; +extern last_cpu_fft_cache last_cpu_fft_data; + +struct system_memory +{ + size_t total_bytes = 0; + size_t free_bytes = 0; +}; +extern system_memory start_memory; + +system_memory get_system_memory(); + +// Estimate the amount of host memory needed for buffers. +inline size_t needed_ram_buffers(const fft_params& params, const int verbose) +{ + // This calculation is assuming contiguous data but noncontiguous buffers + // are assumed to require a close enough amount of space for the purposes + // of this estimate. + + size_t needed_ram = 6 * std::accumulate(params.length.begin(), params.length.end(), static_cast(1), @@ -70,6 +97,9 @@ } switch(params.precision) { + case fft_precision_half: + needed_ram *= 2; + break; case fft_precision_single: needed_ram *= 4; break; @@ -82,7 +112,87 @@ if(verbose) { - std::cout << "required host memory (GiB): " << needed_ram / ONE_GiB << std::endl; + std::cout << "required host memory for buffers (GiB): " << bytes_to_GiB(needed_ram) << "\n"; + } + + return needed_ram; +} + +template +bool fftw_plan_uses_bluestein(const typename fftw_trait::fftw_plan_type& cpu_plan) +{ +#ifdef FFTW_HAVE_SPRINT_PLAN + char* print_plan_c_str = fftw_sprint_plan(cpu_plan); + std::string print_plan(print_plan_c_str); + free(print_plan_c_str); + return print_plan.find("bluestein") != std::string::npos; +#else + // assume worst case (bluestein is always used) + return true; +#endif +} + +// Estimate the amount of host memory needed for fftw. 
+template +inline size_t needed_ram_fftw(const fft_params& contiguous_params, + const typename fftw_trait::fftw_plan_type& cpu_plan, + const int verbose) +{ + size_t total_length = std::accumulate(contiguous_params.length.begin(), + contiguous_params.length.end(), + static_cast(1), + std::multiplies()); + size_t needed_ram = 0; + // Detect Bluestein in plan + if(fftw_plan_uses_bluestein(cpu_plan)) + { + for(size_t dim : contiguous_params.length) + { + unsigned int needed_ram_dim = dim; + + // Next-plus-one-power-of-two multiplied any other lengths + needed_ram_dim--; + + needed_ram_dim |= needed_ram_dim >> 2; + needed_ram_dim |= needed_ram_dim >> 4; + needed_ram_dim |= needed_ram_dim >> 8; + needed_ram_dim |= needed_ram_dim >> 16; + + needed_ram_dim++; + + needed_ram_dim *= 2 * (total_length / dim); + + if(needed_ram_dim > needed_ram) + { + needed_ram = needed_ram_dim; + } + } + } + + // Account for precision and data type: + if(contiguous_params.transform_type != fft_transform_type_real_forward + && contiguous_params.transform_type != fft_transform_type_real_inverse) + { + needed_ram *= 2; + } + switch(contiguous_params.precision) + { + case fft_precision_half: + needed_ram *= 2; + break; + case fft_precision_single: + needed_ram *= 4; + break; + case fft_precision_double: + needed_ram *= 8; + break; + } + + needed_ram *= contiguous_params.nbatch; + + if(verbose) + { + std::cout << "required host memory for FFTW (GiB): " << bytes_to_GiB(needed_ram) << "\n"; } return needed_ram; @@ -102,37 +212,21 @@ } }; -// Remember the results of the last FFT we computed with FFTW. Tests -// are ordered so that later cases can often reuse this result. 
-struct last_cpu_fft_cache -{ - // keys to the cache - std::vector length; - size_t nbatch = 0; - fft_transform_type transform_type = fft_transform_type_complex_forward; - bool run_callbacks = false; - fft_precision precision = fft_precision_single; - - // FFTW input/output - fftw_data_t cpu_input; - fftw_data_t cpu_output; -}; -extern last_cpu_fft_cache last_cpu_fft_data; - const static std::vector batch_range = {2, 1}; -const static std::vector precision_range +const static std::vector precision_range_full + = {fft_precision_double, fft_precision_single, fft_precision_half}; +const static std::vector precision_range_sp_dp = {fft_precision_double, fft_precision_single}; + const static std::vector place_range = {fft_placement_inplace, fft_placement_notinplace}; -const static std::vector trans_type_range = {fft_transform_type_complex_forward, - fft_transform_type_complex_inverse, - fft_transform_type_real_forward, - fft_transform_type_real_inverse}; +const static std::vector trans_type_range + = {fft_transform_type_complex_forward, fft_transform_type_real_forward}; const static std::vector trans_type_range_complex - = {fft_transform_type_complex_forward, fft_transform_type_complex_inverse}; + = {fft_transform_type_complex_forward}; const static std::vector trans_type_range_real - = {fft_transform_type_real_forward, fft_transform_type_real_inverse}; + = {fft_transform_type_real_forward}; // Given a vector of vector of lengths, generate all unique permutations. // Add an optional vector of ad-hoc lengths to the result. @@ -337,7 +431,6 @@ // something to be passed to generate_lengths if(lengths.empty() || lengths.size() > 3) { - assert(false); continue; } { @@ -389,6 +482,47 @@ } } param.validate(); + + // Keeping the random number generator here + // allows one to run the same tests for a given + // random seed; ie the test suite is repeatable. 
+ std::hash hasher; + std::ranlux24_base gen(random_seed + + hasher(param.token())); + std::uniform_real_distribution<> dis(0.0, 1.0); + + if(param.is_planar()) + { + const double roll = dis(gen); + if(roll > planar_prob) + { + if(verbose > 4) + { + std::cout << "Planar transform skipped " + "(planar_prob: " + << planar_prob << " > " << roll + << ")\n"; + } + continue; + } + } + if(run_callbacks) + { + const double roll = dis(gen); + if(roll > callback_prob) + { + + if(verbose > 4) + { + std::cout << "Callback transform skipped " + "(planar_prob: " + << planar_prob << " > " << roll + << ")\n"; + } + continue; + } + } + if(param.valid(0)) { params.push_back(param); @@ -471,7 +605,7 @@ const bool planar, const bool run_callbacks = false) { - return param_generator_base(trans_type_range_complex, + return param_generator_base(trans_type_range_real, v_lengths, precision_range, batch_range, @@ -506,40 +640,56 @@ void* base; }; -void* get_load_callback_host(fft_array_type itype, fft_precision precision); -void apply_load_callback(const fft_params& params, fftw_data_t& input); -void apply_store_callback(const fft_params& params, fftw_data_t& output); -void* get_store_callback_host(fft_array_type otype, fft_precision precision); +void* get_load_callback_host(fft_array_type itype, + fft_precision precision, + bool round_trip_inverse); +void apply_load_callback(const fft_params& params, std::vector& input); +void apply_store_callback(const fft_params& params, std::vector& output); +void* get_store_callback_host(fft_array_type otype, + fft_precision precision, + bool round_trip_inverse); + +static auto allocate_cpu_fft_buffer(const fft_precision precision, + const fft_array_type type, + const std::vector& size) +{ + // FFTW does not support half-precision, so we do single instead. + // So if we need to do a half-precision FFTW transform, allocate + // enough buffer for single-precision instead. + return allocate_host_buffer( + precision == fft_precision_half ? 
fft_precision_single : precision, type, size); +} template inline void execute_cpu_fft(fft_params& params, fft_params& contiguous_params, typename fftw_trait::fftw_plan_type& cpu_plan, - fftw_data_t& cpu_input, - fftw_data_t& cpu_output) + std::vector& cpu_input, + std::vector& cpu_output) { // CPU output might not be allocated already for us, if FFTW never // needed an output buffer during planning if(cpu_output.empty()) - cpu_output = allocate_host_buffer>( + cpu_output = allocate_cpu_fft_buffer( contiguous_params.precision, contiguous_params.otype, contiguous_params.osize); // If this is either C2R or callbacks are enabled, the // input will be modified. So we need to modify the copy instead. - fftw_data_t cpu_input_copy; - fftw_data_t* input_ptr = &cpu_input; + std::vector cpu_input_copy(cpu_input.size()); + std::vector* input_ptr = &cpu_input; if(params.run_callbacks || contiguous_params.transform_type == fft_transform_type_real_inverse) { - cpu_input_copy = cpu_input; - input_ptr = &cpu_input_copy; + for(size_t i = 0; i < cpu_input.size(); ++i) + { + cpu_input_copy[i] = cpu_input[i].copy(); + } + + input_ptr = &cpu_input_copy; } // run FFTW (which may destroy CPU input) apply_load_callback(params, *input_ptr); - fftw_run(contiguous_params.transform_type, - cpu_plan, - input_ptr->front().data(), - cpu_output.front().data()); + fftw_run(contiguous_params.transform_type, cpu_plan, *input_ptr, cpu_output); // clean up fftw_destroy_plan_type(cpu_plan); // ask FFTW to fully clean up, since it tries to cache plan details @@ -550,40 +700,111 @@ // execute the GPU transform template -inline void execute_gpu_fft(Tparams& params, - std::vector& pibuffer, - std::vector& pobuffer, - fftw_data_t& gpu_output) +inline void execute_gpu_fft(Tparams& params, + std::vector& pibuffer, + std::vector& pobuffer, + std::vector& gpu_output, + bool round_trip_inverse = false) { gpubuf_t load_cb_data_dev; gpubuf_t store_cb_data_dev; if(params.run_callbacks) { - void* load_cb_host = 
get_load_callback_host(params.itype, params.precision); + void* load_cb_host + = get_load_callback_host(params.itype, params.precision, round_trip_inverse); callback_test_data load_cb_data_host; - load_cb_data_host.scalar = params.load_cb_scalar; - load_cb_data_host.base = pibuffer.front(); - ASSERT_TRUE(hipSuccess == load_cb_data_dev.alloc(sizeof(callback_test_data))); - ASSERT_TRUE(hipSuccess - == hipMemcpy(load_cb_data_dev.data(), - &load_cb_data_host, - sizeof(callback_test_data), - hipMemcpyHostToDevice)); + if(round_trip_inverse) + { + load_cb_data_host.scalar = params.store_cb_scalar; + } + else + { + load_cb_data_host.scalar = params.load_cb_scalar; + } + + load_cb_data_host.base = pibuffer.front(); + + auto hip_status = hipSuccess; + + hip_status = load_cb_data_dev.alloc(sizeof(callback_test_data)); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP(); + } + else + { + GTEST_FAIL(); + } + } + hip_status = hipMemcpy(load_cb_data_dev.data(), + &load_cb_data_host, + sizeof(callback_test_data), + hipMemcpyHostToDevice); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP(); + } + else + { + GTEST_FAIL(); + } + } - void* store_cb_host = get_store_callback_host(params.otype, params.precision); + void* store_cb_host + = get_store_callback_host(params.otype, params.precision, round_trip_inverse); callback_test_data store_cb_data_host; - store_cb_data_host.scalar = params.store_cb_scalar; - store_cb_data_host.base = pobuffer.front(); - ASSERT_TRUE(hipSuccess == store_cb_data_dev.alloc(sizeof(callback_test_data))); - ASSERT_TRUE(hipSuccess - == hipMemcpy(store_cb_data_dev.data(), - &store_cb_data_host, - sizeof(callback_test_data), - hipMemcpyHostToDevice)); + if(round_trip_inverse) + { + store_cb_data_host.scalar = params.load_cb_scalar; + } + else + { + store_cb_data_host.scalar = params.store_cb_scalar; + } + + store_cb_data_host.base = pobuffer.front(); + + hip_status = 
store_cb_data_dev.alloc(sizeof(callback_test_data)); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP(); + } + else + { + GTEST_FAIL(); + } + } + + hip_status = hipMemcpy(store_cb_data_dev.data(), + &store_cb_data_host, + sizeof(callback_test_data), + hipMemcpyHostToDevice); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP(); + } + else + { + GTEST_FAIL(); + } + } + auto fft_status = params.set_callbacks( load_cb_host, load_cb_data_dev.data(), store_cb_host, store_cb_data_dev.data()); if(fft_status != fft_status_success) @@ -596,18 +817,27 @@ throw std::runtime_error("rocFFT plan execution failure"); // copy GPU output back - ASSERT_TRUE(!params.osize.empty()) << "Error: params osize is empty"; - gpu_output - = allocate_host_buffer>(params.precision, params.otype, params.osize); ASSERT_TRUE(!gpu_output.empty()) << "no output buffers"; for(unsigned int idx = 0; idx < gpu_output.size(); ++idx) { - ASSERT_TRUE(!gpu_output[idx].empty()) << "output buffer index " << idx << " is empty"; + ASSERT_TRUE(gpu_output[idx].data() != nullptr) + << "output buffer index " << idx << " is empty"; auto hip_status = hipMemcpy(gpu_output[idx].data(), pobuffer.at(idx), gpu_output[idx].size(), hipMemcpyDeviceToHost); - ASSERT_EQ(hip_status, hipSuccess) << "hipMemcpy failure"; + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemcpy failure"; + } + else + { + GTEST_FAIL() << "hipMemcpy failure"; + } + } } if(verbose > 2) { @@ -622,57 +852,69 @@ } template -static void assert_init_value(const fftw_data_t& output, const size_t idx, const Tfloat orig_value); +static void assert_init_value(const std::vector& output, + const size_t idx, + const Tfloat orig_value); template <> -void assert_init_value(const fftw_data_t& output, const size_t idx, const float orig_value) +void assert_init_value(const std::vector& output, const size_t idx, const 
float orig_value) { float actual_value = reinterpret_cast(output.front().data())[idx]; ASSERT_EQ(actual_value, orig_value) << "index " << idx; } template <> -void assert_init_value(const fftw_data_t& output, const size_t idx, const double orig_value) +void assert_init_value(const std::vector& output, + const size_t idx, + const double orig_value) { double actual_value = reinterpret_cast(output.front().data())[idx]; ASSERT_EQ(actual_value, orig_value) << "index " << idx; } template <> -void assert_init_value(const fftw_data_t& output, const size_t idx, const float2 orig_value) +void assert_init_value(const std::vector& output, + const size_t idx, + const rocfft_complex orig_value) { // if this is interleaved, check directly if(output.size() == 1) { - float2 actual_value = reinterpret_cast(output.front().data())[idx]; + rocfft_complex actual_value + = reinterpret_cast*>(output.front().data())[idx]; ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; } else { // planar - float2 actual_value{reinterpret_cast(output.front().data())[idx], - reinterpret_cast(output.back().data())[idx]}; + rocfft_complex actual_value{ + reinterpret_cast(output.front().data())[idx], + reinterpret_cast(output.back().data())[idx]}; ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; } } template <> -void assert_init_value(const fftw_data_t& output, const size_t idx, const double2 orig_value) +void assert_init_value(const std::vector& output, + const size_t idx, + const rocfft_complex orig_value) { // if this is interleaved, check directly if(output.size() == 1) { - double2 actual_value = reinterpret_cast(output.front().data())[idx]; + rocfft_complex actual_value + = reinterpret_cast*>(output.front().data())[idx]; ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; } else { // 
planar - double2 actual_value{reinterpret_cast(output.front().data())[idx], - reinterpret_cast(output.back().data())[idx]}; + rocfft_complex actual_value{ + reinterpret_cast(output.front().data())[idx], + reinterpret_cast(output.back().data())[idx]}; ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; } @@ -680,11 +922,11 @@ static const int OUTPUT_INIT_PATTERN = 0xcd; template -void check_single_output_stride(const fftw_data_t& output, - const size_t offset, - const std::vector& length, - const std::vector& stride, - const size_t i) +void check_single_output_stride(const std::vector& output, + const size_t offset, + const std::vector& length, + const std::vector& stride, + const size_t i) { Tfloat orig; memset(static_cast(&orig), OUTPUT_INIT_PATTERN, sizeof(Tfloat)); @@ -720,7 +962,7 @@ } template -void check_output_strides(const fftw_data_t& output, Tparams& params) +void check_output_strides(const std::vector& output, Tparams& params) { // treat batch+dist like highest length+stride, if batch > 1 std::vector length; @@ -740,46 +982,273 @@ if(params.otype == fft_array_type_real) check_single_output_stride(output, 0, length, stride, 0); else - check_single_output_stride(output, 0, length, stride, 0); + check_single_output_stride>(output, 0, length, stride, 0); } else { if(params.otype == fft_array_type_real) check_single_output_stride(output, 0, length, stride, 0); else - check_single_output_stride(output, 0, length, stride, 0); + check_single_output_stride>(output, 0, length, stride, 0); } } -// run CPU + rocFFT transform with the given params and compare -template -inline void fft_vs_reference_impl(Tparams& params) +// run rocFFT inverse transform +template +inline void run_round_trip_inverse(Tparams& params, + std::vector& obuffer, + std::vector& pibuffer, + std::vector& pobuffer, + std::vector& gpu_output) { + params.validate(); + // Make sure that the parameters make sense: 
ASSERT_TRUE(params.valid(verbose)); - if(ramgb > 0 && needed_ram(params, verbose) > ramgb * ONE_GiB) + // Create FFT plan - this will also allocate work buffer, but will throw a + // specific exception if that step fails + auto plan_status = fft_status_success; + try { - if(verbose) + plan_status = params.create_plan(); + } + catch(fft_params::work_buffer_alloc_failure& e) + { + std::stringstream ss; + ss << "Failed to allocate work buffer (size: " << params.workbuffersize << ")"; + ++n_hip_failures; + if(skip_runtime_fails) { - std::cout << "Problem exceeds memory limit; skipped [rocfft_transform]." << std::endl; + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); } - GTEST_SKIP(); - return; + } + ASSERT_EQ(plan_status, fft_status_success) << "round trip inverse plan creation failed"; + + auto obuffer_sizes = params.obuffer_sizes(); + + if(params.placement != fft_placement_inplace) + { + for(unsigned int i = 0; i < obuffer_sizes.size(); ++i) + { + // If we're validating output strides, init the + // output buffer to a known pattern and we can check + // that the pattern is untouched in places that + // shouldn't have been touched. 
+ if(params.check_output_strides) + { + auto hip_status + = hipMemset(obuffer[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]); + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemset failure"; + } + else + { + GTEST_FAIL() << "hipMemset failure"; + } + } + } + } + } + + // execute GPU transform + // + // limited scope for local variables + + execute_gpu_fft(params, pibuffer, pobuffer, gpu_output, true); +} + +// compare rocFFT inverse transform with forward transform input +template +inline void compare_round_trip_inverse(Tparams& params, + fft_params& contiguous_params, + std::vector& gpu_output, + std::vector& cpu_input, + const VectorNorms& cpu_input_norm, + size_t total_length) +{ + if(params.check_output_strides) + { + check_output_strides(gpu_output, params); + } + + // compute GPU output norm + std::shared_future gpu_norm = std::async(std::launch::async, [&]() { + return norm(gpu_output, + params.olength(), + params.nbatch, + params.precision, + params.otype, + params.ostride, + params.odist, + params.ooffset); + }); + + // compare GPU inverse output to CPU forward input + std::unique_ptr>> linf_failures; + if(verbose > 1) + linf_failures = std::make_unique>>(); + const double linf_cutoff + = type_epsilon(params.precision) * cpu_input_norm.l_inf * log(total_length); + + VectorNorms diff = distance(cpu_input, + gpu_output, + params.olength(), + params.nbatch, + params.precision, + contiguous_params.itype, + contiguous_params.istride, + contiguous_params.idist, + params.otype, + params.ostride, + params.odist, + linf_failures.get(), + linf_cutoff, + {0}, + params.ooffset, + 1.0 / total_length); + + if(verbose > 1) + { + std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n"; + std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n"; + std::cout << "GPU linf norm failures:"; + std::sort(linf_failures->begin(), linf_failures->end()); + for(const auto& i : *linf_failures) + { + 
std::cout << " (" << i.first << "," << i.second << ")"; + } + std::cout << std::endl; + } + + EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str(); + EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str(); + + switch(params.precision) + { + case fft_precision_half: + max_linf_eps_half + = std::max(max_linf_eps_half, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); + max_l2_eps_half + = std::max(max_l2_eps_half, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); + break; + case fft_precision_single: + max_linf_eps_single + = std::max(max_linf_eps_single, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); + max_l2_eps_single + = std::max(max_l2_eps_single, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); + break; + case fft_precision_double: + max_linf_eps_double + = std::max(max_linf_eps_double, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); + max_l2_eps_double + = std::max(max_l2_eps_double, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); + break; + } + + if(verbose > 1) + { + std::cout << "L2 diff: " << diff.l_2 << "\n"; + std::cout << "Linf diff: " << diff.l_inf << "\n"; + } + + EXPECT_TRUE(diff.l_inf <= linf_cutoff) + << "Linf test failed. Linf:" << diff.l_inf + << "\tnormalized Linf: " << diff.l_inf / cpu_input_norm.l_inf << "\tcutoff: " << linf_cutoff + << params.str(); + + EXPECT_TRUE(diff.l_2 / cpu_input_norm.l_2 + < sqrt(log2(total_length)) * type_epsilon(params.precision)) + << "L2 test failed. 
L2: " << diff.l_2 + << "\tnormalized L2: " << diff.l_2 / cpu_input_norm.l_2 + << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision) + << params.str(); +} + +// RAII type to put data into the cache when this object leaves scope +struct StoreCPUDataToCache +{ + StoreCPUDataToCache(std::vector& cpu_input, std::vector& cpu_output) + : cpu_input(cpu_input) + , cpu_output(cpu_output) + { + } + ~StoreCPUDataToCache() + { + last_cpu_fft_data.cpu_output.swap(cpu_output); + last_cpu_fft_data.cpu_input.swap(cpu_input); + } + std::vector& cpu_input; + std::vector& cpu_output; +}; + +// run CPU + rocFFT transform with the given params and compare +template +inline void fft_vs_reference_impl(Tparams& params, bool round_trip) +{ + // Make sure that the parameters make sense: + ASSERT_TRUE(params.valid(verbose)); + + size_t needed_ram = needed_ram_buffers(params, verbose); + + if(ramgb > 0 && needed_ram > ramgb * ONE_GiB) + { + GTEST_SKIP() << "needed_ramgb: " << bytes_to_GiB(needed_ram) << ", ramgb limit: " << ramgb + << ".\n"; } auto ibuffer_sizes = params.ibuffer_sizes(); auto obuffer_sizes = params.obuffer_sizes(); + size_t vram_avail = 0; + + if(vramgb == 0) + { + // Check free and total available memory: + size_t free = 0; + size_t total = 0; + auto hip_status = hipMemGetInfo(&free, &total); + if(hip_status != hipSuccess || total == 0) + { + ++n_hip_failures; + std::stringstream ss; + if(total == 0) + ss << "hipMemGetInfo claims there there isn't any vram"; + else + ss << "hipMemGetInfo failure with error " << hip_status; + if(skip_runtime_fails) + { + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); + } + } + vram_avail = total; + } + else + { + vram_avail = vramgb * ONE_GiB; + } + // First try a quick estimation of vram footprint, to speed up skipping tests // that are too large to fit in the gpu (no plan created with the rocFFT backend) const auto raw_vram_footprint = params.fft_params_vram_footprint() + 
twiddle_table_vram_footprint(params); - if(!vram_fits_problem(raw_vram_footprint)) + if(!vram_fits_problem(raw_vram_footprint, vram_avail)) { - GTEST_SKIP() << "Raw problem size (" << raw_vram_footprint - << ") raw data too large for device"; + GTEST_SKIP() << "Raw problem size (" << bytes_to_GiB(raw_vram_footprint) + << " GiB) raw data too large for device"; } if(verbose > 2) @@ -791,41 +1260,40 @@ // accurate calculation that actually creates the plan and // take into account the work buffer size const auto vram_footprint = params.vram_footprint(); - if(!vram_fits_problem(vram_footprint)) + if(!vram_fits_problem(vram_footprint, vram_avail)) { if(verbose) { std::cout << "Problem raw data won't fit on device; skipped." << std::endl; } - GTEST_SKIP() << "Problem size (" << vram_footprint << ") raw data too large for device"; + GTEST_SKIP() << "Problem size (" << bytes_to_GiB(vram_footprint) + << " GiB) raw data too large for device"; } // Create FFT plan - this will also allocate work buffer, but // will throw a specific exception if that step fails + auto plan_status = fft_status_success; try { - ASSERT_EQ(params.create_plan(), fft_status_success); + plan_status = params.create_plan(); } catch(fft_params::work_buffer_alloc_failure& e) { - GTEST_SKIP() << "Problem size with work buffer (" << vram_footprint + params.workbuffersize - << ") too large for device"; + ++n_hip_failures; + std::stringstream ss; + ss << "Work buffer allocation failed with size: " << params.workbuffersize; + if(skip_runtime_fails) + { + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); + } } + ASSERT_EQ(plan_status, fft_status_success) << "plan creation failed"; - // Recheck whether the raw data fits on the device, now that the - // work buffer has been allocated (if required). 
- if(verbose > 1) - { - size_t free = 0; - size_t total = 0; - hipError_t retval = hipMemGetInfo(&free, &total); - ASSERT_EQ(retval, hipSuccess) << "hipMemGetInfo failed with error " << retval; - std::cout << "data footprint: " << vram_footprint << " (" << (double)vram_footprint - << ") workbuffer: " << params.workbuffersize << " (" - << (double)params.workbuffersize << ") free: " << free << " (" << (double)free - << ") total: " << total << " (" << (double)total << ")\n"; - } - if(!vram_fits_problem(vram_footprint)) + if(!vram_fits_problem(vram_footprint, vram_avail)) { if(verbose) { @@ -853,30 +1321,31 @@ if(verbose > 3) { - std::cout << "CPU params:\n"; + std::cout << "CPU params:\n"; std::cout << contiguous_params.str("\n\t") << std::endl; } - // helper function to convert double input/output to float - // in-place so we don't need extra memory - auto convert_to_single = [](fftw_data_t& data) { - for(auto& arr : data) - { - const double* readPtr = reinterpret_cast(arr.data()); - const double* readEnd = readPtr + (arr.size() / sizeof(double)); - float* writePtr = reinterpret_cast(arr.data()); - std::copy(readPtr, readEnd, writePtr); - arr.resize(arr.size() / 2); - } - }; - std::vector ibuffer(ibuffer_sizes.size()); std::vector pibuffer(ibuffer_sizes.size()); for(unsigned int i = 0; i < ibuffer.size(); ++i) { auto hip_status = ibuffer[i].alloc(ibuffer_sizes[i]); - ASSERT_EQ(hip_status, hipSuccess) << "hipMalloc failure for input buffer " << i << " size " - << ibuffer_sizes[i] << " " << params.str(); + if(hip_status != hipSuccess) + { + std::stringstream ss; + ss << "hipMalloc failure for input buffer " << i << " size " << ibuffer_sizes[i] << "(" + << bytes_to_GiB(ibuffer_sizes[i]) << " GiB)" + << " with code " << hipError_to_string(hip_status); + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); + } + } pibuffer[i] = ibuffer[i].data(); } @@ -888,11 +1357,12 @@ // Check cache first - nbatch is a >= 
comparison because we compute // the largest batch size and cache it. Smaller batch runs can // compare against the larger data. - fftw_data_t cpu_input; - fftw_data_t cpu_output; - std::shared_future convert_cpu_output_precision; - std::shared_future convert_cpu_input_precision; - bool run_fftw = true; + std::vector cpu_input; + std::vector cpu_output; + std::shared_future convert_cpu_output_precision; + std::shared_future convert_cpu_input_precision; + bool run_fftw = true; + std::unique_ptr store_to_cache; if(last_cpu_fft_data.length == params.length && last_cpu_fft_data.transform_type == params.transform_type && last_cpu_fft_data.run_callbacks == params.run_callbacks) @@ -904,26 +1374,64 @@ cpu_output.swap(last_cpu_fft_data.cpu_output); run_fftw = false; + store_to_cache = std::make_unique(cpu_input, cpu_output); + if(params.precision != last_cpu_fft_data.precision) { - // Tests should be ordered so we do double first, then float. - if(last_cpu_fft_data.precision == fft_precision_double) + // Tests should be ordered so we do wider first, then narrower. + switch(params.precision) { - // convert the input/output to single-precision - convert_cpu_output_precision - = std::async(std::launch::async, [&]() { convert_to_single(cpu_output); }); - convert_cpu_input_precision - = std::async(std::launch::async, [&]() { convert_to_single(cpu_input); }); - last_cpu_fft_data.precision = fft_precision_single; - } - else - { - // Somehow we've done float first, then double? - // Tests are ordered wrong, and we don't want to - // lose precision - std::cerr << "Can't do float first then double: aborting." 
<< std::endl; + case fft_precision_double: + std::cerr + << "test ordering is incorrect: double precision follows a narrower one" + << std::endl; abort(); + break; + case fft_precision_single: + if(last_cpu_fft_data.precision != fft_precision_double) + { + std::cerr + << "test ordering is incorrect: float precision follows a narrower one" + << std::endl; + abort(); + } + // convert the input/output to single-precision + convert_cpu_output_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_output.front()); + }); + convert_cpu_input_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_input.front()); + }); + break; + case fft_precision_half: + // convert to half precision + if(last_cpu_fft_data.precision == fft_precision_double) + { + convert_cpu_output_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_output.front()); + }); + convert_cpu_input_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_input.front()); + }); + } + else if(last_cpu_fft_data.precision == fft_precision_single) + { + convert_cpu_output_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_output.front()); + }); + convert_cpu_input_precision = std::async(std::launch::async, [&]() { + narrow_precision_inplace(cpu_input.front()); + }); + } + else + { + std::cerr << "unhandled previous precision, cannot convert to half" + << std::endl; + abort(); + } + break; } + last_cpu_fft_data.precision = params.precision; } } // If the last result has a smaller batch than the new @@ -933,14 +1441,17 @@ // might never have tried to generate the bigger batch first. // So just fall through and redo the CPU FFT. 
} - // Clear cache explicitly so that even if we didn't get a hit, - // we're not uselessly holding on to cached cpu input/output - last_cpu_fft_data = last_cpu_fft_cache(); + else + { + // Clear cache explicitly so that even if we didn't get a hit, + // we're not uselessly holding on to cached cpu input/output + last_cpu_fft_data = last_cpu_fft_cache(); + } // Allocate CPU input if(run_fftw) { - cpu_input = allocate_host_buffer>( + cpu_input = allocate_cpu_fft_buffer( contiguous_params.precision, contiguous_params.itype, contiguous_params.isize); } @@ -955,7 +1466,7 @@ // creation time. if(use_fftw_wisdom) { - cpu_output = allocate_host_buffer>( + cpu_output = allocate_cpu_fft_buffer( contiguous_params.precision, contiguous_params.otype, contiguous_params.osize); } cpu_plan = fftw_plan_via_rocfft(contiguous_params.length, @@ -967,32 +1478,56 @@ contiguous_params.transform_type, cpu_input, cpu_output); + + needed_ram += needed_ram_fftw(contiguous_params, cpu_plan, verbose); + + if(ramgb > 0 && needed_ram > ramgb * ONE_GiB) + { + if(verbose) + { + std::cout << "Problem exceeds memory limit; skipped [rocfft_transform]." 
+ << std::endl; + } + GTEST_SKIP(); + return; + } } + std::vector gpu_input_data + = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems); + // allocate and populate the input buffer (cpu/gpu) if(run_fftw) { //generate the input directly on the gpu - compute_input(params, ibuffer); + params.compute_input(ibuffer); // Copy the input to CPU if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize) { - auto tmp_cpu_input = allocate_host_buffer>( - params.precision, params.itype, ibuffer_sizes_elems); - // Copy input to CPU for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) { - auto hip_status = hipMemcpy(tmp_cpu_input.at(idx).data(), + auto hip_status = hipMemcpy(gpu_input_data.at(idx).data(), ibuffer[idx].data(), ibuffer_sizes[idx], hipMemcpyDeviceToHost); - ASSERT_EQ(hip_status, hipSuccess) << "hipMemcpy failure with error " << hip_status; + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; + } + else + { + GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; + } + } } - copy_buffers(tmp_cpu_input, + copy_buffers(gpu_input_data, cpu_input, params.ilength(), params.nbatch, @@ -1015,7 +1550,18 @@ ibuffer[idx].data(), ibuffer_sizes[idx], hipMemcpyDeviceToHost); - ASSERT_EQ(hip_status, hipSuccess) << "hipMemcpy failure with error " << hip_status; + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; + } + else + { + GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; + } + } } } } @@ -1026,17 +1572,13 @@ convert_cpu_input_precision.get(); // gets a pre-computed gpu input buffer from the cpu cache - fftw_data_t temp_gpu_input; - fftw_data_t* gpu_input = &cpu_input; + std::vector* gpu_input = &cpu_input; if(params.itype 
!= contiguous_params.itype || params.istride != contiguous_params.istride || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize) { - temp_gpu_input = allocate_host_buffer>( - params.precision, params.itype, ibuffer_sizes_elems); - copy_buffers(cpu_input, - temp_gpu_input, + gpu_input_data, params.ilength(), params.nbatch, params.precision, @@ -1048,7 +1590,7 @@ params.idist, {0}, params.ioffset); - gpu_input = &temp_gpu_input; + gpu_input = &gpu_input_data; } // Copy input to GPU @@ -1058,7 +1600,19 @@ gpu_input->at(idx).data(), ibuffer_sizes[idx], hipMemcpyHostToDevice); - ASSERT_EQ(hip_status, hipSuccess) << "hipMemcpy failure with error " << hip_status; + + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; + } + else + { + GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; + } + } } } @@ -1109,25 +1663,20 @@ auto hip_status = obuffer_data[i].alloc(obuffer_sizes[i]); if(hip_status != hipSuccess) { - // Try and figure out why hip malloc failed. - size_t free = 0; - size_t total = 0; - hipError_t retval = hipMemGetInfo(&free, &total); - EXPECT_EQ(retval, hipSuccess) << "hipMemGetInfo failed with error " << retval; - if(retval == hipSuccess) + ++n_hip_failures; + std::stringstream ss; + ss << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i] + << "(" << bytes_to_GiB(obuffer_sizes[i]) << " GiB)" + << " with code " << hipError_to_string(hip_status); + if(skip_runtime_fails) { - std::cerr << "free vram: " << free << " (" << (double)free - << ") total vram: " << total << " (" << (double)total << ")" - << std::endl; - if(free > obuffer_sizes[i]) - { - std::cerr << "The system reports that there is enough space." 
<< std::endl; - } + GTEST_SKIP() << ss.str(); + } + else + { + GTEST_FAIL() << ss.str(); } } - ASSERT_EQ(hip_status, hipSuccess) - << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i] - << " (" << static_cast(obuffer_sizes[i]) << ") " << params.str(); // If we're validating output strides, init the // output buffer to a known pattern and we can check @@ -1137,7 +1686,18 @@ { hip_status = hipMemset(obuffer_data[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]); - ASSERT_EQ(hip_status, hipSuccess) << "hipMemset failure"; + if(hip_status != hipSuccess) + { + ++n_hip_failures; + if(skip_runtime_fails) + { + GTEST_SKIP() << "hipMemset failure with error " << hip_status; + } + else + { + GTEST_FAIL() << "hipMemset failure with error " << hip_status; + } + } } } } @@ -1186,8 +1746,11 @@ // execute GPU transform // // limited scope for local variables - fftw_data_t gpu_output; + std::vector gpu_output + = allocate_host_buffer(params.precision, params.otype, params.osize); + execute_gpu_fft(params, pibuffer, pobuffer, gpu_output); + params.free(); if(params.check_output_strides) { @@ -1210,29 +1773,74 @@ // // Compute the l-infinity and l-2 distance between the CPU and GPU output: // wait for cpu FFT so we can compute cutoff - cpu_fft.get(); - std::vector> linf_failures; - const auto total_length = std::accumulate(params.length.begin(), + + const auto total_length = std::accumulate(params.length.begin(), params.length.end(), static_cast(1), std::multiplies()); - const double linf_cutoff - = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(total_length); - VectorNorms diff = distance(cpu_output, - gpu_output, - params.olength(), - params.nbatch, - params.precision, - contiguous_params.otype, - contiguous_params.ostride, - contiguous_params.odist, - params.otype, - params.ostride, - params.odist, - linf_failures, - linf_cutoff, - {0}, - params.ooffset); + + std::unique_ptr>> linf_failures; + if(verbose > 1) + linf_failures = 
std::make_unique>>(); + double linf_cutoff; + VectorNorms diff; + + std::shared_future compare_output = std::async(std::launch::async, [&]() { + cpu_fft.get(); + linf_cutoff = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(total_length); + + diff = distance(cpu_output, + gpu_output, + params.olength(), + params.nbatch, + params.precision, + contiguous_params.otype, + contiguous_params.ostride, + contiguous_params.odist, + params.otype, + params.ostride, + params.odist, + linf_failures.get(), + linf_cutoff, + {0}, + params.ooffset); + }); + + // Update the cache if this current transform is different from + // what's stored. But if this transform only has a smaller batch + // than what's cached, we can still keep the cache around since + // the input/output we already have is still valid. + const bool update_last_cpu_fft_data + = last_cpu_fft_data.length != params.length + || last_cpu_fft_data.transform_type != params.transform_type + || last_cpu_fft_data.run_callbacks != params.run_callbacks + || last_cpu_fft_data.precision != params.precision + || params.nbatch > last_cpu_fft_data.nbatch; + + // store cpu output in cache + if(update_last_cpu_fft_data) + { + last_cpu_fft_data.length = params.length; + last_cpu_fft_data.nbatch = params.nbatch; + last_cpu_fft_data.transform_type = params.transform_type; + last_cpu_fft_data.run_callbacks = params.run_callbacks; + last_cpu_fft_data.precision = params.precision; + } + + compare_output.get(); + + if(!store_to_cache) + store_to_cache = std::make_unique(cpu_input, cpu_output); + + Tparams params_inverse; + + if(round_trip) + { + params_inverse.inverse_from_forward(params); + + run_round_trip_inverse( + params_inverse, ibuffer, pobuffer, pibuffer, gpu_input_data); + } ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_2)); ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_inf)); @@ -1245,8 +1853,8 @@ std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n"; std::cout << "GPU output L2 norm: " << 
gpu_norm.get().l_2 << "\n"; std::cout << "GPU linf norm failures:"; - std::sort(linf_failures.begin(), linf_failures.end()); - for(const auto& i : linf_failures) + std::sort(linf_failures->begin(), linf_failures->end()); + for(const auto& i : *linf_failures) { std::cout << " (" << i.first << "," << i.second << ")"; } @@ -1258,6 +1866,12 @@ switch(params.precision) { + case fft_precision_half: + max_linf_eps_half + = std::max(max_linf_eps_half, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); + max_l2_eps_half + = std::max(max_l2_eps_half, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); + break; case fft_precision_single: max_linf_eps_single = std::max(max_linf_eps_single, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); @@ -1290,14 +1904,15 @@ << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision) << params.str(); - // store cpu output in cache - last_cpu_fft_data.length = params.length; - last_cpu_fft_data.nbatch = params.nbatch; - last_cpu_fft_data.transform_type = params.transform_type; - last_cpu_fft_data.run_callbacks = params.run_callbacks; - last_cpu_fft_data.precision = params.precision; - last_cpu_fft_data.cpu_output.swap(cpu_output); - last_cpu_fft_data.cpu_input.swap(cpu_input); + if(round_trip) + { + compare_round_trip_inverse(params_inverse, + contiguous_params, + gpu_input_data, + cpu_input, + cpu_input_norm.get(), + total_length); + } } #endif diff -Nru rocfft-5.5.0/clients/tests/accuracy_test_1D.cpp rocfft-5.7.1/clients/tests/accuracy_test_1D.cpp --- rocfft-5.5.0/clients/tests/accuracy_test_1D.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/accuracy_test_1D.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -36,6 +36,9 @@ 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824}; +const static std::vector pow2_range_half + = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536}; + 
const static std::vector pow3_range = {3, 9, 27, @@ -129,7 +132,7 @@ INSTANTIATE_TEST_SUITE_P(pow2_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range}), - precision_range, + precision_range_sp_dp, batch_range_1D, stride_range, stride_range, @@ -138,10 +141,11 @@ place_range, true)), accuracy_test::TestName); + INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range}), - precision_range, + precision_range_sp_dp, batch_range_1D, stride_range, stride_range, @@ -151,10 +155,36 @@ true)), accuracy_test::TestName); +INSTANTIATE_TEST_SUITE_P(pow2_1D_half, + accuracy_test, + ::testing::ValuesIn(param_generator(generate_lengths({pow2_range_half}), + {fft_precision_half}, + batch_range_1D, + stride_range, + stride_range, + ioffset_range_zero, + ooffset_range_zero, + place_range, + true)), + accuracy_test::TestName); + +INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D_half, + accuracy_test, + ::testing::ValuesIn(param_generator(generate_lengths({pow2_range_half}), + {fft_precision_half}, + batch_range_1D, + stride_range, + stride_range, + ioffset_range_zero, + ooffset_range_zero, + place_range, + true)), + accuracy_test::TestName); + INSTANTIATE_TEST_SUITE_P(pow3_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow3_range}), - precision_range, + precision_range_sp_dp, batch_range_1D, stride_range, stride_range, @@ -166,7 +196,7 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow3_range}), - precision_range, + precision_range_full, batch_range_1D, stride_range, stride_range, @@ -179,7 +209,7 @@ INSTANTIATE_TEST_SUITE_P(pow5_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow5_range}), - precision_range, + precision_range_sp_dp, batch_range_1D, stride_range, stride_range, @@ -191,7 +221,7 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_1D, 
accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow5_range}), - precision_range, + precision_range_full, batch_range_1D, stride_range, stride_range, @@ -204,7 +234,7 @@ INSTANTIATE_TEST_SUITE_P(radX_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({radX_range}), - precision_range, + precision_range_full, batch_range_1D, stride_range, stride_range, @@ -216,7 +246,7 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_offset_radX_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({radX_range}), - precision_range, + precision_range_full, batch_range_1D, stride_range, stride_range, @@ -229,7 +259,7 @@ INSTANTIATE_TEST_SUITE_P(prime_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({prime_range}), - precision_range, + precision_range_sp_dp, batch_range_1D, stride_range, stride_range, @@ -241,7 +271,7 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({prime_range}), - precision_range, + precision_range_sp_dp, batch_range_1D, stride_range, stride_range, @@ -254,7 +284,7 @@ INSTANTIATE_TEST_SUITE_P(mix_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({mix_range}), - precision_range, + precision_range_full, batch_range_1D, stride_range, stride_range, @@ -266,7 +296,7 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({mix_range}), - precision_range, + precision_range_full, batch_range_1D, stride_range, stride_range, @@ -312,14 +342,30 @@ // // The below test covers non-unit strides, pow of 2, middle sizes, which has SBCC/SBRC kernels // invloved. 
-const static std::vector pow2_range_for_stride = {4096, 8192, 524288}; -const static std::vector> stride_range_for_pow2 = {{2}, {3}}; -const static std::vector batch_range_for_stride = {2, 1}; +const static std::vector pow2_range_for_stride = {4096, 8192, 524288}; +const static std::vector pow2_range_for_stride_half = {4096, 8192}; +const static std::vector> stride_range_for_pow2 = {{2}, {3}}; +const static std::vector batch_range_for_stride = {2, 1}; + INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_complex, accuracy_test, ::testing::ValuesIn(param_generator_complex(generate_lengths({pow2_range_for_stride}), - precision_range, + precision_range_sp_dp, + batch_range_1D, + stride_range_for_pow2, + stride_range_for_pow2, + ioffset_range_zero, + ooffset_range_zero, + place_range, + true)), + accuracy_test::TestName); + +INSTANTIATE_TEST_SUITE_P( + pow2_1D_stride_complex_half, + accuracy_test, + ::testing::ValuesIn(param_generator_complex(generate_lengths({pow2_range_for_stride_half}), + {fft_precision_half}, batch_range_1D, stride_range_for_pow2, stride_range_for_pow2, @@ -333,7 +379,21 @@ pow2_1D_stride_real, accuracy_test, ::testing::ValuesIn(param_generator_real(generate_lengths({pow2_range_for_stride}), - precision_range, + precision_range_sp_dp, + batch_range_1D, + stride_range_for_pow2, + stride_range_for_pow2, + ioffset_range_zero, + ooffset_range_zero, + place_range, + true)), + accuracy_test::TestName); + +INSTANTIATE_TEST_SUITE_P( + pow2_1D_stride_real_half, + accuracy_test, + ::testing::ValuesIn(param_generator_real(generate_lengths({pow2_range_for_stride_half}), + {fft_precision_half}, batch_range_1D, stride_range_for_pow2, stride_range_for_pow2, @@ -406,7 +466,7 @@ pow2_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(generate_lengths({pow2_range_2D}), - precision_range, + precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), @@ -417,7 +477,7 @@ pow3_1D_complex_batched_2D_strided, 
accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(generate_lengths({pow3_range_2D}), - precision_range, + precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), @@ -428,7 +488,7 @@ pow5_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(generate_lengths({pow5_range_2D}), - precision_range, + precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), @@ -440,7 +500,7 @@ prime_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(generate_lengths({prime_range_2D}), - precision_range, + precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), diff -Nru rocfft-5.5.0/clients/tests/accuracy_test_2D.cpp rocfft-5.7.1/clients/tests/accuracy_test_2D.cpp --- rocfft-5.5.0/clients/tests/accuracy_test_2D.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/accuracy_test_2D.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -36,6 +36,9 @@ const static std::vector pow2_range = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192}; +// For the current configuration, half-precision has a fft size limit of 65536 +const static std::vector pow2_range_half = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048}; + const static std::vector pow3_range = {3, 9, 27, 81, 243, 729, 2187, 6561}; const static std::vector pow5_range = {5, 25, 125, 625, 3125, 15625}; @@ -56,7 +59,21 @@ accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow2_range}), - precision_range, + precision_range_sp_dp, + batch_range, + stride_range, + stride_range, + ioffset_range_zero, + ooffset_range_zero, + place_range, + true)), + accuracy_test::TestName); + +INSTANTIATE_TEST_SUITE_P(pow2_2D_half, + accuracy_test, + ::testing::ValuesIn(param_generator(generate_lengths({pow2_range_half, + {2, 4, 8, 16, 32}}), + {fft_precision_half}, batch_range, stride_range, stride_range, @@ -65,11 +82,12 @@ 
place_range, true)), accuracy_test::TestName); + INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow2_range}), - precision_range, + precision_range_full, batch_range, stride_range, stride_range, @@ -83,7 +101,7 @@ accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow3_range, pow3_range}), - precision_range, + precision_range_sp_dp, batch_range, stride_range, stride_range, @@ -92,11 +110,12 @@ place_range, true)), accuracy_test::TestName); + INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow3_range, pow3_range}), - precision_range, + precision_range_full, batch_range, stride_range, stride_range, @@ -110,7 +129,7 @@ accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow5_range, pow5_range}), - precision_range, + precision_range_sp_dp, batch_range, stride_range, stride_range, @@ -119,11 +138,12 @@ place_range, true)), accuracy_test::TestName); + INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow5_range, pow5_range}), - precision_range, + precision_range_full, batch_range, stride_range, stride_range, @@ -137,7 +157,7 @@ accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({prime_range, prime_range}), - precision_range, + precision_range_sp_dp, batch_range, stride_range, stride_range, @@ -146,11 +166,12 @@ place_range, true)), accuracy_test::TestName); + INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({prime_range, prime_range}), - precision_range, + precision_range_sp_dp, batch_range, stride_range, stride_range, @@ -164,7 +185,7 @@ accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({mix_range, mix_range}), - precision_range, + precision_range_sp_dp, batch_range, stride_range, stride_range, @@ -173,11 +194,12 
@@ place_range, true)), accuracy_test::TestName); + INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({mix_range, mix_range}), - precision_range, + precision_range_full, batch_range, stride_range, stride_range, @@ -192,7 +214,7 @@ accuracy_test, ::testing::ValuesIn(param_generator( generate_lengths({{1}, {4, 8, 8192, 3, 27, 7, 11, 5000, 8000}}), - precision_range, + precision_range_full, batch_range, stride_range, stride_range, @@ -207,7 +229,7 @@ accuracy_test, ::testing::ValuesIn(param_generator( generate_lengths({{4, 8, 8192, 3, 27, 7, 11, 5000, 8000}, {1}}), - precision_range, + precision_range_full, batch_range, stride_range, stride_range, diff -Nru rocfft-5.5.0/clients/tests/accuracy_test_3D.cpp rocfft-5.7.1/clients/tests/accuracy_test_3D.cpp --- rocfft-5.5.0/clients/tests/accuracy_test_3D.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/accuracy_test_3D.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -34,6 +34,9 @@ // TODO: 512, 1024, 2048 make the tests take too long; re-enable when // test speed is improved. 
static std::vector pow2_range = {4, 8, 16, 32, 128, 256}; +// For the current configuration, half-precision has a fft size limit of 65536 +static std::vector pow2_range_half = {4, 8, 16, 32}; + // SBCC+SBRC as a sub-node of a 3D TRTRTR std::vector> pow2_adhoc = {{4, 4, 8192}}; @@ -55,7 +58,7 @@ pow2_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow2_range, pow2_range}), - precision_range, + precision_range_sp_dp, batch_range, stride_range, stride_range, @@ -65,11 +68,25 @@ true)), accuracy_test::TestName); +INSTANTIATE_TEST_SUITE_P(pow2_3D_half, + accuracy_test, + ::testing::ValuesIn(param_generator( + generate_lengths({pow2_range_half, pow2_range_half, pow2_range_half}), + {fft_precision_half}, + batch_range, + stride_range, + stride_range, + ioffset_range_zero, + ooffset_range_zero, + place_range, + true)), + accuracy_test::TestName); + INSTANTIATE_TEST_SUITE_P( DISABLED_offset_pow2_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow2_range, pow2_range}), - precision_range, + precision_range_full, batch_range, stride_range, stride_range, @@ -83,7 +100,7 @@ pow3_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow3_range, pow3_range, pow3_range}), - precision_range, + precision_range_sp_dp, batch_range, stride_range, stride_range, @@ -96,7 +113,7 @@ DISABLED_offset_pow3_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow3_range, pow3_range, pow3_range}), - precision_range, + precision_range_full, batch_range, stride_range, stride_range, @@ -110,7 +127,7 @@ pow5_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow5_range, pow5_range, pow5_range}), - precision_range, + precision_range_sp_dp, batch_range, stride_range, stride_range, @@ -123,7 +140,7 @@ DISABLED_offset_pow5_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow5_range, pow5_range, pow5_range}), - precision_range, + 
precision_range_full, batch_range, stride_range, stride_range, @@ -137,7 +154,7 @@ prime_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({prime_range, prime_range, prime_range}), - precision_range, + precision_range_sp_dp, batch_range, stride_range, stride_range, @@ -150,7 +167,7 @@ DISABLED_offset_prime_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({prime_range, prime_range, prime_range}), - precision_range, + precision_range_full, batch_range, stride_range, stride_range, @@ -164,7 +181,7 @@ mix_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow3_range, prime_range}), - precision_range, + precision_range_sp_dp, batch_range, stride_range, stride_range, @@ -177,7 +194,7 @@ DISABLED_offset_mix_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow3_range, prime_range}), - precision_range, + precision_range_full, batch_range, stride_range, stride_range, @@ -195,7 +212,7 @@ sbrc_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({sbrc_range, sbrc_range, sbrc_range}), - precision_range, + precision_range_sp_dp, sbrc_batch_range, stride_range, stride_range, @@ -207,13 +224,15 @@ // pick small sizes that will exercise 2D_SINGLE and a couple of sizes that won't static std::vector inner_batch_3D_range = {4, 8, 16, 32, 20, 24, 64}; +static std::vector inner_batch_3D_range_half = {4, 8, 16, 32, 20, 24}; static std::vector inner_batch_3D_batch_range = {3, 2, 1}; + INSTANTIATE_TEST_SUITE_P( inner_batch_3D, accuracy_test, ::testing::ValuesIn(param_generator( generate_lengths({inner_batch_3D_range, inner_batch_3D_range, inner_batch_3D_range}), - precision_range, + precision_range_sp_dp, inner_batch_3D_batch_range, stride_generator_3D_inner_batch(stride_range), stride_generator_3D_inner_batch(stride_range), @@ -222,3 +241,19 @@ place_range, true)), accuracy_test::TestName); + +INSTANTIATE_TEST_SUITE_P( + inner_batch_3D_half, + 
accuracy_test, + ::testing::ValuesIn(param_generator(generate_lengths({inner_batch_3D_range_half, + inner_batch_3D_range_half, + inner_batch_3D_range_half}), + {fft_precision_half}, + inner_batch_3D_batch_range, + stride_generator_3D_inner_batch(stride_range), + stride_generator_3D_inner_batch(stride_range), + ioffset_range_zero, + ooffset_range_zero, + place_range, + true)), + accuracy_test::TestName); \ No newline at end of file diff -Nru rocfft-5.5.0/clients/tests/accuracy_test_adhoc.cpp rocfft-5.7.1/clients/tests/accuracy_test_adhoc.cpp --- rocfft-5.5.0/clients/tests/accuracy_test_adhoc.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/accuracy_test_adhoc.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -1,5 +1,5 @@ -// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -19,8 +19,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. -#include "../fft_params.h" - #include "accuracy_test.h" std::vector> adhoc_sizes = { @@ -54,6 +52,9 @@ // TILE_UNALIGNED type of SBRC 3D ERC {98, 98, 98}, + + // 3D_BLOCK_CR + {336, 336, 56}, }; const static std::vector> stride_range = {{1}}; @@ -67,7 +68,7 @@ INSTANTIATE_TEST_SUITE_P(adhoc, accuracy_test, ::testing::ValuesIn(param_generator(adhoc_sizes, - precision_range, + precision_range_sp_dp, batch_range, stride_range, stride_range, @@ -80,7 +81,7 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_offset_adhoc, accuracy_test, ::testing::ValuesIn(param_generator(adhoc_sizes, - precision_range, + precision_range_full, batch_range, stride_range, stride_range, @@ -90,23 +91,23 @@ true)), accuracy_test::TestName); +// Test that dist is ignored for batch-1 transforms. 
Normally, +// in-place transforms require same dist, but for batch-1 dist isn't +// used for anything and differing dist should be allowed. inline auto param_permissive_iodist() { std::vector> lengths = adhoc_sizes; - // TODO- for these permissive iodist tests, - // some 98^3 sizes take too long for the exhaustive search buffer assignments - // about millions of assignments, thus the program is hung there. - // So we take this length out from iodist test for now. - lengths.erase(std::find(lengths.begin(), lengths.end(), std::vector{98, 98, 98})); lengths.push_back({4}); std::vector params; - for(const auto precision : precision_range) + for(const auto precision : precision_range_sp_dp) { for(const auto trans_type : trans_type_range) { for(const auto& types : generate_types(trans_type, place_range, true)) { + if(std::get<1>(types) != fft_placement_inplace) + continue; for(const auto& len : lengths) { fft_params param; @@ -133,14 +134,64 @@ ::testing::ValuesIn(param_permissive_iodist()), accuracy_test::TestName); -inline auto param_adhoc_stride() +inline auto param_adhoc_colmajor() { - std::vector> lengths = adhoc_sizes; - lengths.push_back({4}); + // generate basic FFTs of adhoc sizes + auto params = param_generator(adhoc_sizes, + {fft_precision_single}, + {2}, + stride_range, + stride_range, + ioffset_range_zero, + ooffset_range_zero, + {fft_placement_notinplace}, + false); + + // remove any params that are: + // - 1D (not enough dims to swap) + // - real-complex 2D (we only get to play with higher dims, so + // again not enough dims to swap) + params.erase(std::remove_if(params.begin(), + params.end(), + [](const fft_params& param) { + if(param.length.size() == 1) + return true; + if(param.length.size() == 2) + { + if(param.transform_type == fft_transform_type_real_forward + || param.transform_type + == fft_transform_type_real_inverse) + return true; + } + return false; + }), + params.end()); + + // reverse length/stride order on remaining params to make them + // 
col-major + std::for_each(params.begin(), params.end(), [](fft_params& param) { + size_t start_dim = 0; + // for real-complex we can't touch the fastest dim + if(param.transform_type == fft_transform_type_real_forward + || param.transform_type == fft_transform_type_real_inverse) + ++start_dim; + std::reverse(param.length.rbegin() + start_dim, param.length.rend()); + std::reverse(param.istride.rbegin() + start_dim, param.istride.rend()); + std::reverse(param.ostride.rbegin() + start_dim, param.ostride.rend()); + }); + return params; +} +INSTANTIATE_TEST_SUITE_P(adhoc_colmajor, + accuracy_test, + ::testing::ValuesIn(param_adhoc_colmajor()), + accuracy_test::TestName); + +inline auto param_adhoc_stride() +{ std::vector params; - for(const auto precision : precision_range) + for(const auto precision : precision_range_full) { for(const auto& types : generate_types(fft_transform_type_complex_forward, {fft_placement_inplace, fft_placement_notinplace}, @@ -164,38 +215,35 @@ // test C2R/R2C with non-contiguous higher strides and dist - we // want unit stride for length0 so we do the even-length optimization - for(const auto trans_type : - {fft_transform_type_real_forward, fft_transform_type_real_inverse}) + for(const auto& types : + generate_types(fft_transform_type_real_forward, {fft_placement_notinplace}, true)) { - for(const auto& types : generate_types(trans_type, {fft_placement_notinplace}, true)) - { - fft_params param; - param.length = {4, 4, 4}; - param.precision = precision; - param.idist = 0; - param.odist = 0; - param.transform_type = trans_type; - param.nbatch = 2; - param.placement = std::get<1>(types); - param.itype = std::get<2>(types); - param.otype = std::get<3>(types); - param.istride = {16, 4, 1}; - param.ostride = {16, 4, 1}; - params.push_back(param); - - param.length = {2, 2, 2}; - param.precision = precision; - param.idist = 0; - param.odist = 0; - param.transform_type = trans_type; - param.nbatch = 2; - param.placement = std::get<1>(types); - 
param.itype = std::get<2>(types); - param.otype = std::get<3>(types); - param.istride = {20, 6, 1}; - param.ostride = {20, 6, 1}; - params.push_back(param); - } + fft_params param; + param.length = {4, 4, 4}; + param.precision = precision; + param.idist = 0; + param.odist = 0; + param.transform_type = fft_transform_type_real_forward; + param.nbatch = 2; + param.placement = std::get<1>(types); + param.itype = std::get<2>(types); + param.otype = std::get<3>(types); + param.istride = {16, 4, 1}; + param.ostride = {16, 4, 1}; + params.push_back(param); + + param.length = {2, 2, 2}; + param.precision = precision; + param.idist = 0; + param.odist = 0; + param.transform_type = fft_transform_type_real_forward; + param.nbatch = 2; + param.placement = std::get<1>(types); + param.itype = std::get<2>(types); + param.otype = std::get<3>(types); + param.istride = {20, 6, 1}; + param.ostride = {20, 6, 1}; + params.push_back(param); } } diff -Nru rocfft-5.5.0/clients/tests/accuracy_test_callback.cpp rocfft-5.7.1/clients/tests/accuracy_test_callback.cpp --- rocfft-5.5.0/clients/tests/accuracy_test_callback.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/accuracy_test_callback.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -18,8 +18,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
-#include "../rocfft_params.h" - #include "accuracy_test.h" std::vector> callback_sizes = { @@ -102,16 +100,14 @@ const static std::vector> ioffset_range = {{0, 0}, {1, 1}}; const static std::vector> ooffset_range = {{0, 0}, {1, 1}}; -auto transform_types = {fft_transform_type_complex_forward, - fft_transform_type_complex_inverse, - fft_transform_type_real_forward, - fft_transform_type_real_inverse}; +auto forward_transform_types + = {fft_transform_type_complex_forward, fft_transform_type_real_forward}; INSTANTIATE_TEST_SUITE_P(callback, accuracy_test, - ::testing::ValuesIn(param_generator_base(transform_types, + ::testing::ValuesIn(param_generator_base(forward_transform_types, callback_sizes, - precision_range, + precision_range_sp_dp, batch_range, generate_types, stride_range, @@ -125,9 +121,9 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_callback, accuracy_test, - ::testing::ValuesIn(param_generator_base(transform_types, + ::testing::ValuesIn(param_generator_base(forward_transform_types, callback_sizes, - precision_range, + precision_range_sp_dp, batch_range, generate_types, stride_range, @@ -145,7 +141,7 @@ inline auto param_generator_scaling(const std::vector>& v_lengths) { auto params = param_generator(callback_sizes, - precision_range, + precision_range_sp_dp, batch_range, stride_range, stride_range, diff -Nru rocfft-5.5.0/clients/tests/accuracy_test_checkstride.cpp rocfft-5.7.1/clients/tests/accuracy_test_checkstride.cpp --- rocfft-5.5.0/clients/tests/accuracy_test_checkstride.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/accuracy_test_checkstride.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -18,8 +18,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
-#include "../fft_params.h" - #include "accuracy_test.h" inline auto param_checkstride() @@ -59,7 +57,7 @@ { for(const auto& s : sizes) { - for(const auto precision : precision_range) + for(const auto precision : precision_range_sp_dp) { for(const auto& types : generate_types(trans_type, {fft_placement_notinplace}, true)) diff -Nru rocfft-5.5.0/clients/tests/cmake/FindFFTW.cmake rocfft-5.7.1/clients/tests/cmake/FindFFTW.cmake --- rocfft-5.5.0/clients/tests/cmake/FindFFTW.cmake 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/cmake/FindFFTW.cmake 2023-08-09 16:19:51.000000000 +0000 @@ -40,6 +40,8 @@ # message( STATUS "FFTW_FIND_REQUIRED_FLOAT: ${FFTW_FIND_REQUIRED_FLOAT}" ) # message( STATUS "FFTW_FIND_REQUIRED_DOUBLE: ${FFTW_FIND_REQUIRED_DOUBLE}" ) +include( CheckSymbolExists ) + set( FFTW_LIBRARIES "" ) if( FFTW_FIND_REQUIRED_FLOAT OR FFTW_FIND_REQUIRED_SINGLE ) find_library( FFTW_LIBRARIES_SINGLE @@ -68,6 +70,9 @@ list( APPEND FFTW_LIBRARIES ${FFTWF_THREADS_LIBRARY} ) set( FFTW_MULTITHREAD TRUE ) endif() + + list( APPEND CMAKE_REQUIRED_LIBRARIES ${FFTW_LIBRARIES_SINGLE} ) + check_symbol_exists( fftwf_sprint_plan "fftw3.h" FFTW_HAVE_SPRINT_PLAN ) endif( ) if( FFTW_FIND_REQUIRED_DOUBLE ) @@ -97,8 +102,15 @@ list( APPEND FFTW_LIBRARIES ${FFTW_THREADS_LIBRARY} ) set( FFTW_MULTITHREAD TRUE ) endif() + + list( APPEND CMAKE_REQUIRED_LIBRARIES ${FFTW_LIBRARIES_DOUBLE} ) + check_symbol_exists( fftw_sprint_plan "fftw3.h" FFTW_HAVE_SPRINT_PLAN ) endif( ) +if( BUILD_FFTW OR FFTW_HAVE_SPRINT_PLAN ) + target_compile_definitions( rocfft-test PUBLIC FFTW_HAVE_SPRINT_PLAN ) +endif() + include( FindPackageHandleStandardArgs ) FIND_PACKAGE_HANDLE_STANDARD_ARGS( FFTW REQUIRED_VARS FFTW_INCLUDE_DIRS FFTW_LIBRARIES ) diff -Nru rocfft-5.5.0/clients/tests/default_callbacks_test.cpp rocfft-5.7.1/clients/tests/default_callbacks_test.cpp --- rocfft-5.5.0/clients/tests/default_callbacks_test.cpp 2023-01-31 06:20:16.000000000 +0000 +++ 
rocfft-5.7.1/clients/tests/default_callbacks_test.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -26,7 +26,7 @@ #include -#include "../rocfft_params.h" +#include "../../shared/rocfft_params.h" #include "fftw_transform.h" #include "rocfft.h" @@ -40,10 +40,10 @@ return data[offset]; } -__device__ auto load_cb_double2 = load_cb; -__device__ auto load_cb_double = load_cb; -__device__ auto load_cb_float2 = load_cb; -__device__ auto load_cb_float = load_cb; +__device__ auto load_cb_complex_double = load_cb>; +__device__ auto load_cb_double = load_cb; +__device__ auto load_cb_complex_float = load_cb>; +__device__ auto load_cb_float = load_cb; // ------------------------------------- // default store callback definitions @@ -55,10 +55,10 @@ data[offset] = element; } -__device__ auto store_cb_double2 = store_cb; -__device__ auto store_cb_double = store_cb; -__device__ auto store_cb_float2 = store_cb; -__device__ auto store_cb_float = store_cb; +__device__ auto store_cb_complex_double = store_cb>; +__device__ auto store_cb_double = store_cb; +__device__ auto store_cb_complex_float = store_cb>; +__device__ auto store_cb_float = store_cb; // ------------------------------------- // type traits definitions @@ -71,13 +71,13 @@ }; template <> -struct is_hip_complex +struct is_hip_complex> { static const bool value = true; }; template <> -struct is_hip_complex +struct is_hip_complex> { static const bool value = true; }; @@ -110,15 +110,15 @@ float low_bound_f = -1.0f, up_bound_f = 1.0f; double low_bound_d = -1.0, up_bound_d = 1.0; - std::vector h_mem_out_f2, h_mem_out_no_cb_f2; - std::vector h_mem_out_d2, h_mem_out_no_cb_d2; + std::vector> h_mem_out_f2, h_mem_out_no_cb_f2; + std::vector> h_mem_out_d2, h_mem_out_no_cb_d2; switch(fwrd_transf_type) { case rocfft_transform_type_complex_forward: { - std::vector h_mem_in_f2; - std::vector h_mem_in_d2; + std::vector> h_mem_in_f2; + std::vector> h_mem_in_d2; (frwd_transf_precision == rocfft_precision_single) ? 
run(low_bound_f, up_bound_f, h_mem_in_f2, h_mem_out_f2, h_mem_out_no_cb_f2) @@ -293,18 +293,16 @@ void validate_test(const std::vector& host_mem_out, const std::vector& host_mem_out_no_cb) { - std::vector> linf_failures; - auto diff = distance_1to1_complex( - reinterpret_cast*>(host_mem_out.data()), - reinterpret_cast*>(host_mem_out_no_cb.data()), + reinterpret_cast*>(host_mem_out.data()), + reinterpret_cast*>(host_mem_out_no_cb.data()), host_mem_out.size(), 1, 1, host_mem_out.size(), 1, host_mem_out_no_cb.size(), - linf_failures, + nullptr, type_epsilon(), {0}, {0}); @@ -320,10 +318,11 @@ void set_load_callback(){}; template <> - void set_load_callback() + void set_load_callback>() { - EXPECT_EQ(hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_double2), sizeof(void*)), - hipSuccess); + EXPECT_EQ( + hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_complex_double), sizeof(void*)), + hipSuccess); }; template <> @@ -334,10 +333,11 @@ }; template <> - void set_load_callback() + void set_load_callback>() { - EXPECT_EQ(hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_float2), sizeof(void*)), - hipSuccess); + EXPECT_EQ( + hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_complex_float), sizeof(void*)), + hipSuccess); }; template <> @@ -355,10 +355,11 @@ void set_store_callback(){}; template <> - void set_store_callback() + void set_store_callback>() { - EXPECT_EQ(hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_double2), sizeof(void*)), - hipSuccess); + EXPECT_EQ( + hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_complex_double), sizeof(void*)), + hipSuccess); }; template <> @@ -369,10 +370,11 @@ }; template <> - void set_store_callback() + void set_store_callback>() { - EXPECT_EQ(hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_float2), sizeof(void*)), - hipSuccess); + EXPECT_EQ( + hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_complex_float), sizeof(void*)), + hipSuccess); }; template <> diff -Nru 
rocfft-5.5.0/clients/tests/fftw_transform.h rocfft-5.7.1/clients/tests/fftw_transform.h --- rocfft-5.5.0/clients/tests/fftw_transform.h 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/fftw_transform.h 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -23,7 +23,6 @@ #define FFTWTRANSFORM_H #include "test_params.h" -#include #include #include @@ -44,6 +43,11 @@ template inline double type_epsilon(); template <> +inline double type_epsilon<_Float16>() +{ + return half_epsilon; +} +template <> inline double type_epsilon() { return single_epsilon; @@ -61,6 +65,13 @@ template struct fftw_trait; template <> +struct fftw_trait<_Float16> +{ + // fftw does not support half precision, so use single precision and convert + using fftw_complex_type = fftwf_complex; + using fftw_plan_type = fftwf_plan; +}; +template <> struct fftw_trait { using fftw_complex_type = fftwf_complex; @@ -73,6 +84,36 @@ using fftw_plan_type = fftw_plan; }; +// Copies the half-precision input buffer to a single-precision +// buffer. Note that the input buffer is already sized like it's a +// single-precision buffer (but only half of it is filled), because +// we allocate a single-precision buffer for FFTW to plan with. 
+static hostbuf half_to_single_copy(const hostbuf& in) +{ + auto out = in.copy(); + auto in_begin = reinterpret_cast(in.data()); + std::copy_n(in_begin, in.size() / sizeof(_Float16) / 2, reinterpret_cast(out.data())); + return out; +} + +// converts a wider precision buffer to a narrower precision, in-place +template +void narrow_precision_inplace(hostbuf& in) +{ + // ensure we're actually shrinking the data + static_assert(sizeof(TfloatIn) > sizeof(TfloatOut)); + + auto readPtr = reinterpret_cast(in.data()); + auto writePtr = reinterpret_cast(in.data()); + std::copy_n(readPtr, in.size() / sizeof(TfloatIn), writePtr); + in.shrink(in.size() / (sizeof(TfloatIn) / sizeof(TfloatOut))); +} + +static void single_to_half_inplace(hostbuf& in) +{ + narrow_precision_inplace(in); +} + // Template wrappers for real-valued FFTW allocators: template inline Tfloat* fftw_alloc_real_type(size_t n); @@ -124,14 +165,14 @@ return fftw_alloc_complex_type(n); } template <> -inline std::complex* fftw_alloc_type>(size_t n) +inline rocfft_complex* fftw_alloc_type>(size_t n) { - return (std::complex*)fftw_alloc_complex_type(n); + return (rocfft_complex*)fftw_alloc_complex_type(n); } template <> -inline std::complex* fftw_alloc_type>(size_t n) +inline rocfft_complex* fftw_alloc_type>(size_t n) { - return (std::complex*)fftw_alloc_complex_type(n); + return (rocfft_complex*)fftw_alloc_complex_type(n); } // Template wrappers for FFTW plan executors: @@ -175,6 +216,20 @@ unsigned flags); template <> +inline typename fftw_trait<_Float16>::fftw_plan_type + fftw_plan_guru64_dft<_Float16>(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait<_Float16>::fftw_complex_type* in, + typename fftw_trait<_Float16>::fftw_complex_type* out, + int sign, + unsigned flags) +{ + return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); +} + +template <> inline typename fftw_trait::fftw_plan_type 
fftw_plan_guru64_dft(int rank, const fftw_iodim64* dims, @@ -204,22 +259,42 @@ // Template wrappers for FFTW c2c executors: template -inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, - typename fftw_trait::fftw_complex_type* in, - typename fftw_trait::fftw_complex_type* out); +inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out); + +template <> +inline void fftw_plan_execute_c2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + // since FFTW does not natively support half precision, convert + // input to single, execute, then convert output back to half + auto in_single = half_to_single_copy(in.front()); + fftwf_execute_dft(plan, + reinterpret_cast(in_single.data()), + reinterpret_cast(out.front().data())); + single_to_half_inplace(out.front()); +} + template <> -inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, - typename fftw_trait::fftw_complex_type* in, - typename fftw_trait::fftw_complex_type* out) +inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) { - fftwf_execute_dft(plan, in, out); + fftwf_execute_dft(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); } + template <> -inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, - typename fftw_trait::fftw_complex_type* in, - typename fftw_trait::fftw_complex_type* out) +inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) { - fftw_execute_dft(plan, in, out); + fftw_execute_dft(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); } // Template wrappers for FFTW r2c planners: @@ -233,6 +308,19 @@ typename fftw_trait::fftw_complex_type* out, unsigned flags); template <> +inline typename fftw_trait<_Float16>::fftw_plan_type + 
fftw_plan_guru64_r2c<_Float16>(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + _Float16* in, + typename fftw_trait<_Float16>::fftw_complex_type* out, + unsigned flags) +{ + return fftwf_plan_guru64_dft_r2c( + rank, dims, howmany_rank, howmany_dims, reinterpret_cast(in), out, flags); +} +template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_r2c(int rank, const fftw_iodim64* dims, @@ -259,22 +347,39 @@ // Template wrappers for FFTW r2c executors: template -inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, - Tfloat* in, - typename fftw_trait::fftw_complex_type* out); -template <> -inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, - float* in, - typename fftw_trait::fftw_complex_type* out) -{ - fftwf_execute_dft_r2c(plan, in, out); -} -template <> -inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, - double* in, - typename fftw_trait::fftw_complex_type* out) -{ - fftw_execute_dft_r2c(plan, in, out); +inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out); +template <> +inline void fftw_plan_execute_r2c<_Float16>(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + // since FFTW does not natively support half precision, convert + // input to single, execute, then convert output back to half + auto in_single = half_to_single_copy(in.front()); + fftwf_execute_dft_r2c(plan, + reinterpret_cast(in_single.data()), + reinterpret_cast(out.front().data())); + single_to_half_inplace(out.front()); +} +template <> +inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) +{ + fftwf_execute_dft_r2c(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} +template <> +inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + 
std::vector& out) +{ + fftw_execute_dft_r2c(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); } // Template wrappers for FFTW c2r planners: @@ -288,6 +393,19 @@ Tfloat* out, unsigned flags); template <> +inline typename fftw_trait<_Float16>::fftw_plan_type + fftw_plan_guru64_c2r<_Float16>(int rank, + const fftw_iodim64* dims, + int howmany_rank, + const fftw_iodim64* howmany_dims, + typename fftw_trait<_Float16>::fftw_complex_type* in, + _Float16* out, + unsigned flags) +{ + return fftwf_plan_guru64_dft_c2r( + rank, dims, howmany_rank, howmany_dims, in, reinterpret_cast(out), flags); +} +template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_c2r(int rank, const fftw_iodim64* dims, @@ -314,56 +432,60 @@ // Template wrappers for FFTW c2r executors: template -inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, - typename fftw_trait::fftw_complex_type* in, - Tfloat* out); +inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out); template <> -inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, - typename fftw_trait::fftw_complex_type* in, - float* out) +inline void fftw_plan_execute_c2r<_Float16>(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) { - fftwf_execute_dft_c2r(plan, in, out); + // since FFTW does not natively support half precision, convert + // input to single, execute, then convert output back to half + auto in_single = half_to_single_copy(in.front()); + fftwf_execute_dft_c2r(plan, + reinterpret_cast(in_single.data()), + reinterpret_cast(out.front().data())); + single_to_half_inplace(out.front()); } template <> -inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, - typename fftw_trait::fftw_complex_type* in, - double* out) +inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) { - 
fftw_execute_dft_c2r(plan, in, out); + fftwf_execute_dft_c2r(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); } - -// Allocator / deallocator for FFTW arrays. -template -struct fftwAllocator +template <> +inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, + std::vector& in, + std::vector& out) { - using value_type = Tdata; - - fftwAllocator() = default; - template - fftwAllocator(const fftwAllocator&) - { - } - - Tdata* allocate(size_t n) - { - return (Tdata*)fftw_malloc(sizeof(Tdata) * n); - } - void deallocate(Tdata* data, size_t n) - { - fftw_free(data); - } -}; + fftw_execute_dft_c2r(plan, + reinterpret_cast(in.front().data()), + reinterpret_cast(out.front().data())); +} -template -inline bool operator==(const fftwAllocator&, const fftwAllocator&) +#ifdef FFTW_HAVE_SPRINT_PLAN +// Template wrappers for FFTW print plan: +template +inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan); +template <> +inline char* fftw_sprint_plan<_Float16>(const typename fftw_trait<_Float16>::fftw_plan_type plan) { - return true; + return fftwf_sprint_plan(plan); } - -template -inline bool operator!=(const fftwAllocator& a, const fftwAllocator& b) +template <> +inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan) +{ + return fftwf_sprint_plan(plan); +} +template <> +inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan) { - return !(a == b); + return fftw_sprint_plan(plan); } +#endif #endif diff -Nru rocfft-5.5.0/clients/tests/gtest_main.cpp rocfft-5.7.1/clients/tests/gtest_main.cpp --- rocfft-5.5.0/clients/tests/gtest_main.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/gtest_main.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -34,7 +34,6 @@ #include "../../shared/concurrency.h" #include "../../shared/environment.h" #include "../../shared/work_queue.h" -#include "../rocfft_params.h" #include "rocfft.h" #include "rocfft_accuracy_test.h" #include "test_params.h" @@ -53,14 +52,27 @@ // User-defined random seed size_t random_seed; +// Probability of running individual planar FFTs +double planar_prob; +// Probability of running individual callback FFTs +double callback_prob; // Transform parameters for manual test: fft_params manual_params; -// Ram limitation for tests (GiB): +// Host memory limitation for tests (GiB): size_t ramgb; +// Device memory limitation for tests (GiB): +size_t vramgb; + +// Allow skipping tests if there is a runtime error +bool skip_runtime_fails; +// But count the number of failures +int n_hip_failures = 0; + // Manually specified precision cutoffs: +double half_epsilon; double single_epsilon; double double_epsilon; @@ -69,6 +81,8 @@ double max_l2_eps_double = 0.0; double max_linf_eps_single = 0.0; double max_l2_eps_single = 0.0; +double max_linf_eps_half = 0.0; +double max_l2_eps_half = 0.0; // Control whether we use FFTW's wisdom (which we use to imply FFTW_MEASURE). bool use_fftw_wisdom = false; @@ -76,27 +90,31 @@ // Cache the last cpu fft that was requested last_cpu_fft_cache last_cpu_fft_data; -static size_t get_system_memory_GiB() +system_memory get_system_memory() { - // system memory often has a little chunk carved out for other - // stuff, so round up to nearest GiB. 
+ system_memory memory_data; #ifdef WIN32 MEMORYSTATUSEX info; info.dwLength = sizeof(info); if(!GlobalMemoryStatusEx(&info)) - return 0; - return (info.ullTotalPhys + ONE_GiB - 1) / ONE_GiB; + return memory_data; + memory_data.total_bytes = info.ullTotalPhys; + memory_data.free_bytes = info.ullAvailPhys; #else struct sysinfo info; if(sysinfo(&info) != 0) - return 0; - return (info.totalram * info.mem_unit + ONE_GiB - 1) / ONE_GiB; + return memory_data; + memory_data.total_bytes = info.totalram * info.mem_unit; + memory_data.free_bytes = info.freeram * info.mem_unit; #endif + return memory_data; } +system_memory start_memory = get_system_memory(); + void precompile_test_kernels(const std::string& precompile_file) { - std::cout << "precompiling test kernels..." << std::endl; + std::cout << "precompiling test kernels...\n"; WorkQueue tokenQueue; std::vector tokens; @@ -137,7 +155,7 @@ std::mt19937 dist(dev()); std::shuffle(tokens.begin(), tokens.end(), dist); auto precompile_begin = std::chrono::steady_clock::now(); - std::cout << "precompiling " << tokens.size() << " FFT plans..." << std::endl; + std::cout << "precompiling " << tokens.size() << " FFT plans...\n"; for(auto&& t : tokens) tokenQueue.push(std::move(t)); @@ -153,10 +171,30 @@ std::string token{tokenQueue.pop()}; if(token.empty()) break; - rocfft_params params; - params.from_token(token); - params.validate(); - params.setup_structs(); + + try + { + rocfft_params params_forward; + params_forward.from_token(token); + params_forward.validate(); + params_forward.setup_structs(); + + params_forward.free(); + + rocfft_params params_inverse; + params_inverse.inverse_from_forward(params_forward); + params_inverse.validate(); + params_inverse.setup_structs(); + } + catch(std::exception& e) + { + // failed to create a plan, abort + // + // we could continue on, but the test should just + // fail later anyway in the same way. 
so report + // which token failed early and get out + throw std::runtime_error(token + " plan creation failure: " + e.what()); + } } }); // insert empty tokens to tell threads to stop @@ -168,25 +206,12 @@ auto precompile_end = std::chrono::steady_clock::now(); std::chrono::duration precompile_ms = precompile_end - precompile_begin; std::cout << "done precompiling FFT plans in " << static_cast(precompile_ms.count()) - << " ms" << std::endl; + << " ms\n"; } int main(int argc, char* argv[]) { - // NB: If we initialize gtest first, then it removes all of its own command-line - // arguments and sets argc and argv correctly; no need to jump through hoops for - // boost::program_options. - ::testing::InitGoogleTest(&argc, argv); - - // Filename for fftw and fftwf wisdom. - std::string fftw_wisdom_filename; - - // Token string to fully specify fft params for the manual test. - std::string test_token; - - // Filename for precompiled kernels to be written to - std::string precompile_file; - + // We would like to parse a few arguments before initiating gtest. 
po::options_description opdesc( "\n" "rocFFT Runtime Test command line options\n" @@ -206,13 +231,46 @@ " HP - hermitian planar\n" "\n" "Usage"); + // clang-format off + opdesc.add_options() + ("verbose,v", + po::value()->default_value(0), + "print out detailed information for the tests.") + ("seed", po::value(&random_seed), + "Random seed; if unset, use an actual random seed.") + ("planar_prob", po::value(&planar_prob)->default_value(0.1), + "Probability of running individual planar transforms") + ("callback_prob", po::value(&callback_prob)->default_value(0.1), + "Probability of running individual callback transforms"); + // clang-format on + po::variables_map vm; + po::store(po::command_line_parser(argc, argv).options(opdesc).allow_unregistered().run(), vm); + po::notify(vm); + + verbose = vm["verbose"].as(); + + // NB: If we initialize gtest first, then it removes all of its own command-line + // arguments and sets argc and argv correctly; no need to jump through hoops for + // boost::program_options. + ::testing::InitGoogleTest(&argc, argv); + + // Filename for fftw and fftwf wisdom. + std::string fftw_wisdom_filename; + + // Token string to fully specify fft params for the manual test. + std::string test_token; + + // Filename for precompiled kernels to be written to + std::string precompile_file; + // Declare the supported options. 
// clang-format doesn't handle boost program options very well: // clang-format off opdesc.add_options() ("help,h", "produces this help message") - ("verbose,v", po::value()->default_value(0), - "print out detailed information for the tests.") + ("skip_runtime_fails", po::value(&skip_runtime_fails)->default_value(true), + "Skip the test if there is a runtime failure.") + ("version", "Print queryable version information from the rocfft library and exit") ("transformType,t", po::value(&manual_params.transform_type) ->default_value(fft_transform_type_complex_forward), "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " @@ -220,7 +278,9 @@ ("notInPlace,o", "Not in-place FFT transform (default: in-place)") ("callback", "Inject load/store callbacks") ("checkstride", "Check that data is not written outside of output strides") - ("double", "Double precision transform (default: single)") + ("double", "Double precision transform (deprecated: use --precision double)") + ("precision", po::value(&manual_params.precision), + "Transform precision: single (default), double, half") ( "itype", po::value(&manual_params.itype) ->default_value(fft_array_type_unset), "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) " @@ -248,39 +308,46 @@ "Logical size of input buffer.") ("osize", po::value>(&manual_params.osize)->multitoken(), "Logical size of output.") - ("R", po::value(&ramgb)->default_value(get_system_memory_GiB()), "Ram limit in GiB for tests.") - ("single_epsilon", po::value(&single_epsilon)->default_value(3.75e-5)) - ("double_epsilon", po::value(&double_epsilon)->default_value(1e-15)) + ("R", po::value(&ramgb)->default_value((start_memory.total_bytes + ONE_GiB - 1) / ONE_GiB), "Ram limit in GiB for tests.") + ("V", po::value(&vramgb)->default_value(0), "vram limit in GiB for tests.") + ("half_epsilon", po::value(&half_epsilon)->default_value(9.77e-4)) + ("single_epsilon", po::value(&single_epsilon)->default_value(3.75e-5)) + ("double_epsilon", 
po::value(&double_epsilon)->default_value(1e-15)) ("wise,w", "use FFTW wisdom") ("wisdomfile,W", po::value(&fftw_wisdom_filename)->default_value("wisdom3.txt"), "FFTW3 wisdom filename") ("scalefactor", po::value(&manual_params.scale_factor), "Scale factor to apply to output.") ("token", po::value(&test_token)->default_value(""), "Test token name for manual test") - ("precompile", po::value(&precompile_file), "Precompile kernels to a file for all test cases before running tests") - ("seed", po::value(&random_seed), "Random seed; if unset, use an actual random seed."); + ("precompile", po::value(&precompile_file), "Precompile kernels to a file for all test cases before running tests"); // clang-format on - po::variables_map vm; po::store(po::parse_command_line(argc, argv, opdesc), vm); po::notify(vm); if(vm.count("help")) { - std::cout << opdesc << std::endl; + std::cout << opdesc << "\n"; return 0; } - verbose = vm["verbose"].as(); - std::cout << "single epsilon: " << single_epsilon << "\tdouble epsilon: " << double_epsilon - << std::endl; + if(vm.count("version")) + { + char v[256]; + rocfft_get_version_string(v, 256); + std::cout << "version " << v << "\n"; + return EXIT_SUCCESS; + } + + std::cout << "half epsilon: " << half_epsilon << "\tsingle epsilon: " << single_epsilon + << "\tdouble epsilon: " << double_epsilon << "\n"; if(!vm.count("seed")) { std::random_device dev; random_seed = dev(); } - std::cout << "Random seed: " << random_seed << std::endl; + std::cout << "Random seed: " << random_seed << "\n"; if(vm.count("wise")) { @@ -302,7 +369,7 @@ rocfft_setup(); char v[256]; rocfft_get_version_string(v, 256); - std::cout << "rocFFT version: " << v << std::endl; + std::cout << "rocFFT version: " << v << "\n"; #ifdef FFTW_MULTITHREAD fftw_init_threads(); @@ -359,7 +426,7 @@ if(test_token != "") { - std::cout << "Reading fft params from token:\n" << test_token << std::endl; + std::cout << "Reading fft params from token:\n" << test_token << "\n"; try { @@ -367,7 
+434,7 @@ } catch(...) { - std::cout << "Unable to parse token." << std::endl; + std::cout << "Unable to parse token.\n"; return 1; } } @@ -381,7 +448,8 @@ manual_params.placement = vm.count("notInPlace") ? fft_placement_notinplace : fft_placement_inplace; - manual_params.precision = vm.count("double") ? fft_precision_double : fft_precision_single; + if(vm.count("double")) + manual_params.precision = fft_precision_double; if(vm.count("callback")) { @@ -424,23 +492,27 @@ rocfft_cleanup(); - std::cout << "single precision max l-inf epsilon: " << max_linf_eps_single << std::endl; - std::cout << "single precision max l2 epsilon: " << max_l2_eps_single << std::endl; - std::cout << "double precision max l-inf epsilon: " << max_linf_eps_double << std::endl; - std::cout << "double precision max l2 epsilon: " << max_l2_eps_double << std::endl; + std::cout << "Random seed: " << random_seed << "\n"; + std::cout << "half precision max l-inf epsilon: " << max_linf_eps_half << "\n"; + std::cout << "half precision max l2 epsilon: " << max_l2_eps_half << "\n"; + std::cout << "single precision max l-inf epsilon: " << max_linf_eps_single << "\n"; + std::cout << "single precision max l2 epsilon: " << max_l2_eps_single << "\n"; + std::cout << "double precision max l-inf epsilon: " << max_linf_eps_double << "\n"; + std::cout << "double precision max l2 epsilon: " << max_l2_eps_double << "\n"; + std::cout << "Number of runtime issues: " << n_hip_failures << "\n"; return retval; } -TEST(manual, vs_fftw) +TEST(manual, vs_fftw) // MANUAL TESTS HERE { // Run an individual test using the provided command-line parameters. 
manual_params.validate(); std::cout << "Manual test:" - << "\n\t" << manual_params.str("\n\t") << std::endl; + << "\n\t" << manual_params.str("\n\t") << "\n"; - std::cout << "Token: " << manual_params.token() << std::endl; + std::cout << "Token: " << manual_params.token() << "\n"; if(!manual_params.valid(verbose + 2)) { diff -Nru rocfft-5.5.0/clients/tests/hermitian_test.cpp rocfft-5.7.1/clients/tests/hermitian_test.cpp --- rocfft-5.5.0/clients/tests/hermitian_test.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/hermitian_test.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -19,7 +19,7 @@ // THE SOFTWARE. #include "../../shared/gpubuf.h" -#include "../rocfft_params.h" +#include "../../shared/rocfft_params.h" #include "../samples/rocfft/examplekernels.h" #include "../samples/rocfft/exampleutils.h" #include "accuracy_test.h" @@ -54,22 +54,24 @@ ASSERT_TRUE(p.valid(verbose)); - std::vector> h_input(p.isize[0]); + std::vector h_input(p.isize[0]); std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution dis(0.0, 1.0); for(auto& val : h_input) { - val = std::complex(dis(gen), dis(gen)); + val.x = dis(gen); + val.y = dis(gen); } - if(verbose) + if(verbose > 2) { std::cout << "non-Hermitian input:"; for(const auto& val : h_input) { - std::cout << " " << val; + std::cout << " " + << "(" << val.x << ", " << val.y << ")"; } std::cout << std::endl; } @@ -94,7 +96,7 @@ ASSERT_TRUE(hipDeviceSynchronize() == hipSuccess); - if(verbose) + if(verbose > 2) { std::cout << "output:"; for(const auto& val : h_output) @@ -104,20 +106,23 @@ std::cout << std::endl; } - std::vector> h_input1 = h_input; + std::vector 
h_input1(p.isize[0]); + std::copy(h_input.begin(), h_input.end(), h_input1.begin()); // Impose Hermitian symmetry on the input: - h_input1[0].imag(0.0); + h_input1[0].y = 0.0; + if(p.length[0] % 2 == 0) { - h_input1.back().imag(0.0); + h_input1.back().y = 0.0; } - if(verbose) + if(verbose > 2) { std::cout << "Hermitian input:"; for(const auto& val : h_input1) { - std::cout << " " << val; + std::cout << " " + << "(" << val.x << ", " << val.y << ")"; } std::cout << std::endl; } @@ -125,7 +130,8 @@ double maxdiff = 0.0; for(unsigned int i = 0; i < h_input.size(); ++i) { - auto val = std::abs(h_input[i] - h_input1[i]); + auto val = std::abs( + rocfft_complex(h_input[i].x - h_input1[i].x, h_input[i].y - h_input1[i].y)); if(val > maxdiff) maxdiff = val; } @@ -138,7 +144,7 @@ ASSERT_TRUE(hipMemcpy(h_output1.data(), obuf.data(), obuf.size(), hipMemcpyDeviceToHost) == hipSuccess); - if(verbose) + if(verbose > 2) { std::cout << "output:"; for(const auto& val : h_output1) @@ -227,8 +233,8 @@ // Data buffers: gpubuf buf; - ASSERT_TRUE(buf.alloc(sizeof(std::complex) * p.isize[0]) == hipSuccess); - std::vector> hbuf(p.isize[0]); + ASSERT_TRUE(buf.alloc(sizeof(hipDoubleComplex) * p.isize[0]) == hipSuccess); + std::vector hbuf(p.isize[0]); // Initialize a Hermitian-symmetric array; it should be symmetric. 
init_hermitiancomplex_cm(p.length_cm(), p.ilength_cm(), p.istride_cm(), buf.data()); @@ -249,7 +255,8 @@ std::uniform_real_distribution unif(0, 1); for(auto& v : hbuf) { - v = std::complex(unif(rng), unif(rng)); + v.x = unif(rng); + v.y = unif(rng); } if(verbose > 2) { @@ -288,15 +295,18 @@ ASSERT_TRUE(p.execute(pibuf.data(), pobuf.data()) == fft_status_success); - std::vector> h_output(p.osize[0]); - std::fill(h_output.begin(), h_output.end(), 0.0); + std::vector h_output(p.osize[0]); + std::fill(h_output.begin(), h_output.end(), hipDoubleComplex{0.0, 0.0}); + ASSERT_TRUE( hipMemcpy(h_output.data(), obuf.data(), p.obuffer_sizes()[0], hipMemcpyDeviceToHost) == hipSuccess); impose_hermitian_symmetry_cm(p.length_cm(), p.olength_cm(), p.ostride_cm(), obuf.data()); - std::vector> h_output_resym(p.osize[0]); - std::fill(h_output_resym.begin(), h_output_resym.end(), 0.0); + + std::vector h_output_resym(p.osize[0]); + std::fill(h_output_resym.begin(), h_output_resym.end(), hipDoubleComplex{0.0, 0.0}); + ASSERT_TRUE( hipMemcpy( h_output_resym.data(), obuf.data(), p.obuffer_sizes()[0], hipMemcpyDeviceToHost) @@ -305,8 +315,8 @@ double maxdiff = 0; for(unsigned int i = 0; i < h_output.size(); ++i) { - auto rdiff = std::abs(h_output[i].real() - h_output_resym[i].real()); - auto idiff = std::abs(h_output[i].imag() - h_output_resym[i].imag()); + auto rdiff = std::abs(h_output[i].x - h_output_resym[i].x); + auto idiff = std::abs(h_output[i].y - h_output_resym[i].y); maxdiff = std::max({maxdiff, rdiff, idiff}); } diff -Nru rocfft-5.5.0/clients/tests/hipGraph_test.cpp rocfft-5.7.1/clients/tests/hipGraph_test.cpp --- rocfft-5.5.0/clients/tests/hipGraph_test.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/hipGraph_test.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,7 @@ #include "../../shared/arithmetic.h" #include "../../shared/gpubuf.h" -#include "../rocfft_params.h" +#include "../../shared/rocfft_params.h" #include "accuracy_test.h" #include "rocfft.h" #include "rocfft_against_fftw.h" @@ -32,7 +32,7 @@ static const unsigned int KERNEL_THREADS = 64; -__global__ void scale_data_kernel(float2* data, size_t length, float scale) +__global__ void scale_data_kernel(rocfft_complex* data, size_t length, float scale) { const auto idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -44,7 +44,19 @@ } template -__global__ void offset_data_kernel(T* data, size_t length, T offset) +__global__ void offset_data_kernel_complex(T* data, size_t length, T offset) +{ + const auto idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(idx < length) + { + data[idx].x += offset.x; + data[idx].y += offset.y; + } +} + +template +__global__ void offset_data_kernel_real(T* data, size_t length, T offset) { const auto idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -54,10 +66,10 @@ } } -static void init_input_data(size_t N, - size_t seed, - std::vector& host_data, - gpubuf_t& device_data) +static void init_input_data(size_t N, + size_t seed, + std::vector>& host_data, + gpubuf_t>& device_data) { std::minstd_rand gen(seed); std::uniform_real_distribution dist(-1.0f, 1.0f); @@ -69,7 +81,7 @@ host_data[i].y = dist(gen); } - size_t Nbytes = N * sizeof(float2); + size_t Nbytes = N * sizeof(rocfft_complex); if(device_data.alloc(Nbytes) != hipSuccess) throw std::bad_alloc(); @@ -146,7 +158,8 @@ ASSERT_EQ(rocfft_execute(plan_inv, &in_ptr, &out_ptr, info), rocfft_status_success); } -static void scale_device_data(hipStream_t stream, float scale, size_t N, float2* data) +static void + scale_device_data(hipStream_t stream, float scale, size_t N, rocfft_complex* data) { auto blockSize = 
KERNEL_THREADS; auto numBlocks = DivRoundingUp(N, blockSize); @@ -161,11 +174,26 @@ } template -static void offset_device_data(hipStream_t stream, T offset, size_t N, T* data) +static void offset_device_data_real(hipStream_t stream, T offset, size_t N, T* data) +{ + auto blockSize = KERNEL_THREADS; + auto numBlocks = DivRoundingUp(N, blockSize); + hipLaunchKernelGGL(offset_data_kernel_real, + dim3(numBlocks), + dim3(blockSize), + 0, // sharedMemBytes + stream, // stream + data, + N, + offset); +} + +template +static void offset_device_data_complex(hipStream_t stream, T offset, size_t N, T* data) { auto blockSize = KERNEL_THREADS; auto numBlocks = DivRoundingUp(N, blockSize); - hipLaunchKernelGGL(offset_data_kernel, + hipLaunchKernelGGL(offset_data_kernel_complex, dim3(numBlocks), dim3(blockSize), 0, // sharedMemBytes @@ -196,34 +224,33 @@ ASSERT_EQ(host_data == host_data_compare, true); } -static void compare_data(const std::vector& original_host_data, - const gpubuf_t& modified_device_data) +static void compare_data(const std::vector>& original_host_data, + const gpubuf_t>& modified_device_data) { - std::vector modified_host_data(original_host_data.size()); + std::vector> modified_host_data(original_host_data.size()); // Copy result back to host ASSERT_EQ(hipMemcpy(modified_host_data.data(), modified_device_data.data(), - modified_host_data.size() * sizeof(float2), + modified_host_data.size() * sizeof(rocfft_complex), hipMemcpyDeviceToHost), hipSuccess); // Compare data we got to the original. // We're running 2 transforms (forward+inverse), so we // should tolerate 2x the error of a single transform. 
- std::vector> linf_failures; - const double MAX_TRANSFORM_ERROR = 2 * type_epsilon(); + const double MAX_TRANSFORM_ERROR = 2 * type_epsilon(); auto input_norm - = norm_complex(reinterpret_cast*>(original_host_data.data()), + = norm_complex(reinterpret_cast*>(original_host_data.data()), original_host_data.size(), 1, 1, original_host_data.size(), {0}); auto diff = distance_1to1_complex( - reinterpret_cast*>(original_host_data.data()), - reinterpret_cast*>(modified_host_data.data()), + reinterpret_cast*>(original_host_data.data()), + reinterpret_cast*>(modified_host_data.data()), // data is all contiguous, we can treat it as 1d original_host_data.size(), 1, @@ -231,7 +258,7 @@ original_host_data.size(), 1, modified_host_data.size(), - linf_failures, + nullptr, MAX_TRANSFORM_ERROR, {0}, {0}); @@ -253,26 +280,26 @@ size_t seed = 100; - auto offset_1 = float2(.1, .1); - auto offset_2 = float2(-.1, -.1); + auto offset_1 = rocfft_complex{.1, .1}; + auto offset_2 = rocfft_complex{-.1, -.1}; float scale = 2.2; float inv_scale = 1. 
/ scale; - auto output_init_val = float2(0., 0.); + auto output_init_val = rocfft_complex(0., 0.); size_t num_kernel_launches = 100; size_t num_graph_launches = 10; - gpubuf_t device_mem_in; - std::vector host_mem_in; + gpubuf_t> device_mem_in; + std::vector> host_mem_in; init_input_data(N, seed, host_mem_in, device_mem_in); - float2* in_ptr = static_cast(device_mem_in.data()); + rocfft_complex* in_ptr = static_cast*>(device_mem_in.data()); - gpubuf_t device_mem_out; - std::vector host_mem_out; - init_data(N, output_init_val, host_mem_out, device_mem_out); - float2* out_ptr = static_cast(device_mem_out.data()); + gpubuf_t> device_mem_out; + std::vector> host_mem_out; + init_data>(N, output_init_val, host_mem_out, device_mem_out); + rocfft_complex* out_ptr = static_cast*>(device_mem_out.data()); gpubuf_t device_mem_counter; std::vector host_mem_counter; @@ -297,11 +324,11 @@ // add offset to device input data for(size_t i = 0; i < num_kernel_launches; ++i) - offset_device_data(stream, offset_1, N, in_ptr); + offset_device_data_complex>(stream, offset_1, N, in_ptr); // back out the offsets for(size_t i = 0; i < num_kernel_launches; ++i) - offset_device_data(stream, offset_2, N, in_ptr); + offset_device_data_complex>(stream, offset_2, N, in_ptr); // scale the device input data scale_device_data(stream, scale, N, in_ptr); @@ -326,20 +353,20 @@ // add offset to device output data for(size_t i = 0; i < num_kernel_launches; ++i) - offset_device_data(stream, offset_1, N, out_ptr); + offset_device_data_complex>(stream, offset_1, N, out_ptr); // back out the offsets for(size_t i = 0; i < num_kernel_launches; ++i) - offset_device_data(stream, offset_2, N, out_ptr); + offset_device_data_complex>(stream, offset_2, N, out_ptr); // increment counter - offset_device_data(stream, 1, N, counter_ptr); + offset_device_data_real(stream, 1, N, counter_ptr); ASSERT_EQ(hipStreamEndCapture(stream, &graph), hipSuccess); // make sure no actual work has been done for // the captured stream 
before graph execution - compare_data_exact_match(other_stream, host_mem_out, device_mem_out); + compare_data_exact_match>(other_stream, host_mem_out, device_mem_out); ASSERT_EQ(hipGraphInstantiate(&graph_exec, graph, NULL, NULL, 0), hipSuccess); ASSERT_EQ(hipGraphDestroy(graph), hipSuccess); diff -Nru rocfft-5.5.0/clients/tests/multithread_test.cpp rocfft-5.7.1/clients/tests/multithread_test.cpp --- rocfft-5.5.0/clients/tests/multithread_test.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/multithread_test.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -19,7 +19,7 @@ // THE SOFTWARE. #include "../../shared/gpubuf.h" -#include "../rocfft_params.h" +#include "../../shared/rocfft_params.h" #include "accuracy_test.h" #include "rocfft.h" #include "rocfft_against_fftw.h" @@ -32,7 +32,7 @@ // normalize results of an inverse transform, so it can be directly // compared to the original data before the forward transform -__global__ void normalize_inverse_results(float2* array, float N) +__global__ void normalize_inverse_results(rocfft_complex* array, float N) { const int idx = blockIdx.x * blockDim.x + threadIdx.x; array[idx].x /= N; @@ -58,7 +58,7 @@ datasize *= N; } - size_t Nbytes = datasize * sizeof(float2); + size_t Nbytes = datasize * sizeof(rocfft_complex); // Create HIP device buffers if(device_mem_in.alloc(Nbytes) != hipSuccess) @@ -156,7 +156,7 @@ 1, 0, // sharedMemBytes stream, // stream - static_cast(device_mem_out.data()), + static_cast*>(device_mem_out.data()), static_cast(host_mem_out.size())); ran_transform = true; } @@ -190,26 +190,25 @@ ASSERT_EQ(hipMemcpy(host_mem_out.data(), device_mem_out.data(), - host_mem_out.size() * sizeof(float2), + host_mem_out.size() * sizeof(rocfft_complex), hipMemcpyDeviceToHost), hipSuccess); // Compare data we got to the original. // We're running 2 transforms (forward+inverse), so we // should tolerate 2x the error of a single transform. 
- std::vector> linf_failures; - const double MAX_TRANSFORM_ERROR = 2 * type_epsilon(); + const double MAX_TRANSFORM_ERROR = 2 * type_epsilon(); auto input_norm - = norm_complex(reinterpret_cast*>(host_mem_in.data()), + = norm_complex(reinterpret_cast*>(host_mem_in.data()), host_mem_in.size(), 1, 1, host_mem_in.size(), {0}); auto diff = distance_1to1_complex( - reinterpret_cast*>(host_mem_in.data()), - reinterpret_cast*>(host_mem_out.data()), + reinterpret_cast*>(host_mem_in.data()), + reinterpret_cast*>(host_mem_out.data()), // data is all contiguous, we can treat it as 1d host_mem_in.size(), 1, @@ -217,7 +216,7 @@ host_mem_in.size(), 1, host_mem_out.size(), - linf_failures, + nullptr, MAX_TRANSFORM_ERROR, {0}, {0}); @@ -236,18 +235,18 @@ { do_cleanup(); } - size_t N = 0; - size_t dim = 0; - uint32_t seed = 0; - hipStream_t stream = nullptr; - rocfft_plan plan = nullptr; - rocfft_plan plan_inv = nullptr; - size_t work_buffer_size = 0; - void* work_buffer = nullptr; - gpubuf device_mem_in; - gpubuf device_mem_out; - std::vector host_mem_in; - std::vector host_mem_out; + size_t N = 0; + size_t dim = 0; + uint32_t seed = 0; + hipStream_t stream = nullptr; + rocfft_plan plan = nullptr; + rocfft_plan plan_inv = nullptr; + size_t work_buffer_size = 0; + void* work_buffer = nullptr; + gpubuf device_mem_in; + gpubuf device_mem_out; + std::vector> host_mem_in; + std::vector> host_mem_out; // ensure that we don't forget to actually run the transform bool ran_transform = false; diff -Nru rocfft-5.5.0/clients/tests/random.cpp rocfft-5.7.1/clients/tests/random.cpp --- rocfft-5.5.0/clients/tests/random.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/random.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -22,7 +22,6 @@ #include #include -#include "../rocfft_params.h" #include "accuracy_test.h" #include "rocfft_accuracy_test.h" @@ -93,14 +92,14 @@ std::cout << "Params are not valid\n"; } - fft_vs_reference(params); + fft_vs_reference(params, true); } 
INSTANTIATE_TEST_SUITE_P(random_complex_1d, random_params, ::testing::Combine(::testing::Range(0, n_random_tests), ::testing::ValuesIn({1}), - ::testing::ValuesIn(precision_range), + ::testing::ValuesIn(precision_range_sp_dp), ::testing::ValuesIn(place_range), ::testing::ValuesIn(trans_type_range_complex))); @@ -108,7 +107,7 @@ random_params, ::testing::Combine(::testing::Range(0, n_random_tests), ::testing::ValuesIn({2}), - ::testing::ValuesIn(precision_range), + ::testing::ValuesIn(precision_range_sp_dp), ::testing::ValuesIn(place_range), ::testing::ValuesIn(trans_type_range_complex))); @@ -116,7 +115,7 @@ random_params, ::testing::Combine(::testing::Range(0, n_random_tests), ::testing::ValuesIn({3}), - ::testing::ValuesIn(precision_range), + ::testing::ValuesIn(precision_range_sp_dp), ::testing::ValuesIn(place_range), ::testing::ValuesIn(trans_type_range_complex))); @@ -124,7 +123,7 @@ random_params, ::testing::Combine(::testing::Range(0, n_random_tests), ::testing::ValuesIn({1}), - ::testing::ValuesIn(precision_range), + ::testing::ValuesIn(precision_range_sp_dp), ::testing::ValuesIn({fft_placement_notinplace}), ::testing::ValuesIn(trans_type_range_real))); @@ -132,7 +131,7 @@ random_params, ::testing::Combine(::testing::Range(0, n_random_tests), ::testing::ValuesIn({2}), - ::testing::ValuesIn(precision_range), + ::testing::ValuesIn(precision_range_sp_dp), ::testing::ValuesIn({fft_placement_notinplace}), ::testing::ValuesIn(trans_type_range_real))); @@ -140,6 +139,6 @@ random_params, ::testing::Combine(::testing::Range(0, n_random_tests), ::testing::ValuesIn({3}), - ::testing::ValuesIn(precision_range), + ::testing::ValuesIn(precision_range_sp_dp), ::testing::ValuesIn({fft_placement_notinplace}), ::testing::ValuesIn(trans_type_range_real))); diff -Nru rocfft-5.5.0/clients/tests/rocfft_accuracy_test.cpp rocfft-5.7.1/clients/tests/rocfft_accuracy_test.cpp --- rocfft-5.5.0/clients/tests/rocfft_accuracy_test.cpp 2023-01-31 06:20:16.000000000 +0000 +++ 
rocfft-5.7.1/clients/tests/rocfft_accuracy_test.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -25,22 +25,25 @@ #include #include +#include "rocfft_accuracy_test.h" + #include "../../shared/gpubuf.h" -#include "../rocfft_params.h" #include "fftw_transform.h" #include "rocfft.h" -#include "rocfft_accuracy_test.h" #include "rocfft_against_fftw.h" -void fft_vs_reference(rocfft_params& params) +void fft_vs_reference(rocfft_params& params, bool round_trip) { switch(params.precision) { + case fft_precision_half: + fft_vs_reference_impl<_Float16, rocfft_params>(params, round_trip); + break; case fft_precision_single: - fft_vs_reference_impl(params); + fft_vs_reference_impl(params, round_trip); break; case fft_precision_double: - fft_vs_reference_impl(params); + fft_vs_reference_impl(params, round_trip); break; } } @@ -68,6 +71,6 @@ GTEST_SKIP(); } - fft_vs_reference(params); + fft_vs_reference(params, true); SUCCEED(); } diff -Nru rocfft-5.5.0/clients/tests/rocfft_accuracy_test.h rocfft-5.7.1/clients/tests/rocfft_accuracy_test.h --- rocfft-5.5.0/clients/tests/rocfft_accuracy_test.h 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/rocfft_accuracy_test.h 2023-08-09 16:19:51.000000000 +0000 @@ -18,15 +18,12 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
-#pragma once - #ifndef ROCFFT_ACCURACY_TEST #define ROCFFT_ACCURACY_TEST -#include "../rocfft_params.h" +#include "../../shared/rocfft_params.h" #include "accuracy_test.h" -#include "rocfft_accuracy_test.h" -void fft_vs_reference(rocfft_params& params); +void fft_vs_reference(rocfft_params& params, bool round_trip = false); #endif diff -Nru rocfft-5.5.0/clients/tests/rocfft_against_fftw.h rocfft-5.7.1/clients/tests/rocfft_against_fftw.h --- rocfft-5.5.0/clients/tests/rocfft_against_fftw.h 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/rocfft_against_fftw.h 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -107,17 +107,17 @@ // construct an FFTW plan, given rocFFT parameters. output is // required if planning with wisdom. 
-template +template static typename fftw_trait::fftw_plan_type - fftw_plan_via_rocfft(const std::vector& length, - const std::vector& istride, - const std::vector& ostride, - const size_t nbatch, - const size_t idist, - const size_t odist, - const fft_transform_type transformType, - std::vector>& input, - std::vector>& output) + fftw_plan_via_rocfft(const std::vector& length, + const std::vector& istride, + const std::vector& ostride, + const size_t nbatch, + const size_t idist, + const size_t odist, + const fft_transform_type transformType, + std::vector& input, + std::vector& output) { // Dimension configuration: std::vector dims(length.size()); @@ -145,39 +145,29 @@ template void fftw_run(fft_transform_type transformType, typename fftw_trait::fftw_plan_type cpu_plan, - void* cpu_in, - void* cpu_out) + std::vector& cpu_in, + std::vector& cpu_out) { - using fftw_complex_type = typename fftw_trait::fftw_complex_type; - switch(transformType) { case fft_transform_type_complex_forward: { - fftw_plan_execute_c2c(cpu_plan, - reinterpret_cast(cpu_in), - reinterpret_cast(cpu_out)); + fftw_plan_execute_c2c(cpu_plan, cpu_in, cpu_out); break; } case fft_transform_type_complex_inverse: { - fftw_plan_execute_c2c(cpu_plan, - reinterpret_cast(cpu_in), - reinterpret_cast(cpu_out)); + fftw_plan_execute_c2c(cpu_plan, cpu_in, cpu_out); break; } case fft_transform_type_real_forward: { - fftw_plan_execute_r2c(cpu_plan, - reinterpret_cast(cpu_in), - reinterpret_cast(cpu_out)); + fftw_plan_execute_r2c(cpu_plan, cpu_in, cpu_out); break; } case fft_transform_type_real_inverse: { - fftw_plan_execute_c2r(cpu_plan, - reinterpret_cast(cpu_in), - reinterpret_cast(cpu_out)); + fftw_plan_execute_c2r(cpu_plan, cpu_in, cpu_out); break; } } @@ -224,6 +214,9 @@ { switch(precision) { + case fft_precision_half: + return type_epsilon<_Float16>(); + break; case fft_precision_single: return type_epsilon(); break; @@ -232,7 +225,6 @@ break; default: throw std::runtime_error("Invalid precision"); - return 
0.0; } } diff -Nru rocfft-5.5.0/clients/tests/rtc_helper_crash.cpp rocfft-5.7.1/clients/tests/rtc_helper_crash.cpp --- rocfft-5.5.0/clients/tests/rtc_helper_crash.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/rtc_helper_crash.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -1,3 +1,23 @@ +// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + // just crash int main() { diff -Nru rocfft-5.5.0/clients/tests/test_params.h rocfft-5.7.1/clients/tests/test_params.h --- rocfft-5.5.0/clients/tests/test_params.h 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/test_params.h 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -27,14 +27,25 @@ #include extern int verbose; -extern size_t random_seed; extern size_t ramgb; +extern size_t vramgb; + +extern size_t random_seed; +extern double planar_prob; +extern double callback_prob; + +extern double half_epsilon; extern double single_epsilon; extern double double_epsilon; +extern bool skip_runtime_fails; extern double max_linf_eps_double; extern double max_l2_eps_double; extern double max_linf_eps_single; extern double max_l2_eps_single; +extern double max_linf_eps_half; +extern double max_l2_eps_half; + +extern int n_hip_failures; #endif diff -Nru rocfft-5.5.0/clients/tests/unit_test.cpp rocfft-5.7.1/clients/tests/unit_test.cpp --- rocfft-5.5.0/clients/tests/unit_test.cpp 2023-01-31 06:20:16.000000000 +0000 +++ rocfft-5.7.1/clients/tests/unit_test.cpp 2023-08-09 16:19:51.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -22,8 +22,8 @@ #include "../../shared/environment.h" #include "../../shared/gpubuf.h" +#include "../../shared/rocfft_complex.h" #include "hip/hip_runtime_api.h" -#include "hip/hip_vector_types.h" #include #include #include @@ -177,7 +177,7 @@ size_t requested_work_size = 0; ASSERT_EQ(rocfft_plan_get_work_buffer_size(plan, &requested_work_size), rocfft_status_success); - ASSERT_GT(requested_work_size, 0); + ASSERT_GT(requested_work_size, 0U); rocfft_execution_info info; ASSERT_EQ(rocfft_execution_info_create(&info), rocfft_status_success); @@ -311,8 +311,8 @@ rocfft_plan_destroy(plan); plan = nullptr; }; - // check the RTC log to see if a kernel got compiled - auto kernel_was_compiled = [&]() { + // check the RTC log to see if an FFT kernel got compiled + auto fft_kernel_was_compiled = [&]() { // HACK: logging is done in a worker thread, so sleep for a // bit to give it a chance to actually write. It at least // should flush after writing. 
@@ -320,9 +320,10 @@ // look for a ROCFFT_RTC_BEGIN line that indicates RTC happened std::ifstream logfile(rtc_log_path); std::string line; - while(logfile >> line) + while(std::getline(logfile, line)) { - if(line.find("ROCFFT_RTC_BEGIN") != std::string::npos) + if(line.find("ROCFFT_RTC_BEGIN") != std::string::npos + && line.find("fft_") != std::string::npos) return true; } return false; @@ -334,7 +335,7 @@ ASSERT_EQ(rocfft_cache_serialize(&onekernel_cache, &onekernel_cache_bytes), rocfft_status_success); rocfft_cleanup(); - ASSERT_TRUE(kernel_was_compiled()); + ASSERT_TRUE(fft_kernel_was_compiled()); // serialized cache should be bigger than empty cache ASSERT_GT(onekernel_cache_bytes, empty_cache_bytes); @@ -349,7 +350,7 @@ ASSERT_EQ(rocfft_cache_serialize(&onekernel_cache, &onekernel_cache_bytes), rocfft_status_success); rocfft_cleanup(); - ASSERT_TRUE(kernel_was_compiled()); + ASSERT_TRUE(fft_kernel_was_compiled()); ASSERT_GT(onekernel_cache_bytes, empty_cache_bytes); // re-init library without blowing away cache. rebuild plan and @@ -357,7 +358,7 @@ rocfft_setup(); build_plan(); rocfft_cleanup(); - ASSERT_FALSE(kernel_was_compiled()); + ASSERT_FALSE(fft_kernel_was_compiled()); // blow away cache again, deserialize one-kernel cache. re-init // library and rebuild plan - kernel should again not be @@ -367,12 +368,12 @@ ASSERT_EQ(rocfft_cache_deserialize(onekernel_cache, onekernel_cache_bytes), rocfft_status_success); rocfft_cleanup(); - ASSERT_FALSE(kernel_was_compiled()); + ASSERT_FALSE(fft_kernel_was_compiled()); rocfft_setup(); build_plan(); rocfft_cleanup(); - ASSERT_FALSE(kernel_was_compiled()); + ASSERT_FALSE(fft_kernel_was_compiled()); // use the cache as a system cache and make the user one an empty // in-memory cache. kernel should still not be recompiled. 
@@ -381,7 +382,7 @@ rocfft_setup(); build_plan(); rocfft_cleanup(); - ASSERT_FALSE(kernel_was_compiled()); + ASSERT_FALSE(fft_kernel_was_compiled()); // check that the system cache is not written to, even if it's // writable by the current user. after removing the cache, the @@ -391,11 +392,11 @@ rocfft_setup(); build_plan(); rocfft_cleanup(); - ASSERT_TRUE(kernel_was_compiled()); + ASSERT_TRUE(fft_kernel_was_compiled()); rocfft_setup(); build_plan(); rocfft_cleanup(); - ASSERT_TRUE(kernel_was_compiled()); + ASSERT_TRUE(fft_kernel_was_compiled()); } // make sure cache API functions tolerate null pointers without crashing @@ -443,8 +444,8 @@ nullptr)); // alloc a complex buffer - gpubuf_t data; - ASSERT_EQ(data.alloc(RTC_PROBLEM_SIZE * sizeof(float2)), hipSuccess); + gpubuf_t> data; + ASSERT_EQ(data.alloc(RTC_PROBLEM_SIZE * sizeof(rocfft_complex)), hipSuccess); std::vector ibuffers(1, static_cast(data.data())); diff -Nru rocfft-5.5.0/debian/changelog rocfft-5.7.1/debian/changelog --- rocfft-5.5.0/debian/changelog 2023-12-19 14:59:07.000000000 +0000 +++ rocfft-5.7.1/debian/changelog 2024-03-12 17:15:10.000000000 +0000 @@ -1,8 +1,46 @@ -rocfft (5.5.0-6build1) noble; urgency=medium +rocfft (5.7.1-1) unstable; urgency=medium - * No-change rebuild for boost defaults change. + * Migrate to unstable + * New upstream version includes fix for LDS over-allocation + (Closes: #1057251) - -- Matthias Klose Tue, 19 Dec 2023 15:59:07 +0100 + -- Cordell Bloor Tue, 12 Mar 2024 11:15:10 -0600 + +rocfft (5.7.1-1~exp2) experimental; urgency=medium + + * d/patches: Add missing DEP-3 headers + * d/control: Re-add accidentally removed B-D libhiprand-dev. + Fixes a FTBFS. + * symbols: Strip Debian revision + + -- Christian Kastner Sat, 02 Mar 2024 21:12:10 +0100 + +rocfft (5.7.1-1~exp1) experimental; urgency=medium + + * New upstream version. 
+ - Update symbols file + - Refresh patches + - Add patch use-readthedocs-theme.patch + Restores documentation build using the simpler approach from a previous + version + * d/rules: + - Add gfx1100, gfx1101 and gfx1102 build targets + - Drop patchelf --remove-rpath from build rules + - Automate handling of rocFFT version string + * d/control: + - Constrain versions for clang-17 + - Drop unused rocminfo package + - Switch B-D from librocrand-dev to libhiprand-dev + - Add support for the 'nocheck' build profile + - Temporarily B-D on libamdhip64-dev >= 5.6.1 + Until either bin:hipcc is fixed or it is determined that a direct + dependency on libamdhip64-dev is the right thing to do. + * Upstream URL has changed + * autopkgtest: Export dmesg and other info as artifacts + * Bump copyrights + * dbgsym: Disable dwz and switch to compressed DWARF-5 + + -- Christian Kastner Fri, 01 Mar 2024 23:23:05 +0100 rocfft (5.5.0-6) unstable; urgency=medium diff -Nru rocfft-5.5.0/debian/clean rocfft-5.7.1/debian/clean --- rocfft-5.5.0/debian/clean 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/clean 2024-03-12 17:13:18.000000000 +0000 @@ -1,2 +1,2 @@ -docs/docBin/ +docs/.doxygen/docBin/ html/ diff -Nru rocfft-5.5.0/debian/control rocfft-5.7.1/debian/control --- rocfft-5.5.0/debian/control 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/control 2024-03-12 17:13:18.000000000 +0000 @@ -1,6 +1,6 @@ Source: rocfft Section: devel -Homepage: https://github.com/rocmsoftwareplatform/rocfft +Homepage: https://github.com/ROCm/rocfft Priority: optional Standards-Version: 4.6.2 Vcs-Git: https://salsa.debian.org/rocm-team/rocfft.git @@ -11,18 +11,19 @@ Christian Kastner , Build-Depends: debhelper-compat (= 13), cmake, - hipcc, - libamd-comgr-dev, - libhsa-runtime-dev, - rocminfo, - patchelf, + hipcc (>= 5.6.1~), +# ckk 2024-03-02: temporary until hipcc question is resolved: + libamdhip64-dev (>= 5.6.1~), +# end + libamd-comgr-dev (>= 6.0~), + libhsa-runtime-dev (>= 
5.7.1~), rocm-cmake (>= 5.3.0), python3-dev, libsqlite3-dev, - librocrand-dev, + libhiprand-dev, libboost-program-options-dev, libfftw3-dev, - libgtest-dev, + libgtest-dev , Build-Depends-Indep: dh-sequence-sphinxdoc , doxygen , python3-breathe , @@ -67,6 +68,7 @@ Package: librocfft0-tests Section: libdevel Architecture: amd64 arm64 ppc64el +Build-Profiles: Depends: librocfft0 (= ${binary:Version}),${misc:Depends}, ${shlibs:Depends}, Description: ROCm library for computing Fast Fourier Transforms - tests rocFFT is a library for computing the discrete Fourier transform. It is diff -Nru rocfft-5.5.0/debian/copyright rocfft-5.7.1/debian/copyright --- rocfft-5.5.0/debian/copyright 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/copyright 2024-03-12 17:13:18.000000000 +0000 @@ -9,6 +9,7 @@ Files: debian/* Copyright: 2022, Maxime Chambonnet 2022-2023, Cordell Bloor + 2024, Christian Kastner License: Expat License: Expat diff -Nru rocfft-5.5.0/debian/librocfft0.install rocfft-5.7.1/debian/librocfft0.install --- rocfft-5.5.0/debian/librocfft0.install 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/librocfft0.install 2024-03-12 17:13:18.000000000 +0000 @@ -1,2 +1,2 @@ usr/lib/*/librocfft.so.* -usr/lib/*/rocfft/1.0.21/rocfft_rtc_helper +usr/lib/*/rocfft/1.0.23/rocfft_rtc_helper diff -Nru rocfft-5.5.0/debian/librocfft0.symbols.amd64 rocfft-5.7.1/debian/librocfft0.symbols.amd64 --- rocfft-5.5.0/debian/librocfft0.symbols.amd64 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/librocfft0.symbols.amd64 2024-03-12 17:13:18.000000000 +0000 @@ -1,96 +1,215 @@ librocfft.so.0 librocfft0 #MINVER# * Build-Depends-Package: librocfft-dev - (optional)_Z24GenerateHalfNTableKernelI15HIP_vector_typeIdLj2EEEvmmPT_@Base 5.5.0 - (optional)_Z24GenerateHalfNTableKernelI15HIP_vector_typeIfLj2EEEvmmPT_@Base 5.5.0 - (optional)_Z26GenerateTwiddleTableKernelI15HIP_vector_typeIdLj2EEEvmm9radices_tS2_S2_PT_@Base 5.5.0 - 
(optional)_Z26GenerateTwiddleTableKernelI15HIP_vector_typeIdLj2EEEvmmPT_@Base 5.5.0 - (optional)_Z26GenerateTwiddleTableKernelI15HIP_vector_typeIfLj2EEEvmm9radices_tS2_S2_PT_@Base 5.5.0 - (optional)_Z26GenerateTwiddleTableKernelI15HIP_vector_typeIfLj2EEEvmmPT_@Base 5.5.0 - (optional)_Z31GenerateTwiddleTableLargeKernelI15HIP_vector_typeIdLj2EEEvdmmmPT_@Base 5.5.0 - (optional)_Z31GenerateTwiddleTableLargeKernelI15HIP_vector_typeIfLj2EEEvdmmmPT_@Base 5.5.0 - (optional)_ZN13function_poolC1Ev@Base 5.5.0 - (optional)_ZN13function_poolC2Ev@Base 5.5.0 + (optional)_ZGVZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb0EEclEcE5__nul@Base 5.7.1 + (optional)_ZGVZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb1EEclEcE5__nul@Base 5.7.1 + (optional)_ZGVZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb0EEclEcE5__nul@Base 5.7.1 + (optional)_ZGVZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb1EEclEcE5__nul@Base 5.7.1 (optional)_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z@Base 5.5.0 - (optional)_ZNSt10_HashtableISt5tupleIJSt5arrayImLm2EE18rocfft_precision_e13ComputeScheme19SBRC_TRANSPOSE_TYPEEESt4pairIKS6_9FFTKernelESaISA_ENSt8__detail10_Select1stESt8equal_toIS6_E10SimpleHashNSC_18_Mod_range_hashingENSC_20_Default_ranged_hashENSC_20_Prime_rehash_policyENSC_17_Hashtable_traitsILb0ELb0ELb1EEEE10_M_emplaceIJS6_S9_EEES7_INSC_14_Node_iteratorISA_Lb0ELb0EEEbESt17integral_constantIbLb1EEDpOT_@Base 5.5.0 - (optional)_ZNSt10_HashtableISt5tupleIJSt5arrayImLm2EE18rocfft_precision_e13ComputeScheme19SBRC_TRANSPOSE_TYPEEESt4pairIKS6_9FFTKernelESaISA_ENSt8__detail10_Select1stESt8equal_toIS6_E10SimpleHashNSC_18_Mod_range_hashingENSC_20_Default_ranged_hashENSC_20_Prime_rehash_policyENSC_17_Hashtable_traitsILb0ELb0ELb1EEEE12_Scoped_nodeD2Ev@Base 5.5.0 - 
(optional)_ZNSt10_HashtableISt5tupleIJSt5arrayImLm2EE18rocfft_precision_e13ComputeScheme19SBRC_TRANSPOSE_TYPEEESt4pairIKS6_9FFTKernelESaISA_ENSt8__detail10_Select1stESt8equal_toIS6_E10SimpleHashNSC_18_Mod_range_hashingENSC_20_Default_ranged_hashENSC_20_Prime_rehash_policyENSC_17_Hashtable_traitsILb0ELb0ELb1EEEE13_M_rehash_auxEmSt17integral_constantIbLb1EE@Base 5.5.0 - (optional)_ZNSt10_HashtableISt5tupleIJSt5arrayImLm2EE18rocfft_precision_e13ComputeScheme19SBRC_TRANSPOSE_TYPEEESt4pairIKS6_9FFTKernelESaISA_ENSt8__detail10_Select1stESt8equal_toIS6_E10SimpleHashNSC_18_Mod_range_hashingENSC_20_Default_ranged_hashENSC_20_Prime_rehash_policyENSC_17_Hashtable_traitsILb0ELb0ELb1EEEE21_M_insert_unique_nodeEmmPNSC_10_Hash_nodeISA_Lb0EEEm@Base 5.5.0 + (optional)_ZNKSt7__cxx1112regex_traitsIcE16lookup_classnameIPKcEENS1_10_RegexMaskET_S6_b@Base 5.7.1 + (optional)_ZNKSt7__cxx1112regex_traitsIcE18lookup_collatenameIPKcEENS_12basic_stringIcSt11char_traitsIcESaIcEEET_SA_@Base 5.7.1 + (optional)_ZNKSt7__cxx1112regex_traitsIcE5valueEci@Base 5.7.1 + (optional)_ZNKSt7__cxx1114regex_iteratorIN9__gnu_cxx17__normal_iteratorIPKcNS_12basic_stringIcSt11char_traitsIcESaIcEEEEEcNS_12regex_traitsIcEEEeqERKSD_@Base 5.7.1 + (optional)_ZNKSt7__cxx1120regex_token_iteratorIN9__gnu_cxx17__normal_iteratorIPKcNS_12basic_stringIcSt11char_traitsIcESaIcEEEEEcNS_12regex_traitsIcEEEeqERKSD_@Base 5.7.1 + (optional)_ZNKSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE16_M_word_boundaryEv@Base 5.7.1 + (optional)_ZNKSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb1EE16_M_word_boundaryEv@Base 5.7.1 
(optional)_ZNSt10_HashtableImSt4pairIKmPKcESaIS4_ENSt8__detail10_Select1stESt8equal_toImESt4hashImENS6_18_Mod_range_hashingENS6_20_Default_ranged_hashENS6_20_Prime_rehash_policyENS6_17_Hashtable_traitsILb0ELb0ELb1EEEE13_M_rehash_auxEmSt17integral_constantIbLb1EE@Base 5.5.0 (optional)_ZNSt10_HashtableImSt4pairIKmPKcESaIS4_ENSt8__detail10_Select1stESt8equal_toImESt4hashImENS6_18_Mod_range_hashingENS6_20_Default_ranged_hashENS6_20_Prime_rehash_policyENS6_17_Hashtable_traitsILb0ELb0ELb1EEEE16_M_insert_uniqueIRS1_RKS4_NS6_10_AllocNodeISaINS6_10_Hash_nodeIS4_Lb0EEEEEEEES0_INS6_14_Node_iteratorIS4_Lb0ELb0EEEbEOT_OT0_RKT1_@Base 5.5.0 (optional)_ZNSt10_HashtableImSt4pairIKmPKcESaIS4_ENSt8__detail10_Select1stESt8equal_toImESt4hashImENS6_18_Mod_range_hashingENS6_20_Default_ranged_hashENS6_20_Prime_rehash_policyENS6_17_Hashtable_traitsILb0ELb0ELb1EEEE21_M_insert_unique_nodeEmmPNS6_10_Hash_nodeIS4_Lb0EEEm@Base 5.5.0 (optional)_ZNSt10_HashtableImSt4pairIKmPKcESaIS4_ENSt8__detail10_Select1stESt8equal_toImESt4hashImENS6_18_Mod_range_hashingENS6_20_Default_ranged_hashENS6_20_Prime_rehash_policyENS6_17_Hashtable_traitsILb0ELb0ELb1EEEEC2IPKS4_EET_SL_mRKSB_RKS9_RKS5_St17integral_constantIbLb1EE@Base 5.5.0 (optional)_ZNSt10_HashtableImSt4pairIKmPKcESaIS4_ENSt8__detail10_Select1stESt8equal_toImESt4hashImENS6_18_Mod_range_hashingENS6_20_Default_ranged_hashENS6_20_Prime_rehash_policyENS6_17_Hashtable_traitsILb0ELb0ELb1EEEED2Ev@Base 5.5.0 (optional)_ZNSt10filesystem7__cxx11dvERKNS0_4pathES3_@Base 5.5.0 - (optional)_ZNSt13unordered_mapISt5tupleIJSt5arrayImLm2EE18rocfft_precision_e13ComputeScheme19SBRC_TRANSPOSE_TYPEEE9FFTKernel10SimpleHashSt8equal_toIS6_ESaISt4pairIKS6_S7_EEED2Ev@Base 5.5.0 + (optional)_ZNSt11_Deque_baseINSt8__detail9_StateSeqINSt7__cxx1112regex_traitsIcEEEESaIS5_EE17_M_initialize_mapEm@Base 5.7.1 + (optional)_ZNSt11_Deque_baseIlSaIlEE17_M_initialize_mapEm@Base 5.7.1 (optional)_ZNSt15__exception_ptr12__dest_thunkISt12future_errorEEvPv@Base 5.5.0 + 
(optional)_ZNSt5dequeINSt8__detail9_StateSeqINSt7__cxx1112regex_traitsIcEEEESaIS5_EE16_M_push_back_auxIJRKS5_EEEvDpOT_@Base 5.7.1 + (optional)_ZNSt5dequeINSt8__detail9_StateSeqINSt7__cxx1112regex_traitsIcEEEESaIS5_EE16_M_push_back_auxIJS5_EEEvDpOT_@Base 5.7.1 + (optional)_ZNSt5dequeINSt8__detail9_StateSeqINSt7__cxx1112regex_traitsIcEEEESaIS5_EE17_M_reallocate_mapEmb@Base 5.7.1 + (optional)_ZNSt5dequeIlSaIlEE16_M_push_back_auxIJRKlEEEvDpOT_@Base 5.7.1 + (optional)_ZNSt5dequeIlSaIlEE17_M_reallocate_mapEmb@Base 5.7.1 (optional)_ZNSt6vectorINSt10filesystem7__cxx114pathESaIS2_EE17_M_realloc_insertIJRKS2_EEEvN9__gnu_cxx17__normal_iteratorIPS2_S4_EEDpOT_@Base 5.5.0 (optional)_ZNSt6vectorINSt10filesystem7__cxx114pathESaIS2_EE17_M_realloc_insertIJS2_EEEvN9__gnu_cxx17__normal_iteratorIPS2_S4_EEDpOT_@Base 5.5.0 + (optional)_ZNSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaIS5_EE17_M_default_appendEm@Base 5.7.1 + (optional)_ZNSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaIS5_EE17_M_realloc_insertIJRKS5_EEEvN9__gnu_cxx17__normal_iteratorIPS5_S7_EEDpOT_@Base 5.7.1 (optional)_ZNSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaIS5_EE17_M_realloc_insertIJS5_EEEvN9__gnu_cxx17__normal_iteratorIPS5_S7_EEDpOT_@Base 5.5.0 (optional)_ZNSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaIS5_EEaSERKS7_@Base 5.5.0 - (optional)_ZNSt6vectorIS_IcSaIcEESaIS1_EE17_M_default_appendEm@Base 5.5.0 + (optional)_ZNSt6vectorINSt7__cxx119sub_matchIN9__gnu_cxx17__normal_iteratorIPKcNS0_12basic_stringIcSt11char_traitsIcESaIcEEEEEEESaISC_EE14_M_fill_assignEmRKSC_@Base 5.7.1 + (optional)_ZNSt6vectorINSt7__cxx119sub_matchIN9__gnu_cxx17__normal_iteratorIPKcNS0_12basic_stringIcSt11char_traitsIcESaIcEEEEEEESaISC_EEaSERKSE_@Base 5.7.1 + (optional)_ZNSt6vectorINSt8__detail6_StateIcEESaIS2_EE17_M_realloc_insertIJS2_EEEvN9__gnu_cxx17__normal_iteratorIPS2_S4_EEDpOT_@Base 5.7.1 + 
(optional)_ZNSt6vectorIS_ImSaImEESaIS1_EE17_M_realloc_insertIJRKS1_EEEvN9__gnu_cxx17__normal_iteratorIPS1_S3_EEDpOT_@Base 5.7.1 + (optional)_ZNSt6vectorISt3setINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESt4lessIS6_ESaIS6_EESaISA_EE17_M_default_appendEm@Base 5.7.1 + (optional)_ZNSt6vectorISt4pairINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES6_ESaIS7_EE17_M_realloc_insertIJS7_EEEvN9__gnu_cxx17__normal_iteratorIPS7_S9_EEDpOT_@Base 5.7.1 + (optional)_ZNSt6vectorISt4pairIlS_INSt7__cxx119sub_matchIN9__gnu_cxx17__normal_iteratorIPKcNS1_12basic_stringIcSt11char_traitsIcESaIcEEEEEEESaISD_EEESaISG_EE17_M_realloc_insertIJRlRKSF_EEEvNS4_IPSG_SI_EEDpOT_@Base 5.7.1 + (optional)_ZNSt6vectorIbSaIbEE14_M_fill_insertESt13_Bit_iteratormb@Base 5.7.1 (optional)_ZNSt6vectorIcSaIcEE17_M_default_appendEm@Base 5.5.0 + (optional)_ZNSt6vectorIdSaIdEE17_M_default_appendEm@Base 5.7.1 + (optional)_ZNSt6vectorIiSaIiEE17_M_default_appendEm@Base 5.7.1 + (optional)_ZNSt6vectorIiSaIiEEaSERKS1_@Base 5.7.1 (optional)_ZNSt6vectorIjSaIjEE15_M_range_insertIN9__gnu_cxx17__normal_iteratorIPjS1_EEEEvS6_T_S7_St20forward_iterator_tag@Base 5.5.0 + (optional)_ZNSt6vectorIjSaIjEEaSERKS1_@Base 5.7.1 (optional)_ZNSt6vectorImSaImEE13_M_assign_auxIPKmEEvT_S5_St20forward_iterator_tag@Base 5.5.0 (optional)_ZNSt6vectorImSaImEE15_M_range_insertIN9__gnu_cxx17__normal_iteratorIPKmS1_EEEEvNS4_IPmS1_EET_SA_St20forward_iterator_tag@Base 5.5.0 + (optional)_ZNSt6vectorImSaImEE15_M_range_insertIN9__gnu_cxx17__normal_iteratorIPmS1_EEEEvS6_T_S7_St20forward_iterator_tag@Base 5.7.1 (optional)_ZNSt6vectorImSaImEE17_M_default_appendEm@Base 5.5.0 (optional)_ZNSt6vectorImSaImEEaSERKS1_@Base 5.5.0 + (optional)_ZNSt7__cxx1114regex_iteratorIN9__gnu_cxx17__normal_iteratorIPKcNS_12basic_stringIcSt11char_traitsIcESaIcEEEEEcNS_12regex_traitsIcEEEppEv@Base 5.7.1 + 
(optional)_ZNSt7__cxx1120regex_token_iteratorIN9__gnu_cxx17__normal_iteratorIPKcNS_12basic_stringIcSt11char_traitsIcESaIcEEEEEcNS_12regex_traitsIcEEE7_M_initESA_SA_@Base 5.7.1 + (optional)_ZNSt7__cxx1120regex_token_iteratorIN9__gnu_cxx17__normal_iteratorIPKcNS_12basic_stringIcSt11char_traitsIcESaIcEEEEEcNS_12regex_traitsIcEEEaSERKSD_@Base 5.7.1 + (optional)_ZNSt7__cxx1120regex_token_iteratorIN9__gnu_cxx17__normal_iteratorIPKcNS_12basic_stringIcSt11char_traitsIcESaIcEEEEEcNS_12regex_traitsIcEEEppEv@Base 5.7.1 (optional)_ZNSt7__cxx119to_stringEi@Base 5.5.0 + (optional)_ZNSt7__cxx119to_stringEj@Base 5.7.1 (optional)_ZNSt7__cxx119to_stringEm@Base 5.5.0 (optional)_ZNSt8_Rb_treeINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES5_St9_IdentityIS5_ESt4lessIS5_ESaIS5_EE16_M_insert_uniqueIRKS5_EESt4pairISt17_Rb_tree_iteratorIS5_EbEOT_@Base 5.5.0 (optional)_ZNSt8_Rb_treeINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES5_St9_IdentityIS5_ESt4lessIS5_ESaIS5_EE24_M_get_insert_unique_posERKS5_@Base 5.5.0 (optional)_ZNSt8_Rb_treeINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES5_St9_IdentityIS5_ESt4lessIS5_ESaIS5_EE8_M_eraseEPSt13_Rb_tree_nodeIS5_E@Base 5.5.0 + (optional)_ZNSt8_Rb_treeISt6vectorImSaImEES2_St9_IdentityIS2_ESt4lessIS2_ESaIS2_EE16_M_insert_uniqueIRKS2_EESt4pairISt17_Rb_tree_iteratorIS2_EbEOT_@Base 5.7.1 + (optional)_ZNSt8_Rb_treeISt6vectorImSaImEES2_St9_IdentityIS2_ESt4lessIS2_ESaIS2_EE16_M_insert_uniqueIS2_EESt4pairISt17_Rb_tree_iteratorIS2_EbEOT_@Base 5.7.1 + (optional)_ZNSt8_Rb_treeISt6vectorImSaImEES2_St9_IdentityIS2_ESt4lessIS2_ESaIS2_EE24_M_get_insert_unique_posERKS2_@Base 5.7.1 + (optional)_ZNSt8_Rb_treeISt6vectorImSaImEES2_St9_IdentityIS2_ESt4lessIS2_ESaIS2_EE29_M_get_insert_hint_unique_posESt23_Rb_tree_const_iteratorIS2_ERKS2_@Base 5.7.1 + (optional)_ZNSt8_Rb_treeISt6vectorImSaImEES2_St9_IdentityIS2_ESt4lessIS2_ESaIS2_EE8_M_eraseEPSt13_Rb_tree_nodeIS2_E@Base 5.7.1 + 
(optional)_ZNSt8_Rb_treeIlSt4pairIKllESt10_Select1stIS2_ESt4lessIlESaIS2_EE29_M_get_insert_hint_unique_posESt23_Rb_tree_const_iteratorIS2_ERS1_@Base 5.7.1 + (optional)_ZNSt8_Rb_treeIlSt4pairIKllESt10_Select1stIS2_ESt4lessIlESaIS2_EE8_M_eraseEPSt13_Rb_tree_nodeIS2_E@Base 5.7.1 (optional)_ZNSt8_Rb_treeImSt4pairIKmmESt10_Select1stIS2_ESt4lessImESaIS2_EE8_M_eraseEPSt13_Rb_tree_nodeIS2_E@Base 5.5.0 + (optional)_ZNSt8_Rb_treeImmSt9_IdentityImESt4lessImESaImEE5eraseERKm@Base 5.7.1 (optional)_ZNSt8_Rb_treeImmSt9_IdentityImESt4lessImESaImEE7_M_copyILb0ENS5_11_Alloc_nodeEEEPSt13_Rb_tree_nodeImESA_PSt18_Rb_tree_node_baseRT0_@Base 5.5.0 (optional)_ZNSt8_Rb_treeImmSt9_IdentityImESt4lessImESaImEE8_M_eraseEPSt13_Rb_tree_nodeImE@Base 5.5.0 - (optional)_ZNSt8__detail16_Hashtable_allocISaINS_10_Hash_nodeISt4pairIKSt5tupleIJSt5arrayImLm2EE18rocfft_precision_e13ComputeScheme19SBRC_TRANSPOSE_TYPEEE9FFTKernelELb0EEEEE16_M_allocate_nodeIJS9_SB_EEEPSD_DpOT_@Base 5.5.0 - (optional)_ZSt11__make_heapIN9__gnu_cxx17__normal_iteratorIPjSt6vectorIjSaIjEEEENS0_5__ops15_Iter_less_iterEEvT_S9_RT0_@Base 5.5.0 - (optional)_ZSt11__make_heapIN9__gnu_cxx17__normal_iteratorIPmSt6vectorImSaImEEEENS0_5__ops15_Iter_comp_iterISt7greaterImEEEEvT_SC_RT0_@Base 5.5.0 + (optional)_ZNSt8__detail17__regex_algo_implIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEEcNS5_12regex_traitsIcEEEEbT_SH_RNS5_13match_resultsISH_T0_EERKNS5_11basic_regexIT1_T2_EENSt15regex_constants15match_flag_typeENS_20_RegexExecutorPolicyEb@Base 5.7.1 + (optional)_ZNSt8__detail4_NFAINSt7__cxx1112regex_traitsIcEEE17_M_insert_backrefEm@Base 5.7.1 + (optional)_ZNSt8__detail8_ScannerIcE12_M_eat_classEc@Base 5.7.1 + (optional)_ZNSt8__detail8_ScannerIcE14_M_scan_normalEv@Base 5.7.1 + (optional)_ZNSt8__detail8_ScannerIcE16_M_scan_in_braceEv@Base 5.7.1 + (optional)_ZNSt8__detail8_ScannerIcE17_M_eat_escape_awkEv@Base 5.7.1 + (optional)_ZNSt8__detail8_ScannerIcE18_M_eat_escape_ecmaEv@Base 
5.7.1 + (optional)_ZNSt8__detail8_ScannerIcE18_M_scan_in_bracketEv@Base 5.7.1 + (optional)_ZNSt8__detail8_ScannerIcE19_M_eat_escape_posixEv@Base 5.7.1 + (optional)_ZNSt8__detail8_ScannerIcEC2EPKcS3_NSt15regex_constants18syntax_option_typeESt6locale@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE11_M_try_charEv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE12_M_assertionEv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE13_M_quantifierEv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE14_M_alternativeEv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE14_M_disjunctionEv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE14_M_match_tokenENS_12_ScannerBase7_TokenTE@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE16_M_cur_int_valueEi@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE18_M_expression_termILb0ELb0EEEbRNS4_13_BracketStateERNS_15_BracketMatcherIS3_XT_EXT0_EEE@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE18_M_expression_termILb0ELb1EEEbRNS4_13_BracketStateERNS_15_BracketMatcherIS3_XT_EXT0_EEE@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE18_M_expression_termILb1ELb0EEEbRNS4_13_BracketStateERNS_15_BracketMatcherIS3_XT_EXT0_EEE@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE18_M_expression_termILb1ELb1EEEbRNS4_13_BracketStateERNS_15_BracketMatcherIS3_XT_EXT0_EEE@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE21_M_bracket_expressionEv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE22_M_insert_char_matcherILb0ELb0EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE22_M_insert_char_matcherILb0ELb1EEEvv@Base 5.7.1 + 
(optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE22_M_insert_char_matcherILb1ELb0EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE22_M_insert_char_matcherILb1ELb1EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE25_M_insert_bracket_matcherILb0ELb0EEEvb@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE25_M_insert_bracket_matcherILb0ELb1EEEvb@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE25_M_insert_bracket_matcherILb1ELb0EEEvb@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE25_M_insert_bracket_matcherILb1ELb1EEEvb@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE26_M_insert_any_matcher_ecmaILb0ELb0EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE26_M_insert_any_matcher_ecmaILb0ELb1EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE26_M_insert_any_matcher_ecmaILb1ELb0EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE26_M_insert_any_matcher_ecmaILb1ELb1EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE27_M_insert_any_matcher_posixILb0ELb0EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE27_M_insert_any_matcher_posixILb0ELb1EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE27_M_insert_any_matcher_posixILb1ELb0EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE27_M_insert_any_matcher_posixILb1ELb1EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE33_M_insert_character_class_matcherILb0ELb0EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE33_M_insert_character_class_matcherILb0ELb1EEEvv@Base 5.7.1 + 
(optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE33_M_insert_character_class_matcherILb1ELb0EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE33_M_insert_character_class_matcherILb1ELb1EEEvv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEE7_M_atomEv@Base 5.7.1 + (optional)_ZNSt8__detail9_CompilerINSt7__cxx1112regex_traitsIcEEEC2EPKcS6_RKSt6localeNSt15regex_constants18syntax_option_typeE@Base 5.7.1 + (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE12_M_lookaheadEl@Base 5.7.1 + (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE15_M_handle_matchENSH_11_Match_modeEl@Base 5.7.1 + (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE16_M_main_dispatchENSH_11_Match_modeESt17integral_constantIbLb0EE@Base 5.7.1 + (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE16_M_rep_once_moreENSH_11_Match_modeEl@Base 5.7.1 + (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE17_M_handle_backrefENSH_11_Match_modeEl@Base 5.7.1 + (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE6_M_dfsENSH_11_Match_modeEl@Base 5.7.1 + 
(optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb1EE12_M_lookaheadEl@Base 5.7.1 + (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb1EE17_M_handle_backrefENSH_11_Match_modeEl@Base 5.7.1 + (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb1EE6_M_dfsENSH_11_Match_modeEl@Base 5.7.1 + (optional)_ZNSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb1EE9_M_searchEv@Base 5.7.1 + (optional)_ZNSt8__detail9_StateSeqINSt7__cxx1112regex_traitsIcEEE8_M_cloneEv@Base 5.7.1 (optional)_ZSt16__do_uninit_copyIN9__gnu_cxx17__normal_iteratorIPKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESt6vectorIS7_SaIS7_EEEEPS7_ET0_T_SG_SF_@Base 5.5.0 (optional)_ZSt16__do_uninit_copyIPKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPS5_ET0_T_SA_S9_@Base 5.5.0 (optional)_ZSt16__do_uninit_copyIPNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES6_ET0_T_S8_S7_@Base 5.5.0 - (optional)_ZSt16__introsort_loopIN9__gnu_cxx17__normal_iteratorIPjSt6vectorIjSaIjEEEElNS0_5__ops15_Iter_less_iterEEvT_S9_T0_T1_@Base 5.5.0 - (optional)_ZSt16__introsort_loopIN9__gnu_cxx17__normal_iteratorIPmSt6vectorImSaImEEEElNS0_5__ops15_Iter_comp_iterISt7greaterImEEEEvT_SC_T0_T1_@Base 5.5.0 + (optional)_ZSt19__throw_regex_errorNSt15regex_constants10error_typeEPKc@Base 5.7.1 (optional)_ZSt19piecewise_construct@Base 5.5.0 - (optional)_ZSt20__throw_bad_any_castv@Base 5.5.0 - (optional)_ZSt22__final_insertion_sortIN9__gnu_cxx17__normal_iteratorIPjSt6vectorIjSaIjEEEENS0_5__ops15_Iter_less_iterEEvT_S9_T0_@Base 5.5.0 - 
(optional)_ZSt22__final_insertion_sortIN9__gnu_cxx17__normal_iteratorIPmSt6vectorImSaImEEEENS0_5__ops15_Iter_comp_iterISt7greaterImEEEEvT_SC_T0_@Base 5.5.0 (optional)_ZSt27__throw_bad_optional_accessv@Base 5.5.0 + (optional)_ZSt4sortIN9__gnu_cxx17__normal_iteratorIPmSt6vectorImSaImEEEEEvT_S7_@Base 5.7.1 (optional)_ZSt7find_ifIN9__gnu_cxx17__normal_iteratorIPKmSt6vectorImSaImEEEESt8functionIFbmEEET_SB_SB_T0_@Base 5.5.0 - (optional)_ZSt9__find_ifIN9__gnu_cxx17__normal_iteratorIPKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESt6vectorIS7_SaIS7_EEEENS0_5__ops16_Iter_equals_valIS8_EEET_SH_SH_T0_St26random_access_iterator_tag@Base 5.5.0 - (optional)_ZSt9__find_ifIN9__gnu_cxx17__normal_iteratorIPKmSt6vectorImSaImEEEENS0_5__ops10_Iter_predISt8functionIFbmEEEEET_SE_SE_T0_St26random_access_iterator_tag@Base 5.5.0 (optional)_ZStneRKSt10error_codeRKSt15error_condition@Base 5.5.0 (optional)_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EEOS8_PKS5_@Base 5.5.0 + (optional)_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EEOS8_RKS8_@Base 5.7.1 (optional)_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EEOS8_S9_@Base 5.5.0 (optional)_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EEPKS5_OS8_@Base 5.5.0 (optional)_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EEPKS5_RKS8_@Base 5.5.0 + (optional)_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_PKS5_@Base 5.7.1 (optional)_ZTINSt13__future_base13_State_baseV27_SetterIvvEE@Base 5.5.0 (optional)_ZTINSt13__future_base13_State_baseV2E@Base 5.5.0 (optional)_ZTINSt13__future_base21_Async_state_commonV2E@Base 5.5.0 (optional)_ZTINSt13__future_base7_ResultIvEE@Base 5.5.0 + (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb0EEE@Base 5.7.1 + (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb1EEE@Base 5.7.1 + 
(optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb0EEE@Base 5.7.1 + (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb1EEE@Base 5.7.1 + (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0ELb0EEE@Base 5.7.1 + (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0ELb1EEE@Base 5.7.1 + (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1ELb0EEE@Base 5.7.1 + (optional)_ZTINSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1ELb1EEE@Base 5.7.1 + (optional)_ZTINSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0EEE@Base 5.7.1 + (optional)_ZTINSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1EEE@Base 5.7.1 + (optional)_ZTINSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0EEE@Base 5.7.1 + (optional)_ZTINSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1EEE@Base 5.7.1 + (optional)_ZTINSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0EEE@Base 5.7.1 + (optional)_ZTINSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1EEE@Base 5.7.1 + (optional)_ZTINSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0EEE@Base 5.7.1 + (optional)_ZTINSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1EEE@Base 5.7.1 (optional)_ZTISt11_Mutex_baseILN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0 - (optional)_ZTISt12bad_any_cast@Base 5.5.0 (optional)_ZTISt16_Sp_counted_baseILN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0 (optional)_ZTISt18bad_variant_access@Base 5.5.0 (optional)_ZTISt19bad_optional_access@Base 5.5.0 (optional)_ZTISt23_Sp_counted_ptr_inplaceINSt13__future_base13_State_baseV2ESaIvELN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0 + (optional)_ZTISt23_Sp_counted_ptr_inplaceINSt8__detail4_NFAINSt7__cxx1112regex_traitsIcEEEESaIvELN9__gnu_cxx12_Lock_policyE2EE@Base 5.7.1 (optional)_ZTSNSt13__future_base13_State_baseV27_SetterIvvEE@Base 5.5.0 
(optional)_ZTSNSt13__future_base13_State_baseV2E@Base 5.5.0 (optional)_ZTSNSt13__future_base21_Async_state_commonV2E@Base 5.5.0 (optional)_ZTSNSt13__future_base7_ResultIvEE@Base 5.5.0 + (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb0EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb1EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb0EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb1EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0ELb0EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0ELb1EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1ELb0EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1ELb1EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail12_CharMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb1ELb0EEE@Base 5.7.1 + (optional)_ZTSNSt8__detail15_BracketMatcherINSt7__cxx1112regex_traitsIcEELb1ELb1EEE@Base 5.7.1 (optional)_ZTSSt11_Mutex_baseILN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0 - (optional)_ZTSSt12bad_any_cast@Base 5.5.0 (optional)_ZTSSt16_Sp_counted_baseILN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0 (optional)_ZTSSt18bad_variant_access@Base 5.5.0 
(optional)_ZTSSt19_Sp_make_shared_tag@Base 5.5.0 (optional)_ZTSSt19bad_optional_access@Base 5.5.0 (optional)_ZTSSt23_Sp_counted_ptr_inplaceINSt13__future_base13_State_baseV2ESaIvELN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0 + (optional)_ZTSSt23_Sp_counted_ptr_inplaceINSt8__detail4_NFAINSt7__cxx1112regex_traitsIcEEEESaIvELN9__gnu_cxx12_Lock_policyE2EE@Base 5.7.1 (optional)_ZTVNSt13__future_base13_State_baseV2E@Base 5.5.0 (optional)_ZTVNSt13__future_base21_Async_state_commonV2E@Base 5.5.0 (optional)_ZTVNSt13__future_base7_ResultIvEE@Base 5.5.0 - (optional)_ZTVSt12bad_any_cast@Base 5.5.0 (optional)_ZTVSt18bad_variant_access@Base 5.5.0 (optional)_ZTVSt19bad_optional_access@Base 5.5.0 (optional)_ZTVSt23_Sp_counted_ptr_inplaceINSt13__future_base13_State_baseV2ESaIvELN9__gnu_cxx12_Lock_policyE2EE@Base 5.5.0 + (optional)_ZTVSt23_Sp_counted_ptr_inplaceINSt8__detail4_NFAINSt7__cxx1112regex_traitsIcEEEESaIvELN9__gnu_cxx12_Lock_policyE2EE@Base 5.7.1 + (optional)_ZZNKSt7__cxx1112regex_traitsIcE16lookup_classnameIPKcEENS1_10_RegexMaskET_S6_bE12__classnamesB5cxx11@Base 5.7.1 + (optional)_ZZNKSt7__cxx1112regex_traitsIcE18lookup_collatenameIPKcEENS_12basic_stringIcSt11char_traitsIcESaIcEEET_SA_E14__collatenames@Base 5.7.1 + (optional)_ZZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb0EEclEcE5__nul@Base 5.7.1 + (optional)_ZZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb0ELb1EEclEcE5__nul@Base 5.7.1 + (optional)_ZZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb0EEclEcE5__nul@Base 5.7.1 + (optional)_ZZNKSt8__detail11_AnyMatcherINSt7__cxx1112regex_traitsIcEELb0ELb1ELb1EEclEcE5__nul@Base 5.7.1 + (optional)_ZZNKSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb0EE10_M_is_wordEcE3__s@Base 5.7.1 + 
(optional)_ZZNKSt8__detail9_ExecutorIN9__gnu_cxx17__normal_iteratorIPKcNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEESaINS5_9sub_matchISB_EEENS5_12regex_traitsIcEELb1EE10_M_is_wordEcE3__s@Base 5.7.1 (optional)_ZZNSt19_Sp_make_shared_tag5_S_tiEvE5__tag@Base 5.5.0 rocfft_cache_buffer_free@Base 5.5.0 rocfft_cache_deserialize@Base 5.5.0 diff -Nru rocfft-5.5.0/debian/patches/0001-remove-use-of-openmp.patch rocfft-5.7.1/debian/patches/0001-remove-use-of-openmp.patch --- rocfft-5.5.0/debian/patches/0001-remove-use-of-openmp.patch 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/patches/0001-remove-use-of-openmp.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,25 +0,0 @@ -From: Cordell Bloor -Date: Sun, 27 Nov 2022 00:34:47 -0700 -Subject: remove use of openmp - -It's not clear how OpenMP should be used with Clang. It appears that -in ROCm, the OpenMP support is provided by openmp-extras (which comes -from the AOMP build of LLVM). - -Fixed upstream in e7b1fe244ab0623e900b4efe75be29545df1163b. 
---- - clients/fft_params.h | 1 - - 1 file changed, 1 deletion(-) - -diff --git a/clients/fft_params.h b/clients/fft_params.h -index d21ba85..af65c25 100644 ---- a/clients/fft_params.h -+++ b/clients/fft_params.h -@@ -28,7 +28,6 @@ - #include - #include - #include --#include - #include - #include - #include diff -Nru rocfft-5.5.0/debian/patches/0002-disable-fftw-install.patch rocfft-5.7.1/debian/patches/0002-disable-fftw-install.patch --- rocfft-5.5.0/debian/patches/0002-disable-fftw-install.patch 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/patches/0002-disable-fftw-install.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,37 +0,0 @@ -From afc1c41b4fea1ed7e4797236e3e666c9453033c2 Mon Sep 17 00:00:00 2001 -From: Steve Leung -Date: Wed, 21 Dec 2022 11:27:39 -0700 -Subject: [PATCH] cmake: only install fftw if we built it - ---- - clients/tests/CMakeLists.txt | 13 +++++-------- - 1 file changed, 5 insertions(+), 8 deletions(-) - -diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt -index fe5781f8..5d865908 100644 ---- a/clients/tests/CMakeLists.txt -+++ b/clients/tests/CMakeLists.txt -@@ -161,18 +161,15 @@ if( BUILD_FFTW OR NOT FFTW_FOUND ) - - # FFTW we build is always threaded - set( FFTW_MULTITHREAD TRUE ) --endif() - --if( BUILD_FFTW OR NOT FFTW_FOUND ) - add_dependencies( rocfft-test fftw_double fftw_single ) -+ rocm_install( -+ FILES ${FFTW_LIBRARIES} -+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/fftw -+ COMPONENT clients-common -+ ) - endif() - --rocm_install( -- FILES ${FFTW_LIBRARIES} -- DESTINATION ${CMAKE_INSTALL_LIBDIR}/fftw -- COMPONENT clients-common --) -- - set( rocfft-test_include_dirs - $ - $ diff -Nru rocfft-5.5.0/debian/patches/0003-fix-sample-includes.patch rocfft-5.7.1/debian/patches/0003-fix-sample-includes.patch --- rocfft-5.5.0/debian/patches/0003-fix-sample-includes.patch 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/patches/0003-fix-sample-includes.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,94 
+0,0 @@ -From 3d6eed1651850c0a7669c75d344e2d54193e3a16 Mon Sep 17 00:00:00 2001 -From: Cory Bloor -Date: Mon, 13 Feb 2023 13:36:49 -0700 -Subject: [PATCH] Fix sample include statements (#1013) - -For the samples to be built separately from rocfft project, the include -style needs to be . ---- - docs/samples/complex_1d.cpp | 2 +- - docs/samples/complex_2d.cpp | 2 +- - docs/samples/complex_3d.cpp | 2 +- - docs/samples/real2complex_1d.cpp | 2 +- - docs/samples/real2complex_2d.cpp | 2 +- - docs/samples/real2complex_3d.cpp | 2 +- - 6 files changed, 6 insertions(+), 6 deletions(-) - -diff --git a/docs/samples/complex_1d.cpp b/docs/samples/complex_1d.cpp -index 5ee9347a..e815f152 100644 ---- a/docs/samples/complex_1d.cpp -+++ b/docs/samples/complex_1d.cpp -@@ -25,7 +25,7 @@ - - #include - --#include "rocfft.h" -+#include - - int main(int argc, char* argv[]) - { -diff --git a/docs/samples/complex_2d.cpp b/docs/samples/complex_2d.cpp -index 0010aa70..9da818be 100644 ---- a/docs/samples/complex_2d.cpp -+++ b/docs/samples/complex_2d.cpp -@@ -25,7 +25,7 @@ - - #include - --#include "rocfft.h" -+#include - - int main(int argc, char* argv[]) - { -diff --git a/docs/samples/complex_3d.cpp b/docs/samples/complex_3d.cpp -index ae8f60bd..b547d7ee 100644 ---- a/docs/samples/complex_3d.cpp -+++ b/docs/samples/complex_3d.cpp -@@ -25,7 +25,7 @@ - - #include - --#include "rocfft.h" -+#include - - int main(int argc, char* argv[]) - { -diff --git a/docs/samples/real2complex_1d.cpp b/docs/samples/real2complex_1d.cpp -index 4ef12d2d..8043bd97 100644 ---- a/docs/samples/real2complex_1d.cpp -+++ b/docs/samples/real2complex_1d.cpp -@@ -25,7 +25,7 @@ - - #include - --#include "rocfft.h" -+#include - - int main(int argc, char* argv[]) - { -diff --git a/docs/samples/real2complex_2d.cpp b/docs/samples/real2complex_2d.cpp -index 320e027e..06ecdd92 100644 ---- a/docs/samples/real2complex_2d.cpp -+++ b/docs/samples/real2complex_2d.cpp -@@ -25,7 +25,7 @@ - - #include - --#include "rocfft.h" -+#include - 
- int main(int argc, char* argv[]) - { -diff --git a/docs/samples/real2complex_3d.cpp b/docs/samples/real2complex_3d.cpp -index 854cdc09..baec2dfe 100644 ---- a/docs/samples/real2complex_3d.cpp -+++ b/docs/samples/real2complex_3d.cpp -@@ -25,7 +25,7 @@ - - #include - --#include "rocfft.h" -+#include - - int main(int argc, char* argv[]) - { diff -Nru rocfft-5.5.0/debian/patches/0004-fix-hiprtc-link.patch rocfft-5.7.1/debian/patches/0004-fix-hiprtc-link.patch --- rocfft-5.5.0/debian/patches/0004-fix-hiprtc-link.patch 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/patches/0004-fix-hiprtc-link.patch 2024-03-12 17:13:18.000000000 +0000 @@ -15,7 +15,7 @@ 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt -index ea78131..21c7d81 100644 +index 51faaff..f842795 100644 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -46,7 +46,7 @@ else() diff -Nru rocfft-5.5.0/debian/patches/0005-add-debian-path-to-rocfft_rtc_helper.patch rocfft-5.7.1/debian/patches/0005-add-debian-path-to-rocfft_rtc_helper.patch --- rocfft-5.5.0/debian/patches/0005-add-debian-path-to-rocfft_rtc_helper.patch 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/patches/0005-add-debian-path-to-rocfft_rtc_helper.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,52 +0,0 @@ -From c2a92b6f25067aca8c603a065d3d63617f4d1f9c Mon Sep 17 00:00:00 2001 -From: Cory Bloor -Date: Tue, 11 Apr 2023 17:08:24 -0600 -Subject: [PATCH] Search Debian libexec dir for rocfft_rtc_helper (#1064) - -On Debian, the rocfft_rtc_helper can be found at -/usr/lib//rocfft//rocfft_rtc_helper ---- - library/src/CMakeLists.txt | 3 +++ - library/src/rtc_subprocess.cpp | 8 ++++++++ - 2 files changed, 11 insertions(+) - -diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt -index 9008c7eb..1724dfeb 100644 ---- a/library/src/CMakeLists.txt -+++ b/library/src/CMakeLists.txt -@@ -261,6 +261,9 @@ add_library( rocfft-rtc-compile OBJECT - 
add_library( rocfft-rtc-subprocess OBJECT - rtc_subprocess.cpp - ) -+target_compile_definitions( rocfft-rtc-subprocess PRIVATE -+ -DROCFFT_VERSION=${VERSION_STRING} -+) - # generation of kernel source - add_library( rocfft-rtc-gen OBJECT - rtc_bluestein_gen.cpp -diff --git a/library/src/rtc_subprocess.cpp b/library/src/rtc_subprocess.cpp -index 8d85b467..e8deeec5 100644 ---- a/library/src/rtc_subprocess.cpp -+++ b/library/src/rtc_subprocess.cpp -@@ -53,6 +53,10 @@ static const char* HELPER_EXE = "rocfft_rtc_helper"; - typedef int file_handle_type; - #endif - -+#define TO_STR2(x) #x -+#define TO_STR(x) TO_STR2(x) -+#define ROCFFT_VERSION_STRING TO_STR(ROCFFT_VERSION) -+ - static fs::path find_rtc_helper() - { - // candidate directories for the helper -@@ -69,6 +73,10 @@ static fs::path find_rtc_helper() - fs::path library_parent_path = library_path.parent_path(); - helper_dirs.push_back(library_parent_path); - -+ // try in a versioned library subdirectory -+ fs::path subdir_path = library_path.parent_path() / "rocfft" / ROCFFT_VERSION_STRING; -+ helper_dirs.push_back(subdir_path); -+ - // try bin dir, one dir up from library - fs::path bin_path = library_parent_path.parent_path() / "bin"; - helper_dirs.push_back(bin_path); diff -Nru rocfft-5.5.0/debian/patches/0005-use-readthedocs-theme.patch rocfft-5.7.1/debian/patches/0005-use-readthedocs-theme.patch --- rocfft-5.5.0/debian/patches/0005-use-readthedocs-theme.patch 1970-01-01 00:00:00.000000000 +0000 +++ rocfft-5.7.1/debian/patches/0005-use-readthedocs-theme.patch 2024-03-12 17:13:18.000000000 +0000 @@ -0,0 +1,196 @@ +From: Christian Kastner +Date: Fri, 1 Mar 2024 22:36:47 +0100 +Subject: Use readthedocs theme + +The newer documentation build requires packages not yet available, so we +simply revert to conf.py from the 5.5.1 release for now. 
+ +Forwarded: not-needed +--- + docs/conf.py | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 171 insertions(+), 6 deletions(-) + +diff --git a/docs/conf.py b/docs/conf.py +index 101fe22..f8d5e68 100644 +--- a/docs/conf.py ++++ b/docs/conf.py +@@ -1,8 +1,173 @@ +-from rocm_docs import ROCmDocs ++# -*- coding: utf-8 -*- ++# ++# rocFFT documentation build configuration file, created by ++# sphinx-quickstart on Mon Jan 8 16:34:42 2018. ++# ++# This file is execfile()d with the current directory set to its ++# containing dir. ++# ++# Note that not all possible configuration values are present in this ++# autogenerated file. ++# ++# All configuration values have a default; values that are commented out ++# serve to show the default. + +-docs_core = ROCmDocs("rocFFT Documentation") +-docs_core.run_doxygen() +-docs_core.setup() ++# If extensions (or modules to document with autodoc) are in another directory, ++# add these directories to sys.path here. If the directory is relative to the ++# documentation root, use os.path.abspath to make it absolute, like shown here. ++# ++# import os ++# import sys ++# sys.path.insert(0, os.path.abspath('.')) + +-for sphinx_var in ROCmDocs.SPHINX_VARS: +- globals()[sphinx_var] = getattr(docs_core, sphinx_var) ++import os ++import sys ++import subprocess ++ ++read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True' ++ ++if read_the_docs_build: ++ subprocess.call('../run_doxygen.sh') ++ ++# -- General configuration ------------------------------------------------ ++ ++# If your documentation needs a minimal Sphinx version, state it here. ++# ++# needs_sphinx = '1.0' ++ ++# Add any Sphinx extension module names here, as strings. They can be ++# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ++# ones. 
++extensions = ['sphinx.ext.mathjax', 'breathe'] ++breathe_projects = {"rocFFT": ".doxygen/docBin/xml"} ++breathe_default_project = "rocFFT" ++ ++# Add any paths that contain templates here, relative to this directory. ++templates_path = ['_templates'] ++ ++# The suffix(es) of source filenames. ++# You can specify multiple suffix as a list of string: ++# ++# source_suffix = ['.rst', '.md'] ++source_suffix = '.rst' ++ ++# The master toctree document. ++master_doc = 'index' ++ ++# General information about the project. ++project = u'rocFFT' ++copyright = u'2016 - 2023, Advanced Micro Devices' ++author = u'Advanced Micro Devices, Inc.' ++ ++# The version info for the project you're documenting, acts as replacement for ++# |version| and |release|, also used in various other places throughout the ++# built documents. ++# ++# The short X.Y version. ++version = u'5.7.1' ++# The full version, including alpha/beta/rc tags. ++release = u'5.7.1' ++ ++# The language for content autogenerated by Sphinx. Refer to documentation ++# for a list of supported languages. ++# ++# This is also used if you do content translation via gettext catalogs. ++# Usually you set "language" from the command line for these cases. ++language = None ++ ++# List of patterns, relative to source directory, that match files and ++# directories to ignore when looking for source files. ++# This patterns also effect to html_static_path and html_extra_path ++exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] ++ ++# The name of the Pygments (syntax highlighting) style to use. ++pygments_style = 'sphinx' ++ ++# If true, `todo` and `todoList` produce output, else they produce nothing. ++todo_include_todos = False ++ ++# -- Options for HTML output ---------------------------------------------- ++ ++# The theme to use for HTML and HTML Help pages. See the documentation for ++# a list of builtin themes. 
++# ++# html_theme = 'alabaster' ++ ++if read_the_docs_build: ++ html_theme = 'default' ++else: ++ import sphinx_rtd_theme ++ html_theme = "sphinx_rtd_theme" ++ html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] ++ ++# Theme options are theme-specific and customize the look and feel of a theme ++# further. For a list of options available for each theme, see the ++# documentation. ++# ++# html_theme_options = {} ++ ++# Add any paths that contain custom static files (such as style sheets) here, ++# relative to this directory. They are copied after the builtin static files, ++# so a file named "default.css" will overwrite the builtin "default.css". ++# html_static_path = ['_static'] ++ ++# Custom sidebar templates, must be a dictionary that maps document names ++# to template names. ++# ++# This is required for the alabaster theme ++# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars ++# html_sidebars = { ++# '**': [ ++# 'relations.html', # needs 'show_related': True theme option to display ++# 'searchbox.html', ++# ] ++# } ++ ++# -- Options for HTMLHelp output ------------------------------------------ ++ ++# Output file base name for HTML help builder. ++htmlhelp_basename = 'rocFFTdoc' ++ ++# -- Options for LaTeX output --------------------------------------------- ++ ++latex_elements = { ++ # The paper size ('letterpaper' or 'a4paper'). ++ # ++ # 'papersize': 'letterpaper', ++ ++ # The font size ('10pt', '11pt' or '12pt'). ++ # ++ # 'pointsize': '10pt', ++ ++ # Additional stuff for the LaTeX preamble. ++ # ++ # 'preamble': '', ++ ++ # Latex figure (float) alignment ++ # ++ # 'figure_align': 'htbp', ++} ++ ++# Grouping the document tree into LaTeX files. List of tuples ++# (source start file, target name, title, ++# author, documentclass [howto, manual, or own class]). 
++latex_documents = [ ++ (master_doc, 'rocFFT.tex', u'rocFFT Documentation', ++ u'Advanced Micro Devices', 'manual'), ++] ++ ++# -- Options for manual page output --------------------------------------- ++ ++# One entry per manual page. List of tuples ++# (source start file, name, description, authors, manual section). ++man_pages = [(master_doc, 'rocfft', u'rocFFT Documentation', [author], 1)] ++ ++# -- Options for Texinfo output ------------------------------------------- ++ ++# Grouping the document tree into Texinfo files. List of tuples ++# (source start file, target name, title, author, ++# dir menu entry, description, category) ++texinfo_documents = [ ++ (master_doc, 'rocFFT', u'rocFFT Documentation', author, 'rocFFT', ++ 'One line description of project.', 'Miscellaneous'), ++] diff -Nru rocfft-5.5.0/debian/patches/0006-use-local-mathjax.patch rocfft-5.7.1/debian/patches/0006-use-local-mathjax.patch --- rocfft-5.5.0/debian/patches/0006-use-local-mathjax.patch 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/patches/0006-use-local-mathjax.patch 2024-03-12 17:13:18.000000000 +0000 @@ -8,13 +8,13 @@ Forwarded: not-needed --- - docs/source/conf.py | 3 +++ + docs/conf.py | 3 +++ 1 file changed, 3 insertions(+) -diff --git a/docs/source/conf.py b/docs/source/conf.py -index d3b33df..2f35c0e 100644 ---- a/docs/source/conf.py -+++ b/docs/source/conf.py +diff --git a/docs/conf.py b/docs/conf.py +index 66c6b01..01ba07a 100644 +--- a/docs/conf.py ++++ b/docs/conf.py @@ -171,3 +171,6 @@ texinfo_documents = [ (master_doc, 'rocFFT', u'rocFFT Documentation', author, 'rocFFT', 'One line description of project.', 'Miscellaneous'), diff -Nru rocfft-5.5.0/debian/patches/0007-disable-kernel-cache-build.patch rocfft-5.7.1/debian/patches/0007-disable-kernel-cache-build.patch --- rocfft-5.5.0/debian/patches/0007-disable-kernel-cache-build.patch 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/patches/0007-disable-kernel-cache-build.patch 2024-03-12 
17:13:18.000000000 +0000 @@ -8,14 +8,14 @@ Forwarded: not-needed --- - library/src/CMakeLists.txt | 28 ---------------------------- - 1 file changed, 28 deletions(-) + library/src/CMakeLists.txt | 26 -------------------------- + 1 file changed, 26 deletions(-) -diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt -index 21c7d81..1ce5522 100644 ---- a/library/src/CMakeLists.txt -+++ b/library/src/CMakeLists.txt -@@ -395,23 +395,6 @@ endif() +Index: rocfft-5.7.1/library/src/CMakeLists.txt +=================================================================== +--- rocfft-5.7.1.orig/library/src/CMakeLists.txt ++++ rocfft-5.7.1/library/src/CMakeLists.txt +@@ -497,23 +497,6 @@ endif() # build. any kernels that already exist in this file will be reused # between builds. @@ -39,21 +39,26 @@ rocm_set_soversion( rocfft ${rocfft_SOVERSION} ) set_target_properties( rocfft PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) set_target_properties( rocfft PROPERTIES DEBUG_POSTFIX "-d" ) -@@ -443,17 +426,6 @@ rocm_install_targets( +@@ -545,22 +528,6 @@ rocm_install_targets( ${CMAKE_BINARY_DIR}/include ) --# kernel cache needs to go next to the library - Linux puts shared +-# kernel cache is architecture-dependent data for the library, placed +-# in a rocFFT subdirectory next to the library. 
Linux puts shared -# objects in lib, Windows puts DLLs in bin -if(WIN32) -- set(ROCFFT_KERNEL_CACHE_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}) +- set(ROCFFT_KERNEL_CACHE_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}/rocfft) -else() -- set(ROCFFT_KERNEL_CACHE_INSTALL_DIR ${ROCM_INSTALL_LIBDIR}) +- set(ROCFFT_KERNEL_CACHE_INSTALL_DIR ${ROCM_INSTALL_LIBDIR}/rocfft) -endif() --rocm_install(FILES ${ROCFFT_KERNEL_CACHE_PATH} -- DESTINATION "${ROCFFT_KERNEL_CACHE_INSTALL_DIR}" -- COMPONENT runtime --) - - # PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ - +- +-if( NOT ENABLE_ASAN_PACKAGING ) +- rocm_install(FILES ${ROCFFT_KERNEL_CACHE_PATH} +- DESTINATION "${ROCFFT_KERNEL_CACHE_INSTALL_DIR}" +- COMPONENT runtime +- ) +-endif() +- + # rtc helper is an internal library executable on Linux, placed in a + # rocFFT subdirectory of the library directory. On Windows it goes + # into bin next to the library, to simplify finding DLLs. diff -Nru rocfft-5.5.0/debian/patches/series rocfft-5.7.1/debian/patches/series --- rocfft-5.5.0/debian/patches/series 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/patches/series 2024-03-12 17:13:18.000000000 +0000 @@ -1,7 +1,4 @@ -0001-remove-use-of-openmp.patch -0002-disable-fftw-install.patch -0003-fix-sample-includes.patch 0004-fix-hiprtc-link.patch -0005-add-debian-path-to-rocfft_rtc_helper.patch +0005-use-readthedocs-theme.patch 0006-use-local-mathjax.patch 0007-disable-kernel-cache-build.patch diff -Nru rocfft-5.5.0/debian/rules rocfft-5.7.1/debian/rules --- rocfft-5.5.0/debian/rules 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/rules 2024-03-12 17:13:18.000000000 +0000 @@ -1,22 +1,28 @@ #!/usr/bin/make -f export CXX=hipcc export DEB_BUILD_MAINT_OPTIONS = hardening=+all optimize=-lto -export DEB_CXXFLAGS_MAINT_PREPEND = -gdwarf-4 +export DEB_CXXFLAGS_MAINT_PREPEND = -gz export VERBOSE=1 # filter incompatible options from affecting device code CXXFLAGS := $(subst 
-fstack-protector-strong,-Xarch_host -fstack-protector-strong,$(CXXFLAGS)) CXXFLAGS := $(subst -fcf-protection,-Xarch_host -fcf-protection,$(CXXFLAGS)) +# For installation rocfft_rtc_helper +VERSION_STRING = $(shell sed -nr 's/^set.*VERSION_STRING \"([.0-9]+)\".*/\1/p' CMakeLists.txt) + CMAKE_FLAGS = \ -DCMAKE_BUILD_TYPE=Release \ - -DAMDGPU_TARGETS="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1010;gfx1011;gfx1030" \ + -DAMDGPU_TARGETS="gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1010;gfx1011;gfx1030;gfx1100;gfx1101;gfx1102" \ -DROCM_SYMLINK_LIBS=OFF \ -DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF \ - -DBUILD_CLIENTS_TESTS=ON \ -DBUILD_CLIENTS_TESTS_OPENMP=OFF \ -DSQLITE_USE_SYSTEM_PACKAGE=ON +ifeq (,$(filter nocheck,$(DEB_BUILD_PROFILES))) +CMAKE_FLAGS += -DBUILD_CLIENTS_TESTS=ON +endif + %: dh $@ -Scmake @@ -34,22 +40,15 @@ endif execute_before_dh_install-arch: - # Note the rpath field setting may cause reproducible build issues. - # This should be removed earlier in the toolchain if possible. - patchelf --remove-rpath ./debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/librocfft.so.0.1 - patchelf --remove-rpath ./debian/tmp/usr/bin/rocfft_rtc_helper - patchelf --remove-rpath ./debian/tmp/usr/bin/rocfft-test - # move rocfft_rtc_helper to a libexec directory - mkdir -p ./debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/rocfft/1.0.21 - mv ./debian/tmp/usr/bin/rocfft_rtc_helper ./debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/rocfft/1.0.21/ + dh_install -plibrocfft0 usr/lib/*/rocfft/${VERSION_STRING}/rocfft_rtc_helper override_dh_auto_configure-indep: : override_dh_auto_build-indep: ifeq (,$(filter nodoc,$(DEB_BUILD_OPTIONS))) - cd docs; doxygen - sphinx-build -b html docs/source html + cd docs/.doxygen && doxygen + sphinx-build -b html docs html endif override_dh_auto_test-indep: @@ -57,3 +56,7 @@ override_dh_auto_install-indep: : + +# dwz doesn't fully support DWARF-5 yet, see #1016936 +override_dh_dwz: + : diff -Nru rocfft-5.5.0/debian/tests/control rocfft-5.7.1/debian/tests/control --- 
rocfft-5.5.0/debian/tests/control 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/tests/control 2024-03-12 17:13:18.000000000 +0000 @@ -1,5 +1,5 @@ Test-Command: /bin/sh debian/tests/upstream-binaries librocfft0-tests Depends: librocfft0-tests -Restrictions: skippable, allow-stderr +Restrictions: skippable, allow-stderr, needs-sudo Architecture: amd64 arm64 ppc64el diff -Nru rocfft-5.5.0/debian/tests/upstream-binaries rocfft-5.7.1/debian/tests/upstream-binaries --- rocfft-5.5.0/debian/tests/upstream-binaries 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/tests/upstream-binaries 2024-03-12 17:13:18.000000000 +0000 @@ -7,7 +7,7 @@ # /usr/libexec/rocm/$1 # # Will run all executables in that directory, and exit with status=1 if -# any failure occured, otherwise with status=0. A failure is defined as an +# any failure occurred, otherwise with status=0. A failure is defined as an # executable exiting with a status != 0. @@ -19,7 +19,7 @@ echo "Skipping tests." # Magic number to signal 'skipped' exit 77 -elif [ "`id -u`" != "0" ] && [ ! -r /dev/kfd ] +elif [ "$(id -u)" != "0" ] && [ ! -r /dev/kfd ] then echo "/dev/kfd present but no read permission." echo "Skipping tests." 
@@ -36,13 +36,37 @@ exit 1 fi -cd "$AUTOPKGTEST_TMP" +# 16 = testbed failure +cd "$AUTOPKGTEST_TMP" || exit 16 + +# First, gather system info +sudo -n mount -t debugfs none /sys/kernel/debug || true +if sudo -n [ -d /sys/kernel/debug/dri ] +then + for index in $(sudo -n ls /sys/kernel/debug/dri) + do + info="/sys/kernel/debug/dri/$index/amdgpu_firmware_info" + if sudo -n [ -f "$info" ] + then + # shellcheck disable=SC2024 # we don't need privileged write + sudo -n cat "$info" > "$AUTOPKGTEST_ARTIFACTS/amdgpu_firmware_info.$index" + fi + done +else + echo "Could not read /sys/kernel/debug/dri" >> "$AUTOPKGTEST_ARTIFACTS/firmware.err" +fi +# shellcheck disable=SC2024 # we don't need privileged write +sudo -n dmesg > "$AUTOPKGTEST_ARTIFACTS/dmesg.before" || true # Any individual failure is overall failure EXITCODE=0 -for TESTNAME in $TESTSDIR/* +for TESTNAME in "$TESTSDIR"/* do $TESTNAME || EXITCODE=1 done +# Tests might have generated new messages +# shellcheck disable=SC2024 # we don't need privileged write +sudo -n dmesg > "$AUTOPKGTEST_ARTIFACTS/dmesg.after" || true + exit $EXITCODE diff -Nru rocfft-5.5.0/debian/upstream/metadata rocfft-5.7.1/debian/upstream/metadata --- rocfft-5.5.0/debian/upstream/metadata 2023-11-10 09:02:29.000000000 +0000 +++ rocfft-5.7.1/debian/upstream/metadata 2024-03-12 17:13:18.000000000 +0000 @@ -1,4 +1,4 @@ --- -Bug-Database: https://github.com/ROCmSoftwarePlatform/rocFFT/issues -Bug-Submit: https://github.com/ROCmSoftwarePlatform/rocFFT/issues/new -Repository-Browse: https://github.com/rocmsoftwareplatform/rocfft +Bug-Database: https://github.com/ROCm/rocFFT/issues +Bug-Submit: https://github.com/ROCm/rocFFT/issues/new +Repository-Browse: https://github.com/ROCm/rocfft diff -Nru rocfft-5.5.0/docs/.doxygen/Doxyfile rocfft-5.7.1/docs/.doxygen/Doxyfile --- rocfft-5.5.0/docs/.doxygen/Doxyfile 1970-01-01 00:00:00.000000000 +0000 +++ rocfft-5.7.1/docs/.doxygen/Doxyfile 2023-08-09 16:19:51.000000000 +0000 @@ -0,0 +1,2458 @@ +# Doxyfile 
1.8.10 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "rocFFT" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = v1.0.23 + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. 
+ +PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HiP" + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = docBin + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise cause +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. 
+ +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = ../../library/include + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. 
Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful if your file system doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES.
+ +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. 
Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the latter case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibility issues. +# The default value is: YES.
+ +MARKDOWN_SUPPORT = YES + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match function declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES.
+ +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = YES + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. 
+ +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = YES + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +SHOW_NAMESPACES = NO + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. 
+# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. 
This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. 
+ +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. 
+ +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. 
+# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if <section_label> ... \endif and \cond <section_label> +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified).
+# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. 
See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. 
The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = ../../library/include/ + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. 
+# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, +# *.vhdl, *.ucf, *.qsf, *.as and *.js. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.f90 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf \ + *.as \ + *.js + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. 
+# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output.
If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. 
+ +USE_MDFILE_AS_MAINPAGE = ../README.md + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. 
+ +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance.
This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# compiled with the --with-libclang option. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. 
+ +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. 
The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 1 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes take effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# http://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want the formulas to look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = YES + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from http://www.mathjax.org before deployment. +# The default value is: http://cdn.mathjax.org/mathjax/latest. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. 
For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use + S +# (what the is depends on the OS and browser, but it is typically +# , /